In [70]:
pdf = pd.read_csv(constants.COVID_FILE)

In [71]:
pdf

Unnamed: 0,region_id,date,c1_school_closing,c1_flag,c2_workplace_closing,c2_flag,c3_cancel_public_events,c3_flag,c4_restrictions_on_gatherings,c4_flag,...,h4_emergency_investment_in_healthcare,h5_investment_in_vaccines,h6_facial_coverings,h6_flag,h7_vaccination_policy,h7_flag,h8_protection_of_elderly_people,h8_flag,confirmed_cases,confirmed_deaths
0,AND,2021-04-28,1,1,2,1,2,1,4,1,...,0,0,3,1,4,1,2,1,13148,125
1,AND,2021-04-29,1,1,2,1,2,1,4,1,...,0,0,3,1,4,1,2,1,13198,125
2,AND,2021-04-30,1,1,2,1,2,1,4,1,...,0,0,3,1,4,1,2,1,13232,125
3,AND,2021-05-01,1,1,2,1,2,1,4,1,...,0,0,3,1,4,1,2,1,13232,125
4,AND,2021-05-02,1,1,2,1,2,1,4,1,...,0,0,3,1,4,1,2,1,13282,127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,ZAF,2021-07-21,2,1,2,1,2,1,4,1,...,0,0,3,1,3,1,3,1,2327472,68192
9997,ZAF,2021-07-22,2,1,2,1,2,1,4,1,...,0,0,3,1,3,1,3,1,2342330,68625
9998,ZAF,2021-07-23,2,1,2,1,2,1,4,1,...,0,0,3,1,3,1,3,1,2356049,69075
9999,ZAF,2021-07-24,2,1,2,1,2,1,4,1,...,0,0,3,1,3,1,3,1,2368105,69488


In [53]:
import lightgbm as lgb
import random
import numpy as np
import pandas as pd


import upkeep.model as model
import upkeep.constants as constants


COL_REGION = "region_id"
COL_PREDICTION = "prediction"
COL_CASES = "confirmed_cases"
COL_WINDOW = "prediction_window"


def get_buckets():
    bucket_size = constants.N_PREDICTION_DAYS // constants.N_BUCKETS
    return [(i * bucket_size, bucket_size * (i + 1) - 1)
            for i in range(constants.N_BUCKETS)]


def get_cases_column_names():
    return ["confirmed_cases_{}_days_ago".format(constants.N_INPUT_DAYS - 1 - i)
            for i in range(constants.N_INPUT_DAYS - 1)]


def random_from_tuple(t):
    return random.randint(t[0], t[1])


def get_rolling_values(df, column, window):
    return np.lib.stride_tricks.as_strided(df[column],
                                           (len(df) - (window - 1), window),
                                           (df[column].values.strides * 2))

def add_cases_columns(inputdf, window=constants.N_INPUT_DAYS):
    arrays = get_rolling_values(inputdf, COL_CASES, window)
    inputdf.reset_index(inplace=True, drop=True)
    inputdf.loc[window - 1:, get_cases_column_names() + [COL_CASES]] = arrays
    return inputdf


def cases_columns_to_percentages(inputdf):
    cases_columns = get_cases_column_names() + [COL_PREDICTION]
    for cases_column in cases_columns:
        inputdf.loc[:, cases_column] = inputdf.loc[:, cases_column] / inputdf.loc[:, COL_CASES]
        inputdf.loc[:, cases_column] = inputdf.loc[:, cases_column].shift(1)
    return inputdf


def add_prediction_rows(inputdf, window=1):
    # constants.N_PREDICTION_DAYS
    arrays = get_rolling_values(inputdf, COL_CASES, window)
    inputdf.reset_index(inplace=True, drop=True)
    inputdf.loc[window - 1:, COL_PREDICTION] = pd.Series(list(arrays))
    inputdf.loc[:, COL_PREDICTION] = inputdf \
        .groupby(COL_REGION)[COL_PREDICTION] \
        .transform(lambda x: x.shift(-1))
    inputdf = cases_columns_to_percentages(inputdf)
    inputdf.dropna(inplace=True)
    inputdf[COL_PREDICTION] = inputdf[COL_PREDICTION] \
        .apply(lambda x: [[i, n] for i, n in enumerate(x)])
    
    #sampled_dates = [random_from_tuple(bucket) for bucket in get_buckets()]
    #inputdf[COL_PREDICTION] = inputdf[COL_PREDICTION] \
        #.apply(lambda x: [x[date] for date in sampled_dates])  # only predictions for random sampled dates
    inputdf = inputdf.explode(COL_PREDICTION).iloc[window:]
    inputdf[COL_WINDOW] = inputdf[COL_PREDICTION].apply(lambda x: x[0] + 1)
    inputdf[COL_PREDICTION] = inputdf[COL_PREDICTION].apply(lambda x: x[1])
    return inputdf.drop(columns=[COL_WINDOW])  # COL_CASES, 


def preprocess_confirmed_cases_input(data):
    inputdf = data[data[COL_CASES] > 1000]
    inputdf.loc[:, constants.FLAG_COLUMNS] = inputdf \
        .loc[:, constants.FLAG_COLUMNS].fillna(1)
    inputdf.loc[:, constants.INPUT_COLUMNS] = inputdf \
        .groupby(COL_REGION)[constants.INPUT_COLUMNS] \
        .transform(lambda x: x.ffill())
    
    
    
    inputdf = add_cases_columns(inputdf)
    inputdf = add_prediction_rows(inputdf)
    
    
    casesCols = get_cases_column_names() + [COL_PREDICTION]
    
    inputdf[casesCols] = inputdf.groupby(COL_REGION)[casesCols].shift(-21)  # the time periods should be sorted
    return inputdf.drop(columns=[COL_CASES]).dropna().reset_index(drop=True)


def prepare_train_data(input):
    X = input[constants.INPUT_COLUMNS + get_cases_column_names()]  #  + [COL_WINDOW]
    y = input[COL_PREDICTION]
    return lgb.Dataset(X, y)


def load_train_data(filename):
    return lgb.Dataset(filename)


def train_model(train_data: lgb.Dataset):
    return lgb.train(constants.MODEL_PARAMS, train_data)


def train(data):
    confirmed_cases_input = preprocess_confirmed_cases_input(data)
    train_data = prepare_train_data(confirmed_cases_input)
    return lgb.train(constants.MODEL_PARAMS, train_data,
                      num_boost_round=constants.MODEL_NUM_ROUNDS)

In [55]:
model = train(pdf)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1612
[LightGBM] [Info] Number of data points in the train set: 7071, number of used features: 32
[LightGBM] [Info] Start training from score 1.005251


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputdf[COL_PREDICTION] = inputdf[COL_PREDICTION] \


In [60]:
# prediction without gradient
model.predict(preprocess_confirmed_cases_input(pdf)[constants.INPUT_COLUMNS + get_cases_column_names()])

# outputs array

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = empty_value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

array([1.00524942, 1.00524942, 1.00524942, ..., 1.00524097, 1.00524097,
       1.00523503])

In [None]:
# implement prediction mechanism for 3 months, looping and predicting one day at a time - multi-step


In [72]:

from sktime.forecasting.compose import make_reduction, TransformedTargetForecaster
from sktime.forecasting.model_selection import ExpandingWindowSplitter, ForecastingGridSearchCV


def create_forecaster():
    
    # creating forecaster with LightGBM
    regressor = lgb.LGBMRegressor()
    forecaster = make_reduction(regressor, window_length=5, strategy="recursive")
    
    return forecaster

def grid_serch_forecaster(train, test, forecaster, param_grid):

    # Grid search on window_length
    cv = ExpandingWindowSplitter(initial_window=int(len(train) * 0.7))
    gscv = ForecastingGridSearchCV(
        forecaster, strategy="refit", cv=cv, param_grid=param_grid
    )
    gscv.fit(train)
    print(f"best params: {gscv.best_params_}")
    
    # forecasting
    fh=np.arange(len(test))+1
    y_pred = gscv.predict(fh=fh)
    mae, mape = plot_forecast(train, test, y_pred)

    return mae, mape
    
param_grid = {"window_length": [5, 10, 15, 20, 25, 30]} # parameter set to be grid searched
forecaster = create_forecaster()
sun_lgb_mae, sun_lgb_mape = grid_serch_forecaster(preprocess_confirmed_cases_input(pdf)[constants.INPUT_COLUMNS + get_cases_column_names()], preprocess_confirmed_cases_input(pdf)[COL_PREDICTION], forecaster, param_grid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = empty_value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

ValueError: y must be univariate, but found 38 variables.

In [73]:
preprocess_confirmed_cases_input(pdf)[COL_PREDICTION]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputdf[COL_PREDICTION] = inputdf[COL_PREDICTION] \


0       1.006657
1       1.006222
2       1.005834
3       1.004511
4       1.002702
          ...   
7066    1.003385
7067    1.005622
7068    1.003354
7069    1.000371
7070    1.004456
Name: prediction, Length: 7071, dtype: float64

In [None]:
# best fit line gradients