In [None]:
# !pip install --upgrade pip
# !pip install swifter

In [None]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import numpy as np, pandas as pd, matplotlib.pyplot as plt
plt.style.use('ggplot')

from dask_ml import preprocessing
from datetime import datetime, timedelta

from patsy import dmatrices
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import MinMaxScaler

import swifter

import os, sys, gc, time, warnings, pickle, psutil, random
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to shrink its memory footprint.

    Integer columns (int16/int32/int64) are cast to the narrowest signed integer
    type whose range contains the column's observed minimum and maximum; values
    sitting exactly on a type's limit (e.g. 127 for int8) are accepted.
    Float columns are cast to float16, then float32, when the observed range
    fits strictly inside that type, and to float64 otherwise. Only the value
    range is checked, so float downcasting can lose precision.
    Columns of any other dtype, and columns whose min/max is NaN (empty or
    all-NaN), are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final size and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Nothing to measure (empty or all-NaN column): keep the current dtype
            # instead of widening it.
            continue
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the type's limit still fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Notebook-wide constants
NUM_ITEMS = 30490 # 3049 products * 10 stores (3049 * 10 = 30490)
nrows = int(365 * 2 * NUM_ITEMS) # two years of daily rows for NUM_ITEMS series
FIRST_DAY = 350 # passed as first_day to create_dt when the feature frame is rebuilt
h = 28 # presumably the forecast horizon in days -- TODO confirm
max_lags = 57 # days of history kept before tr_last when create_dt builds a non-training frame
tr_last = 1913 # last d_<n> sales column read by create_dt
fday = datetime(2016,4, 25) # NOTE(review): looks like the first forecast date -- confirm

In [None]:
# Column dtypes passed to pd.read_csv when create_dt reads the calendar file.
# Columns marked "category" are later replaced by zero-based integer codes.
CAL_DTYPES = {
    **dict.fromkeys(
        ["event_name_1", "event_name_2", "event_type_1", "event_type_2", "weekday"],
        "category",
    ),
    **dict.fromkeys(["wm_yr_wk", "wday", "month", "year"], "int16"),
    **dict.fromkeys(["snap_CA", "snap_TX", "snap_WI"], "float32"),
}

# Column dtypes passed to pd.read_csv when create_dt reads the sell-price file.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [None]:
# simple function to read the data in the competition files
def load_sub(PATH='/kaggle/input/', submission_only=False):
    """Load the competition's sample submission file.

    Parameters
    ----------
    PATH : str, default '/kaggle/input/'
        Directory that contains the ``m5-forecasting-accuracy`` folder. A
        trailing slash is optional.
    submission_only : bool, default False
        When true, read ``sample_submission.csv`` and return it. When false,
        nothing is loaded.

    Returns
    -------
    pandas.DataFrame or None
        The sample submission passed through ``reduce_mem_usage`` when
        ``submission_only`` is true, otherwise ``None``.
    """
    print('Reading files...')
    if not submission_only:
        # No other file is loaded by this helper; make the None result explicit.
        return None
    # os.path.join keeps the path valid whether or not PATH ends with a separator.
    csv_path = os.path.join(PATH, 'm5-forecasting-accuracy', 'sample_submission.csv')
    sample_submission = pd.read_csv(csv_path).pipe(reduce_mem_usage)
    return sample_submission
    
def cal_transform(df):
    """Min-max scale eight indicator columns of ``df`` to the range [0, 1].

    The columns listed in ``cols`` are fitted and transformed together with a
    ``MinMaxScaler`` and written back into ``df``; every other column is left
    as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all columns named in ``cols``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    sc = MinMaxScaler(feature_range=(0, 1))
    cols = ['wholesale_inventory','wholesale_sales','retai_t_f_sales','urban_cpi','cpi','unemp_rate','ppi','gdp']
    scaled = np.asarray(sc.fit_transform(df[cols]))
    for i, col in enumerate(cols):
        # Assign the plain array column: this is positional, so the result does
        # not depend on df carrying a default 0..n-1 index.
        df[col] = scaled[:, i]
    return df

def create_dt(is_train = True, nrows = None, first_day = 1200, PATH='/kaggle/input/'):
    """Build the long-format sales frame joined with calendar and price data.

    Steps:
      1. Read ``sell_prices.csv`` and replace its categorical columns by
         zero-based integer codes.
      2. Read the modified calendar, scale it with ``cal_transform``, shrink it
         with ``reduce_mem_usage``, parse ``date`` and encode its categoricals.
      3. Read the ``d_<n>`` sales columns from ``start_day`` to ``tr_last`` of
         ``sales_train_validation.csv`` and encode its categoricals.
      4. For a non-training frame, append ``h`` empty (NaN) day columns after
         ``tr_last``.
      5. Melt to one row per (series, day), inner-join the calendar on ``d`` and
         the prices on store/item/week, and turn ``d`` into an integer.

    Relies on the notebook-level names ``PRICE_DTYPES``, ``CAL_DTYPES``,
    ``tr_last``, ``max_lags`` and ``h``.

    Parameters
    ----------
    is_train : bool, default True
        When false, only the last ``max_lags`` days before ``tr_last`` are read
        (unless ``first_day`` is later) and the empty future days are appended.
    nrows : int or None, default None
        Number of rows to read from the sales file (``None`` reads all rows).
    first_day : int, default 1200
        Earliest day number to load.
    PATH : str, default '/kaggle/input/'
        Directory holding the ``m5-forecasting-accuracy`` and ``m5-cal-mod2``
        folders. All three files are read relative to it.

    Returns
    -------
    pandas.DataFrame
        The merged frame after ``reduce_mem_usage``.
    """
    # NOTE(review): store_id/item_id codes are derived independently from the
    # price file and from the sales file; they presumably only line up when both
    # files yield the same categories (e.g. not when nrows truncates the sales
    # file) -- verify before relying on the price merge with nrows set.
    sales_dir = os.path.join(PATH, 'm5-forecasting-accuracy')
    prices = pd.read_csv(os.path.join(sales_dir, 'sell_prices.csv'), dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()

    cal = pd.read_csv(os.path.join(PATH, 'm5-cal-mod2', 'calendar_mod_2.csv'), dtype = CAL_DTYPES)
    cal = cal_transform(cal).pipe(reduce_mem_usage)

    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            # Missing categories have code -1; subtracting the minimum makes codes start at 0.
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(os.path.join(sales_dir, 'sales_train_validation.csv'),
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the days to be forecast.
        for day in range(tr_last+1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    # "d_123" -> 123
    dt = dt.assign(d = dt.d.str[2:].astype(int))
    dt = reduce_mem_usage(dt)

    return dt

In [None]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Features created:
      * ``lag_7`` / ``lag_28``: ``sales`` shifted by 7 and 28 rows within each ``id``.
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows (7 and 28) of each
        lag column, computed within each ``id``.
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
        int16 when the column already exists, otherwise derived from ``date``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with at least ``id``, ``sales`` and a datetime ``date`` column.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``reduce_mem_usage(dt)``. Existing callers that
        ignore the return value keep working because ``dt`` itself is updated.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear":
            # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
            # is its documented replacement.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

    return reduce_mem_usage(dt)

In [None]:
%%time
# Set new_fe = True to force the feature frame to be rebuilt from the CSV files.
new_fe = False
# Also rebuild when no cached frame exists yet, so a fresh kernel can Run All
# without a pre-existing df.pkl.
if new_fe or not os.path.exists('df.pkl'):
    df = create_dt(is_train=True, first_day= FIRST_DAY)
    create_fea(df)  # adds the feature columns to df in place
    print('Saving to Pickle...')
    df.to_pickle('df.pkl')
else:
    # NOTE: unpickling can execute arbitrary code; only load a df.pkl that this
    # notebook produced.
    df = pd.read_pickle('df.pkl')
    df = reduce_mem_usage(df)
gc.collect()

In [None]:
############ Truncating data set for fast testing #############
# 90 training days + a 28-day test period = 118 days kept in total.
# The validation set is the same as the test set for testing purposes.
##############################################################
train_days = 90
window_days = train_days + h  # 118 days with h = 28

df = df[(df.d > (tr_last - window_days)) & (df.d <= tr_last)].reset_index(drop=True)
df = df.dropna()
gc.collect()

####################### Masks for data #######################
train_mask = df['d'] <= (tr_last - h)

# Test mask, also used here as validation
test_mask = df['d'] > (tr_last - h)

################### Feature columns ########################
keep_fe = ['cat_id', 'state_id','sales', 'wday', 'month','event_name_1', 'event_name_2', 'wholesale_inventory','wholesale_sales', 'retai_t_f_sales', 'urban_cpi', 'cpi', 'unemp_rate',
       'ppi', 'gdp', 'sell_price', 'lag_7', 'lag_28', 'rmean_7_7','rmean_28_7', 'rmean_7_28', 'rmean_28_28']

# Single .loc selection (rows and columns together) instead of chained indexing.
df_test = df.loc[test_mask, keep_fe].dropna()
df_train = df.loc[train_mask, keep_fe].dropna()

# We also need the test_id for submission function
test_id = df.loc[test_mask, ['id', 'd']].reset_index(drop=True)

print('Training data set length='+str(df_train.shape))
print('Testing data set length='+str(df_test.shape))
gc.collect()

In [None]:
# Release the full feature frame; the cells below only use the train/test
# slices and test_id that were derived from it.
del df
gc.collect()

In [None]:
#Setup the regression expression in patsy notation.
#We are telling patsy that sales is our dependent variable and it depends on the other regression variables
expr = """sales ~ cat_id + state_id + wday + month + event_name_1 + event_name_2 + wholesale_inventory + wholesale_sales + retai_t_f_sales + urban_cpi + cpi + unemp_rate + ppi + gdp + sell_price + lag_7 + lag_28 + rmean_7_7 + rmean_28_7 + rmean_7_28 + rmean_28_28"""

#Set up the X and y matrices for the training and testing data sets
# NOTE(review): return_type is left at patsy's default here (the dataframe option
# is commented out). Later cells use label-based access (y_test['sales'],
# X_test.index), which presumably requires return_type='dataframe' -- confirm.
y_train, X_train = dmatrices(expr, df_train) #, return_type='dataframe'
y_test, X_test = dmatrices(expr, df_test) #, return_type='dataframe'

In [None]:
def get_lambda(df_train, y_train, X_train):
    """Fit a Poisson GLM and add the columns needed for the auxiliary OLS regression.

    Two columns are written into ``df_train``:
      * ``LAMBDA``: the fitted mean of the Poisson model (``results.mu``).
      * ``AUX_OLS_DEP``: ``((sales - LAMBDA)**2 - sales) / LAMBDA``, the
        dependent variable of the auxiliary OLS regression fitted later.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame with a ``sales`` column and one row per row of
        ``y_train`` / ``X_train``. It is modified in place.
    y_train, X_train
        Response and design matrices passed to ``sm.GLM``.

    Returns
    -------
    pandas.DataFrame
        ``df_train`` with the two extra columns.
    """
    #Using the statsmodels GLM class, train the Poisson regression model on the training data set
    poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

    #print out the training summary
    print(poisson_training_results.summary())

    #Add the fitted mean vector as a new column called 'LAMBDA'
    df_train['LAMBDA'] = poisson_training_results.mu

    #Dependent variable of the auxiliary OLS regression, computed column-wise
    #(same formula as a row-by-row apply, without the per-row Python overhead)
    sales = df_train['sales']
    lam = df_train['LAMBDA']
    df_train['AUX_OLS_DEP'] = ((sales - lam) ** 2 - sales) / lam

    return df_train

In [None]:
%%time
# Fit the Poisson model and attach the LAMBDA / AUX_OLS_DEP columns to df_train.
df_train = get_lambda(df_train, y_train, X_train)
gc.collect()

In [None]:
# Cache the training frame (now including LAMBDA and AUX_OLS_DEP) on disk.
print('Saving to Pickle...')
df_train.to_pickle('df_train.pkl')
# To reload the cached frame instead, uncomment:
# df_train = pd.read_pickle('df_train.pkl')

In [None]:
def get_ols_aux(df_train, y_train, X_train):
    """Estimate alpha from the auxiliary OLS regression.

    Regresses ``AUX_OLS_DEP`` on ``LAMBDA`` without an intercept and returns
    the single fitted coefficient.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing the ``AUX_OLS_DEP`` and ``LAMBDA`` columns.
    y_train, X_train
        Unused; kept so existing calls keep working.

    Returns
    -------
    float
        The coefficient of ``LAMBDA``.
    """
    #use patsy to form the model specification for the OLSR
    ols_expr = """AUX_OLS_DEP ~ LAMBDA - 1"""

    #Configure and fit the OLSR model
    aux_olsr_results = smf.ols(ols_expr, df_train).fit()
    # params is a label-indexed Series; take the first (only) entry by position.
    alpha = aux_olsr_results.params.iloc[0]
    return alpha

In [None]:
%%time
# Estimate alpha from the auxiliary OLS regression; it is passed to get_nbm below.
alpha = get_ols_aux(df_train, y_train, X_train)
gc.collect()

In [None]:
# Free the pandas frames; the remaining cells use the patsy matrices
# (y_train, X_train, y_test, X_test) and test_id instead.
del df_test, df_train
gc.collect()

In [None]:
def get_nbm(y_train, X_train, alpha):
    """Fit a negative binomial GLM on the training data.

    Parameters
    ----------
    y_train, X_train
        Response and design matrices passed to ``sm.GLM``.
    alpha
        Value passed to ``sm.families.NegativeBinomial(alpha=...)``.

    Returns
    -------
    The fitted results object; its summary is printed before returning.
    """
    nb2_family = sm.families.NegativeBinomial(alpha=alpha)
    model = sm.GLM(y_train, X_train, family=nb2_family)
    # Collect garbage before fitting.
    gc.collect()
    fitted = model.fit()
    print(fitted.summary())
    return fitted

In [None]:
%%time
# Fit the negative binomial model using the alpha estimated above.
nb2_training_results = get_nbm(y_train, X_train, alpha)
gc.collect()

In [None]:
def make_preds(nb2_training_results, X_test, test_id):
    """Predict on the test matrix and write ``submission.csv``.

    The predicted means are stored in ``test_id['sales']`` (the caller's frame
    is modified), pivoted to one row per ``id`` with columns ``F1``..``F28``,
    merged onto the sample submission's ids, and concatenated with the sample
    submission's rows whose id contains ``'evaluation'``.

    The true test response is read from the notebook-level ``y_test``.

    Parameters
    ----------
    nb2_training_results
        Fitted results object providing ``get_prediction``.
    X_test
        Design matrix for the test rows.
    test_id : pandas.DataFrame
        Frame with ``id`` and ``d`` columns, one row per row of ``X_test``,
        covering exactly 28 distinct days.

    Returns
    -------
    tuple
        ``(final, y_pred, y_true)``: the submission frame that was written, the
        predicted means, and the true sales of the test rows.
    """
    #make some predictions using our trained NB2 model
    nb2_predictions = nb2_training_results.get_prediction(X_test)
    predictions_summary_frame = nb2_predictions.summary_frame()
    y_pred = predictions_summary_frame['mean']
    test_id['sales'] = y_pred.values

    # y_test is a DataFrame only when dmatrices was called with
    # return_type='dataframe'; otherwise it is array-like and cannot be indexed
    # by column name, so flatten it instead.
    if isinstance(y_test, pd.DataFrame):
        y_true = y_test['sales']
    else:
        y_true = np.asarray(y_test).ravel()

    submission = load_sub(submission_only=True)

    predictions = test_id[['id', 'd', 'sales']]
    predictions = pd.pivot(predictions, index = 'id', columns = 'd', values = 'sales').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    # Evaluation rows are passed through from the sample submission unchanged.
    is_evaluation = submission['id'].str.contains('evaluation', regex=False, na=False)
    evaluation = submission[is_evaluation]

    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
    final.to_csv('submission.csv', index = False)
    del submission, validation, evaluation, predictions
    gc.collect()
    return final, y_pred, y_true

In [None]:
%%time
# Predict the test period, write submission.csv and keep the predictions for plotting.
final, y_pred, y_true = make_preds(nb2_training_results, X_test, test_id)

In [None]:
# Plot predicted against actual sales for every test row.
fig, ax = plt.subplots(figsize=(40, 20))
fig.suptitle('Predicted versus actual')
# X_test only has an index when the design matrices were built as DataFrames;
# otherwise fall back to the row position.
x_axis = X_test.index if hasattr(X_test, 'index') else np.arange(len(y_pred))
predicted, = ax.plot(x_axis, np.asarray(y_pred), 'g--', label='Predicted sales')
actual, = ax.plot(x_axis, np.asarray(y_true), 'r--', label='Actual sales')
ax.set_xlabel('Test row')
ax.set_ylabel('Sales')
ax.legend(handles=[predicted, actual])
plt.show()