In [21]:
# Handle to the hosting project; the last cell of this notebook calls
# project.save_data() on it to store the submission file.
from project_lib import Project
project = Project.access()

In [22]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import gc

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [23]:
def create_kaggle_submission_file(test, sample_submission = None):
    """Turn long-format forecasts into a wide submission frame.

    Parameters
    ----------
    test : pandas.DataFrame
        Forecasts in long format. Must contain the columns ``id``, ``date``
        and ``demand`` with exactly one row per (id, date) pair.
    sample_submission : pandas.DataFrame, optional
        Submission template: an ``id`` column plus one column per forecast
        day. When omitted, the template is read from
        ``/project_data/data_asset/sample_submission.csv``.

    Returns
    -------
    pandas.DataFrame
        The template rows whose ``id`` has a forecast (filled with the
        forecasts, columns ``F1`` .. ``Fn``), followed by the template rows
        whose ``id`` contains ``'evaluation'`` (left untouched).

    Raises
    ------
    ValueError
        If the number of distinct dates in ``test`` differs from the number
        of forecast columns in the template.
    """
    if sample_submission is None:
        sample_submission = pd.read_csv('/project_data/data_asset/sample_submission.csv')

    # One row per id, one column per date (columns come out sorted by date).
    predictions = pd.pivot(
        test[['id', 'date', 'demand']], index = 'id', columns = 'date', values = 'demand'
    ).reset_index()

    n_forecast_days = predictions.shape[1] - 1
    n_template_days = sample_submission.shape[1] - 1
    if n_forecast_days != n_template_days:
        raise ValueError(
            f'Forecasts cover {n_forecast_days} distinct dates but the submission '
            f'template expects {n_template_days} forecast columns'
        )
    predictions.columns = ['id'] + ['F' + str(day + 1) for day in range(n_forecast_days)]

    # Evaluation rows are passed through exactly as they appear in the template.
    is_evaluation = sample_submission['id'].str.contains('evaluation', regex = False)
    evaluation = sample_submission[is_evaluation]

    # Merging from the template keeps the template's row order.
    validation = sample_submission[['id']].merge(predictions, on = 'id')
    return pd.concat([validation, evaluation])

### Read training data

In [24]:
# Load the two pickled tables that the next cell joins on 'id'.
# NOTE(review): unpickling can execute arbitrary code — only load these files
# from a trusted location.
data = pd.read_pickle("/home/wsuser/work/project_data_assets/data_asset/full_data.pkl")
products = pd.read_pickle("/home/wsuser/work/project_data_assets/data_asset/products.pkl")

In [25]:
# Attach the columns of `products` to every row of `data` with a matching 'id'
# (inner join, the pandas default), then reclaim the merge temporaries.
data = data.merge(products, on = 'id')
gc.collect()

11443

In [26]:
# CategoricalEncoder is provided by the project's own data_preprocessing module.
from data_preprocessing import CategoricalEncoder

# Columns handed to the encoder.
cat_columns = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
encoder = CategoricalEncoder(cat_columns)
# The return value is discarded, so encode() presumably rewrites these columns
# of `data` in place — TODO confirm against data_preprocessing.
encoder.encode(data)
gc.collect()

0

In [27]:
# Partition the rows by their 'part' label, then drop the combined frame so
# that only the two subsets stay in memory.
train_df = data[data['part'] == 'train']
test_df = data[data['part'] == 'test1']
del data
gc.collect()

7

### Training/validation split

The last 28 days of the training data are held out for validation.

#### Data for evaluation

In [28]:
# Raw tables that are later passed to the WRMSSE evaluator.
calendar = pd.read_csv('/project_data/data_asset/calendar.csv')
prices = pd.read_csv('/project_data/data_asset/sell_prices.csv')
sales_train_validation = pd.read_csv('/project_data/data_asset/sales_train_validation.csv')

# The trailing 28 columns form the validation fold; everything before them
# is the training fold.
n_holdout_days = 28
train_fold_df = sales_train_validation.iloc[:, :-n_holdout_days]
valid_fold_df = sales_train_validation.iloc[:, -n_holdout_days:]

In [29]:
def describe_ts(ts, ts_id):
    """Summarise a single series as a one-row DataFrame.

    Parameters
    ----------
    ts : pandas.Series
        Numeric series to summarise.
    ts_id : object
        Identifier stored in the ``id`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``id``, ``mean``, ``std``, ``min_value``,
        ``max_value``, ``median``, ``num_zeros`` and ``num_na``.
    """
    summary = ts.describe()

    record = {"id": ts_id}
    # Statistics taken straight from describe(): (output column, describe key).
    for column, key in (("mean", "mean"), ("std", "std"),
                        ("min_value", "min"), ("max_value", "max")):
        record[column] = [summary[key]]
    record["median"] = [ts.median()]
    record["num_zeros"] = [ts.eq(0).sum()]
    record["num_na"] = [ts.isna().sum()]

    return pd.DataFrame(record)

In [30]:
# Summary statistics for every row of the training fold.
# The rows are collected first and concatenated once: DataFrame.append inside a
# loop copies the whole frame on every iteration and no longer exists in pandas 2.x.
# NOTE(review): the first six columns of each row are skipped — presumably the
# identifier columns; confirm against sales_train_validation.csv.
stats_rows = [
    describe_ts(row.iloc[6:].astype(int), row.id)
    for _, row in train_fold_df.iterrows()
]
if stats_rows:
    out = pd.concat(stats_rows)
else:
    # Keep the original behaviour for an empty fold: an empty frame with the expected columns.
    out = pd.DataFrame(columns = ["id", "mean", "std", "min_value", "max_value", "median", "num_zeros", "num_na"])

In [57]:
# Preview only the first rows instead of rendering the whole statistics table.
out.head(10)

Unnamed: 0,id,mean,std,min_value,max_value,median,num_zeros,num_na
0,HOBBIES_1_001_CA_1_validation,0.303979,0.672014,0.0,5.0,0.0,1480,0
0,HOBBIES_1_002_CA_1_validation,0.260477,0.572320,0.0,5.0,0.0,1485,0
0,HOBBIES_1_003_CA_1_validation,0.144297,0.480374,0.0,6.0,0.0,1682,0
0,HOBBIES_1_004_CA_1_validation,1.717241,1.991794,0.0,15.0,1.0,608,0
0,HOBBIES_1_005_CA_1_validation,0.960743,1.294094,0.0,9.0,0.0,943,0
0,HOBBIES_1_006_CA_1_validation,0.862069,1.577801,0.0,10.0,0.0,1242,0
0,HOBBIES_1_007_CA_1_validation,0.219098,0.502917,0.0,3.0,0.0,1541,0
0,HOBBIES_1_008_CA_1_validation,7.215915,9.098223,0.0,91.0,4.0,531,0
0,HOBBIES_1_009_CA_1_validation,1.190451,2.018575,0.0,20.0,0.0,1015,0
0,HOBBIES_1_010_CA_1_validation,0.719894,0.922732,0.0,6.0,0.0,981,0


#### Data for training

In [None]:
def _rows_in_window(frame, after, until):
    """Return the rows of `frame` whose 'date' lies in the window (after, until]."""
    dates = frame['date']
    return frame.loc[(dates > after) & (dates <= until)]


# Chronological split: the training window ends on 2016-03-27 and the
# validation window covers the dates after it, up to and including 2016-04-24.
x_train = _rows_in_window(train_df, '2010-01-01', '2016-03-27')
y_train = x_train['demand']
x_val = _rows_in_window(train_df, '2016-03-27', '2016-04-24')
y_val = x_val['demand']
del train_df
gc.collect()

In [None]:
def plot_train_val_split(train_dates, train_values, val_dates, val_values):
    """Plot the training and validation parts of one series on a shared axis.

    Parameters
    ----------
    train_dates, train_values : array-like
        x and y values of the training part.
    val_dates, val_values : array-like
        x and y values of the validation part.
    """
    fig, ax = plt.subplots(figsize = (20, 10))
    # Label both lines so the two parts can be told apart in the figure.
    ax.plot(train_dates, train_values, label = 'train')
    ax.plot(val_dates, val_values, label = 'validation')
    ax.set_xlabel('date')
    ax.set_ylabel('value')
    ax.set_title('Train / validation split')
    ax.legend()
    plt.show()

In [None]:
# Show the split for one example series; each frame is filtered once and the
# date / demand columns are taken from the filtered rows.
example_id = "HOBBIES_1_001_CA_1_validation"
example_train = x_train.loc[x_train.id == example_id, ["date", "demand"]]
example_val = x_val.loc[x_val.id == example_id, ["date", "demand"]]

plot_train_val_split(
    pd.to_datetime(example_train["date"]).values,
    example_train["demand"].values,
    pd.to_datetime(example_val["date"]).values,
    example_val["demand"].values,
)

### LightGBM

In [None]:
!pip install lightgbm

In [None]:
import lightgbm as lgb

In [None]:
# Model input columns. The order is kept exactly as the model was trained with.
features = [
    # event / snap_* flags and price
    'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price',
    # shift_* columns
    'shift_t28', 'shift_t29', 'shift_t30',
    # rolling_std_* columns
    'rolling_std_t7', 'rolling_std_t30', 'rolling_std_t60', 'rolling_std_t90', 'rolling_std_t180',
    # rolling_mean_* columns
    'rolling_mean_t7', 'rolling_mean_t30', 'rolling_mean_t60', 'rolling_mean_t90', 'rolling_mean_t180',
    # rolling skew / kurtosis
    'rolling_skew_t30', 'rolling_kurt_t30',
    # price-derived columns
    'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30',
    # date parts
    'year', 'quarter', 'month', 'week', 'day', 'dayofweek', 'is_weekend',
    # encoded identifier columns
    'item_id', 'dept_id', 'store_id', 'state_id',
]

In [None]:
# Starting hyperparameters (the original note calls them "random", i.e. not tuned).
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 42,
    'bagging_fraction': 0.75,
    'bagging_freq': 10,
    'colsample_bytree': 0.75}

train_set = lgb.Dataset(x_train[features], y_train)
val_set = lgb.Dataset(x_val[features], y_val)

# Early stopping and periodic logging are passed as callbacks: the
# early_stopping_rounds / verbose_eval keyword arguments of lgb.train were
# removed in LightGBM 4.0. log_evaluation was called print_evaluation before
# LightGBM 3.3, hence the fallback.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(
    params,
    train_set,
    num_boost_round = 2500,
    valid_sets = [train_set, val_set],
    callbacks = [lgb.early_stopping(1000), log_every(100)],
)

# RMSE on the validation window (y_true first, following the scikit-learn convention).
val_pred = model.predict(x_val[features])
val_score = np.sqrt(mean_squared_error(y_val, val_pred))
print(f'Our val rmse score is {val_score}')

# The binned datasets are no longer needed once the model is trained.
del train_set, val_set
gc.collect()

In [None]:
# Forecast the validation window. The predictions go into a separate frame so
# that the true 'demand' values in x_val are not overwritten and earlier cells
# stay safe to re-run.
y_pred = model.predict(x_val[features])
val_forecast = x_val[['id', 'date']].assign(demand = y_pred)
kaggle_submission = create_kaggle_submission_file(val_forecast)

In [None]:
from metrics import WRMSSEEvaluator

# Keep only the submission rows whose id mentions 'validation'.
is_validation_row = kaggle_submission['id'].map(lambda row_id: 'validation' in row_id)
kaggle_submission = kaggle_submission[is_validation_row]

# Drop the leading id column and give the forecasts the same column labels as
# the validation fold, so the evaluator receives identically labelled frames.
valid_preds = kaggle_submission.iloc[:, 1:]
valid_preds.columns = valid_fold_df.columns

evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

In [None]:
# Draw up to 20 distinct row positions from a seeded generator so the figures
# are reproducible. len() is used as the bound because the upper bound is
# exclusive; the previous bound (index.max()) could never select the last row.
n_series = len(valid_fold_df)
rand_indices = np.random.RandomState(42).choice(n_series, size = min(20, n_series), replace = False)

for position in rand_indices:
    true_values = valid_fold_df.iloc[position].values
    pred_values = valid_preds.iloc[position].values
    days = np.arange(0, true_values.shape[0])

    fig, ax = plt.subplots(figsize = (20, 10))
    ax.plot(days, true_values, label = 'actual')
    ax.plot(days, pred_values, label = 'predicted')
    ax.set_title(f'Validation fold, row {position}')
    ax.set_xlabel('day of validation window')
    ax.set_ylabel('demand')
    ax.legend()
    plt.show()

### Creating submission

In [None]:
# Forecast the test period. The predictions go into a separate frame rather
# than into test_df, so re-running this cell never changes its own input.
y_pred = model.predict(test_df[features])
test_forecast = test_df[['id', 'date']].assign(demand = y_pred)
kaggle_submission = create_kaggle_submission_file(test_forecast)

In [None]:
# Quick visual check of the first submission rows before saving.
kaggle_submission.head()

In [None]:
# Serialise the submission without the index column and store it under a fixed
# name, replacing any existing file of that name.
submission_csv = kaggle_submission.to_csv(index = False)
project.save_data("my_kaggle_submission.csv", submission_csv, overwrite=True)