In [1]:
!pip install tqdm -U

Requirement already up-to-date: tqdm in /opt/conda/envs/Python-3.6/lib/python3.6/site-packages (4.45.0)


In [2]:
# Obtain the project handle; it is used at the end of the notebook
# (project.save_data) to store the generated submission file.
from project_lib import Project
project = Project.access()

In [3]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import gc

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [4]:
def create_kaggle_submission_file(test):
    """Turn long-format predictions into a frame laid out like the sample submission.

    Parameters
    ----------
    test : pandas.DataFrame
        Long-format predictions holding at least the columns ``id``, ``date``
        and ``demand``. It must cover exactly 28 distinct dates, because the
        pivoted date columns are renamed to ``F1`` .. ``F28``.

    Returns
    -------
    pandas.DataFrame
        The sample-submission ids that have predictions in ``test`` (in
        sample-submission order, with ``F1`` .. ``F28`` taken from ``test``),
        followed by the sample submission's rows whose id contains
        ``'evaluation'``, unchanged.
    """
    # One row per id, one column per date (pivot orders the date columns).
    wide = pd.pivot(test[['id', 'date', 'demand']], index = 'id', columns = 'date', values = 'demand').reset_index()
    wide.columns = ['id'] + ['F' + str(day) for day in range(1, 29)]

    sample = pd.read_csv('/project_data/data_asset/sample_submission.csv')

    # Rows of the template that belong to the evaluation phase are passed through as-is.
    is_evaluation = sample['id'].map(lambda row_id: 'evaluation' in row_id).astype(bool)
    evaluation_part = sample[is_evaluation]

    # Inner merge driven by the template keeps the template's id order.
    validation_part = sample[['id']].merge(wide, on = 'id')
    return pd.concat([validation_part, evaluation_part])

### Read training data

In [5]:
# Load the two pickled frames used below: `data` (later split by its `part` column)
# and `products` (joined onto `data` by `id` in the next cell).
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
data = pd.read_pickle("/home/wsuser/work/project_data_assets/data_asset/full_data.pkl")
products = pd.read_pickle("/home/wsuser/work/project_data_assets/data_asset/products.pkl")

In [6]:
# Join the product columns onto `data` by `id` (pd.merge defaults to an inner join).
data = pd.merge(data, products, on = 'id')
# Reclaim memory left over from the merge; the collected-object count is the cell output.
gc.collect()

34

In [7]:
from data_preprocessing import CategoricalEncoder

# Encode the categorical identifier columns of `data`.
# NOTE(review): CategoricalEncoder is project-local and the return value of encode() is
# discarded, so it presumably modifies `data` in place — confirm against data_preprocessing.
cat_columns = ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
encoder = CategoricalEncoder(cat_columns)
encoder.encode(data)
gc.collect()

0

In [8]:
# Split the combined frame by its `part` marker: 'train' rows are used for fitting and
# validation, 'test1' rows are the ones predicted for the final submission.
train_df = data.loc[data.part == 'train']
test_df = data.loc[data.part == 'test1']
# The combined frame is not referenced again, so drop it to free memory.
del data
gc.collect()

0

### Training/validation split

The last 28 days of the training data (2016-03-28 through 2016-04-24) are held out for evaluation.

#### Data for evaluation

In [None]:
# Raw competition tables; together with the two folds below they are the inputs
# handed to WRMSSEEvaluator in the scoring cells.
sales_train_validation = pd.read_csv('/project_data/data_asset/sales_train_validation.csv')
prices = pd.read_csv('/project_data/data_asset/sell_prices.csv')
calendar = pd.read_csv('/project_data/data_asset/calendar.csv')

# Wide-format split by column position: everything except the last 28 columns is the
# training fold, the last 28 columns are the validation fold.
train_fold_df = sales_train_validation.iloc[:, :-28]
valid_fold_df = sales_train_validation.iloc[:, -28:]

#### Data for training

In [None]:
# Training window: dates after 2013-01-01 up to and including 2016-03-27.
x_train = train_df.loc[(train_df['date'] > '2013-01-01') & (train_df['date'] <= '2016-03-27')]
y_train = x_train['demand']
# Validation window: the following 28 days, 2016-03-28 through 2016-04-24.
x_val = train_df.loc[(train_df['date'] > '2016-03-27') & (train_df['date'] <= '2016-04-24')]
y_val = x_val['demand']
# NOTE(review): the comparisons use string literals; this assumes `date` is either a
# datetime column or an ISO-formatted string column — confirm.
# train_df is not referenced again, so release it.
del train_df
gc.collect()

0

In [None]:
def plot_train_val_split(train_dates, train_values, val_dates, val_values):
    """Draw the training and validation parts of one series on a shared axis.

    Parameters
    ----------
    train_dates, train_values : array-like
        x and y values of the training part.
    val_dates, val_values : array-like
        x and y values of the validation part.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    fig, ax = plt.subplots(figsize = (20, 10))
    # Label both lines so the figure can be read without looking at the code.
    ax.plot(train_dates, train_values, label = 'train')
    ax.plot(val_dates, val_values, label = 'validation')
    ax.set_xlabel('date')
    ax.set_ylabel('value')
    ax.set_title('Train / validation split')
    ax.legend()
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Visualise the split for one example series: filter each frame once, then plot.
sample_id = "HOBBIES_1_001_CA_1_validation"
train_item = x_train.loc[x_train.id == sample_id]
val_item = x_val.loc[x_val.id == sample_id]

plot_train_val_split(
    pd.to_datetime(train_item["date"]).values,
    train_item["demand"].values,
    pd.to_datetime(val_item["date"]).values,
    val_item["demand"].values,
)

### Naive method

Forecast every validation day of a series with that series' mean demand over the training set.

In [None]:
# Mean training demand per series id, as a two-column frame (id, demand).
predictions = x_train.groupby("id")["demand"].mean().reset_index()

In [None]:
# Broadcast each series' mean training demand onto every validation day.
# `predictions` (per-id means from the previous cell) is reused so the groupby over
# the full training frame is not computed a second time. Only `id` and `date` are
# taken from x_val, so the merged frame has the columns id, date, demand directly
# and no suffix-based renaming is needed.
predictions_by_date = pd.merge(x_val[["id", "date"]], predictions, on = 'id')
kaggle_submission = create_kaggle_submission_file(predictions_by_date)

In [None]:
# WRMSSEEvaluator is first needed in this cell, so it is imported here; without this
# import a fresh "Restart & Run All" stops with a NameError on the line that builds
# the evaluator.
from metrics import WRMSSEEvaluator

# Keep only the validation-phase rows of the submission and drop the id column so the
# frame lines up column-for-column with the validation fold.
validation_rows = [row for row in kaggle_submission['id'] if 'validation' in row]
kaggle_submission = kaggle_submission[kaggle_submission['id'].isin(validation_rows)]
valid_preds = kaggle_submission.iloc[:, 1:]
valid_preds.columns = valid_fold_df.columns

# Score the naive forecast; the score is the cell output.
evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

### ARMA

In [None]:
import statsmodels.api as sm
from statsmodels import tsa

In [None]:
def apply_stats_ARMA_model(df, first_day = "d_1", last_day = "d_1885", horizon = 28):
    """Fit an AR(3) model (ARMA order (3, 0)) per row and forecast the next days.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-format sales frame with an ``id`` column and one column per day.
    first_day, last_day : str
        Labels of the first and last day column (inclusive) used as history.
        The defaults reproduce the previously hard-coded window.
    horizon : int
        Number of steps to forecast (default 28).

    Returns
    -------
    pandas.DataFrame
        One row per input row: the id followed by ``horizon`` forecast values.
    """
    prediction = []
    for _, row in df.iterrows():
        # iterrows() yields object-dtype rows when the frame mixes strings and numbers,
        # so convert explicitly to give the estimator a numeric array. The model only
        # ever received a bare array, so no date index is attached to the history.
        history = np.asarray(row.loc[first_day:last_day], dtype = float)
        model = sm.tsa.ARMA(history, (3, 0)).fit(disp=False)
        # forecast() returns (forecast, standard errors, confidence intervals);
        # only the point forecast is kept.
        forecast, _, _ = model.forecast(steps = horizon)
        prediction.append([row["id"]] + forecast.tolist())
    return pd.DataFrame(prediction)

In [None]:
# NOTE(review): only the first 10 series are forecast here (iloc[:10]), while the scoring
# cell below builds the evaluator from the full train/validation folds — confirm the
# evaluator accepts a 10-row prediction frame, or forecast every row.
submission = apply_stats_ARMA_model(train_fold_df.iloc[:10])
# Reuse the submission layout (id followed by the forecast columns) for the column names.
submission.columns = kaggle_submission.columns

In [None]:
# Drop the id column and align the forecast columns with the validation fold's columns.
valid_preds = submission.iloc[:, 1:]
valid_preds.columns = valid_fold_df.columns

# Score the ARMA forecasts; the score is the cell output.
evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

### Exponential smoothing

In [None]:
def apply_stats_exp_smooth_model(df, first_day = "d_1", last_day = "d_1885", horizon = 28):
    """Fit an exponential-smoothing model per row and forecast the next days.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-format sales frame with an ``id`` column and one column per day.
    first_day, last_day : str
        Labels of the first and last day column (inclusive) used as history.
        The defaults reproduce the previously hard-coded window.
    horizon : int
        Number of steps to forecast (default 28).

    Returns
    -------
    pandas.DataFrame
        One row per input row: the id followed by ``horizon`` forecast values.
    """
    prediction = []
    for _, row in df.iterrows():
        # iterrows() yields object-dtype rows when the frame mixes strings and numbers,
        # so convert explicitly to give the estimator a numeric array. The model only
        # ever received a bare array, so no date index is attached to the history.
        history = np.asarray(row.loc[first_day:last_day], dtype = float)
        model = tsa.holtwinters.ExponentialSmoothing(history).fit()
        forecast = model.forecast(steps = horizon)
        prediction.append([row["id"]] + forecast.tolist())
    return pd.DataFrame(prediction)

In [None]:
# Forecast every series in the training fold with exponential smoothing (one model per row).
submission = apply_stats_exp_smooth_model(train_fold_df)
# Reuse the submission layout (id followed by the forecast columns) for the column names.
submission.columns = kaggle_submission.columns

In [None]:
# Drop the id column and align the forecast columns with the validation fold's columns.
valid_preds = submission.iloc[:, 1:]
valid_preds.columns = valid_fold_df.columns

# Score the exponential-smoothing forecasts; the score is the cell output.
evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

### LightGBM

In [None]:
!pip install lightgbm

In [None]:
import lightgbm as lgb

In [None]:
# Model input columns; the same list is used to select features for both the
# LightGBM and the CatBoost models below.
# Note: `cat_id` is among the encoded categorical columns but is not listed here.
features = ['event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'shift_t28', 
             'shift_t29', 'shift_t30', 'rolling_std_t7', 'rolling_std_t30', 'rolling_std_t60',
             'rolling_std_t90', 'rolling_std_t180', 'rolling_mean_t7',
             'rolling_mean_t30', 'rolling_mean_t60', 'rolling_mean_t90',
             'rolling_mean_t180', 'rolling_skew_t30', 'rolling_kurt_t30',
             'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30', 'year', 
             'quarter', 'month', 'week', 'day', 'dayofweek', 'is_weekend', 'item_id', 'dept_id',
             'store_id', 'state_id']

In [None]:
# define random hyperparameters
#
# Earlier parameter set, kept for reference only (it is never passed to LightGBM):
# params = {
#     'boosting_type': 'gbdt',
#     'metric': 'rmse',
#     'objective': 'regression',
#     'n_jobs': -1,
#     'seed': 42,
#     'bagging_fraction': 0.75,
#     'bagging_freq': 10,
#     'colsample_bytree': 0.75}

params = {
    "objective": "poisson",
    # A dict literal keeps only the last value of a repeated key, so "metric" is
    # listed exactly once, with the value that was effective.
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    # "sub_feature": 0.8,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    # "nthread": 4,
    "verbosity": 1,
    "num_iterations": 1500,
}

# Initial features
#features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'wm_yr_wk', 'wday', 'event_name_1',
#            'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 
#            'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 'rolling_mean_t180',
#            'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30']


train_set = lgb.Dataset(x_train[features], y_train)
val_set = lgb.Dataset(x_val[features], y_val)

# Train on the training window, reporting RMSE on both sets every 100 iterations.
model = lgb.train(params, train_set, valid_sets = [train_set, val_set], verbose_eval = 100)
val_pred = model.predict(x_val[features])
# scikit-learn's signature is (y_true, y_pred); the value is the same either way for
# squared error, but the conventional order avoids surprises if the metric is swapped.
val_score = np.sqrt(mean_squared_error(y_val, val_pred))
print(f'Our val rmse score is {val_score}')

# The Dataset wrappers are not needed after training.
del train_set, val_set
gc.collect()



[100]	training's rmse: 2.52089	valid_1's rmse: 2.18867
[200]	training's rmse: 2.44592	valid_1's rmse: 2.16706
[300]	training's rmse: 2.40357	valid_1's rmse: 2.15259


In [None]:
# Predict the validation window and build the submission from a small separate frame.
# x_val is a slice of the training frame and its `demand` column holds the ground
# truth, so the predictions are attached to a copy of (id, date) instead of being
# written into x_val (no SettingWithCopyWarning, x_val stays intact).
y_pred = model.predict(x_val[features])
val_predictions = x_val[['id', 'date']].assign(demand = y_pred)
kaggle_submission = create_kaggle_submission_file(val_predictions)

In [None]:
from metrics import WRMSSEEvaluator

# Boolean mask of the validation-phase rows, then drop the id column so the
# predictions line up with the validation fold's columns.
is_validation = kaggle_submission['id'].map(lambda row_id: 'validation' in row_id).astype(bool)
kaggle_submission = kaggle_submission[is_validation]
valid_preds = kaggle_submission.iloc[:, 1:]
valid_preds.columns = valid_fold_df.columns

# Score the LightGBM forecasts; the score is the cell output.
evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

In [None]:
# Plot actual vs. predicted demand for a reproducible random sample of 20 series.
rng = np.random.RandomState(42)
# randint's upper bound is exclusive, so the row count is used; bounding by
# index.max() would never select the last row.
rand_indices = rng.randint(0, len(valid_fold_df), 20)

for position in rand_indices:
    true_row = valid_fold_df.iloc[position]
    pred_row = valid_preds.iloc[position]
    days = np.arange(0, true_row.shape[0])

    fig, ax = plt.subplots(figsize = (20, 10))
    ax.plot(days, true_row.values, label = 'actual')
    ax.plot(days, pred_row.values, label = 'predicted')
    ax.set_xlabel('day of validation window')
    ax.set_ylabel('demand')
    ax.set_title(f'Validation series at row {position}')
    ax.legend()
    plt.show()
    # Close each figure so the loop does not keep 20 figures open.
    plt.close(fig)

### CatBoost

In [None]:
!pip install catboost -U

In [None]:
# Replace missing values with 0 before building the CatBoost pools.
# NOTE(review): x_train and x_val are slices of the training frame, so these in-place
# calls may emit SettingWithCopyWarning. y_train / y_val were extracted earlier and are
# not affected by this fill.
x_train.fillna(0, inplace = True)
x_val.fillna(0, inplace = True)

In [None]:
from catboost import Pool, CatBoostRegressor

# Columns CatBoost should treat as categorical (shared by both pools)
cat_feature_names = ['item_id', 'dept_id', 'store_id', 'state_id']

# The training pool carries the integer-cast target; the pool built from x_val
# holds features only and is used for prediction.
train_pool = Pool(x_train[features], y_train.astype(int), cat_features=cat_feature_names)
test_pool = Pool(x_val[features], cat_features=cat_feature_names)

# x_train is not referenced again once the pools exist, so release it
del x_train
gc.collect()

# Training settings: 100 iterations, RMSE loss, explicit RAM limit
catboost_settings = dict(iterations=100, loss_function='RMSE', used_ram_limit=34360000000)
model = CatBoostRegressor(**catboost_settings)

# Fit on the training pool, then predict the validation window
model.fit(train_pool)
preds = model.predict(test_pool)
print(preds)

In [None]:
# NOTE(review): this cell repeats the model construction, fit and predict from the end of
# the preceding CatBoost cell with the same settings and the same pools; it rebinds
# `model` and `preds`. Consider removing one of the two.
# specify the training parameters 
model = CatBoostRegressor(iterations=100, 
                          loss_function='RMSE', 
                          used_ram_limit=34360000000)
#train the model
model.fit(train_pool)
# make the prediction using the resulting model
preds = model.predict(test_pool)
print(preds)

In [None]:
# Build the submission from a small separate frame instead of overwriting the
# ground-truth `demand` column of x_val (x_val is a slice of the training frame, so
# writing into it also triggers SettingWithCopyWarning).
val_predictions = x_val[['id', 'date']].assign(demand = preds)
kaggle_submission = create_kaggle_submission_file(val_predictions)

In [None]:
from metrics import WRMSSEEvaluator

# Boolean mask of the validation-phase rows, then drop the id column so the
# predictions line up with the validation fold's columns.
is_validation = kaggle_submission['id'].map(lambda row_id: 'validation' in row_id).astype(bool)
kaggle_submission = kaggle_submission[is_validation]
valid_preds = kaggle_submission.iloc[:, 1:]
valid_preds.columns = valid_fold_df.columns

# Score the CatBoost forecasts; the score is the cell output.
evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
evaluator.score(valid_preds)

In [None]:
# Predict the test rows with the CatBoost model and store the submission.
# The pool is built from test_df, so it is named for what it holds.
submission_pool = Pool(test_df[features],
                       cat_features=['item_id', 'dept_id', 'store_id', 'state_id'])
y_pred = model.predict(submission_pool)
# test_df is a slice of the combined frame; attach the predictions to a copy of
# (id, date) rather than writing into the slice (avoids SettingWithCopyWarning).
test_predictions = test_df[['id', 'date']].assign(demand = y_pred)
kaggle_submission = create_kaggle_submission_file(test_predictions)
project.save_data("my_kaggle_submission.csv", kaggle_submission.to_csv(index = False), overwrite=True)

### Creating submission

In [None]:
# NOTE(review): `model` is whichever estimator was fit last; when the notebook is run
# top to bottom that is the CatBoost regressor, not the LightGBM booster — confirm which
# model this submission is meant to use.
y_pred = model.predict(test_df[features])
# test_df is a slice of the combined frame; attach the predictions to a copy of
# (id, date) rather than writing into the slice (avoids SettingWithCopyWarning).
test_predictions = test_df[['id', 'date']].assign(demand = y_pred)
kaggle_submission = create_kaggle_submission_file(test_predictions)

In [None]:
# Preview the first rows of the submission frame.
kaggle_submission.head()

In [None]:
# Store the submission as CSV text (without the index column) under
# my_kaggle_submission.csv via the project handle, replacing any existing file.
project.save_data("my_kaggle_submission.csv", kaggle_submission.to_csv(index = False), overwrite=True)