In [17]:
from project_lib import Project
# Project handle; used at the end of the notebook to store the submission CSV
# via project.save_data(...).
project = Project.access()

### Simplest baseline model
For each series, predict the average of its daily sales over the last 30 days.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the sales history used by the baseline and preview its first rows.
sales_train_validation = pd.read_csv(
    '/project_data/data_asset/sales_train_validation.csv'
)
sales_train_validation.head()

In [None]:
# Load the submission template (an `id` column plus the F* forecast columns
# that are filled in below) and preview its first rows.
sample_submission = pd.read_csv(
    '/project_data/data_asset/sample_submission.csv'
)
sample_submission.head()

In [None]:
# Daily columns are selected by their 'd_' prefix. A prefix match (rather than
# a substring match) guarantees that a column which merely contains 'd_'
# somewhere in its name can never be mistaken for a day column.
d_cols = [column for column in sales_train_validation.columns if column.startswith('d_')]

# Baseline forecast: per-series mean over the 30 most recent day columns,
# stored as {id: mean} so it can be mapped onto the submission ids.
sales_avg_map = sales_train_validation.set_index('id')[d_cols[-30:]].mean(axis=1).to_dict()

In [None]:
# Forecast columns of the submission template (everything named F...).
fcols = [f for f in sample_submission.columns if f.startswith('F')]

# The baseline is the same value for every forecast day, so look it up once
# instead of re-mapping the ids for each F column. Ids that are missing from
# sales_avg_map get a forecast of 0.
baseline_forecast = sample_submission['id'].map(sales_avg_map).fillna(0)
for f in fcols:
    sample_submission[f] = baseline_forecast

In [None]:
# Disabled: uncomment to save the baseline submission (sample_submission) as
# my_submission.csv through the project handle.
#project.save_data("my_submission.csv", sample_submission.to_csv(index=False), overwrite=True)

### Light GBM

In [None]:
# Disabled: uncomment if lightgbm is not already installed in the kernel's environment.
#!pip install lightgbm

In [3]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import gc

### Read training data

In [4]:
# Load the pre-built feature frame. The cells below rely on its `part`, `date`
# and `demand` columns and on the model feature columns.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle(
    "/home/wsuser/work/project_data_assets/data_asset/full_data.pkl"
)

In [5]:
# Separate the rows labelled 'train' from those labelled 'test1', then drop the
# name of the combined frame so that only the two subsets stay referenced.
train_df = data[data['part'] == 'train']
test_df = data[data['part'] == 'test1']
del data

### Training/validation split

The last 28 days of the training data (2016-03-28 to 2016-04-24) are held out for validation.

In [6]:
# Rows dated up to and including 2016-03-27 are used for fitting; rows after
# that date, through 2016-04-24, form the validation window.
x_train = train_df[train_df['date'] <= '2016-03-27']
x_val = train_df[(train_df['date'] > '2016-03-27') & (train_df['date'] <= '2016-04-24')]

# Targets are the `demand` column of each subset.
y_train, y_val = x_train['demand'], x_val['demand']
del train_df

### Train the LightGBM model

In [7]:
# Define hyperparameters (arbitrarily chosen, not tuned)
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 42,
    'learning_rate': 0.1,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75}

# Columns fed to the model; the target (`demand`) is passed separately.
features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'wm_yr_wk', 'wday', 'event_name_1',
            'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 
            'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 'rolling_mean_t180',
            'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30']


train_set = lgb.Dataset(x_train[features], y_train)
val_set = lgb.Dataset(x_val[features], y_val)

# The early_stopping_rounds= and verbose_eval= keyword arguments were removed
# from lgb.train in LightGBM 4.0; the callbacks below are the supported way to
# request the same behaviour (stop after 50 rounds without improvement on the
# validation set, log the metrics every 100 rounds).
model = lgb.train(
    params,
    train_set,
    num_boost_round=2500,
    valid_sets=[train_set, val_set],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

# predict() uses the best iteration found by early stopping by default.
val_pred = model.predict(x_val[features])
# scikit-learn's convention is (y_true, y_pred); MSE is symmetric, so the
# score is unchanged.
val_score = np.sqrt(mean_squared_error(y_val, val_pred))
print(f'Our val rmse score is {val_score}')

Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.35535	valid_1's rmse: 2.1428
[200]	training's rmse: 2.30859	valid_1's rmse: 2.13539
Early stopping, best iteration is:
[161]	training's rmse: 2.3234	valid_1's rmse: 2.1331
Our val rmse score is 2.133101966070852


### Creating submission

In [8]:
def create_kaggle_submission_file(test, sample_submission_path='/project_data/data_asset/sample_submission.csv'):
    """Turn long-format predictions into a submission frame.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with `id`, `date` and `demand` columns, holding one row per
        (id, date) pair and exactly 28 distinct dates.
    sample_submission_path : str, optional
        Path of the sample-submission CSV (an `id` column plus forecast
        columns). Defaults to the project's sample submission, so existing
        one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The sample-submission ids that have predictions (in sample-submission
        order, columns `id`, `F1` .. `F28`), followed by the sample
        submission's rows whose id contains 'evaluation', left as they are in
        the CSV.

    Raises
    ------
    ValueError
        If `test` does not contain exactly 28 distinct dates, or if an
        (id, date) pair occurs more than once (raised by the pivot).
    """
    horizon = 28

    # Long -> wide: one row per id, one column per date (dates end up sorted).
    predictions = pd.pivot(
        test[['id', 'date', 'demand']], index='id', columns='date', values='demand'
    ).reset_index()

    n_days = predictions.shape[1] - 1  # every column except `id`
    if n_days != horizon:
        raise ValueError(
            f'expected predictions for {horizon} distinct dates, got {n_days}'
        )
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(horizon)]

    submission = pd.read_csv(sample_submission_path)

    # Rows whose id contains 'evaluation' are passed through untouched.
    evaluation = submission[submission['id'].str.contains('evaluation', regex=False)]

    # Inner merge: only ids that have predictions are kept, in template order.
    validation = submission[['id']].merge(predictions, on='id')
    return pd.concat([validation, evaluation])

In [9]:
# Predict demand for the test rows.
y_pred = model.predict(test_df[features])

# test_df was obtained by boolean-mask selection from the full frame, so
# writing a column into it in place triggers pandas' SettingWithCopyWarning.
# assign() returns a new frame carrying the predictions instead.
test_df = test_df.assign(demand=y_pred)
kaggle_submission = create_kaggle_submission_file(test_df)

In [18]:
# Serialise the submission without the index and store it through the project
# handle, replacing any earlier file of the same name.
submission_csv = kaggle_submission.to_csv(index=False)
project.save_data("my_kaggle_submission.csv", submission_csv, overwrite=True)

{'file_name': 'my_kaggle_submission.csv',
 'message': 'File saved to project storage.',
 'asset_id': '84294923-66aa-481b-a5c7-d5792cf3efc3'}