#**Final Project for Coursera's 'How to Win a Data Science Competition'**
April, 2020


(Competition Info last updated:  3 years ago)

##**About this Competition**

You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.

Evaluation: root mean squared error (RMSE). True target values are clipped into [0,20] range.

.

##**File descriptions**

***sales_train.csv*** - the training set. Daily historical data from January 2013 to October 2015.

***test.csv*** - the test set. You need to forecast the sales for these shops and products for November 2015.

***sample_submission.csv*** - a sample submission file in the correct format.

***items.csv*** - supplemental information about the items/products.

***item_categories.csv***  - supplemental information about the items categories.

***shops.csv***- supplemental information about the shops.

.

##**Data fields**

***ID*** - an Id that represents a (Shop, Item) tuple within the test set

***shop_id*** - unique identifier of a shop

***item_id*** - unique identifier of a product

***item_category_id*** - unique identifier of item category

***item_cnt_day*** - number of products sold. You are predicting a monthly amount of this measure

***item_price*** - current price of an item

***date*** - date in format dd/mm/yyyy

***date_block_num*** - a consecutive month number. January 2013 is 0, February 2013 is 1,..., October 2015 is 33

***item_name*** - name of item

***shop_name*** - name of shop

***item_category_name*** - name of item category

#Load Files
Load competition data files and import helpful custom code libraries from shared GitHub repository

In [1]:
#Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
import time
import pickle
from lightgbm import LGBMRegressor
import lightgbm as lgb
import sklearn
import time

In [2]:
# Only run the drive.mount() code below when working in Google Colab with the files stored in a Google Drive directory.
from google.colab import drive
drive.mount('/content/drive')


# Base directory on the mounted drive; the data-file and model paths used in later cells are built by appending to it.
file_dir_path = '/content/drive/My Drive/Colab Notebooks/Coursera_Data_Science_Competitions_Kaggle_project/Kag'

KeyboardInterrupt: ignored

In [None]:
#Basic data files (provided by kaggle)
# Every path is built as file_dir_path + data_folder + file name.
data_folder =  '/data_output'
items = pd.read_csv(file_dir_path + data_folder +  '/items.csv')
sales_train = pd.read_csv(file_dir_path + data_folder + '/sales_train.csv.gz')
test = pd.read_csv(file_dir_path + data_folder + '/test.csv.gz')

In [None]:
#Created data files
# days_by_month is merged into matrix later on its 'month' column (renamed to 'date_block_num');
# presumably it also supplies the 'MoY' column used for the mean encoding - verify against the CSV.
days_by_month = pd.read_csv(file_dir_path +  data_folder + '/days_by_month.csv')

In [None]:
#Code Controls - keep as set
# When True, the LightGBM cell unpickles a saved model instead of fitting a new one.
# Note that a later cell reassigns load_model = False right before the LightGBM cell.
load_model = True

Helpful functions

In [None]:
def lag_feature(df, lags, col):
    """Build lagged copies of one column, keyed by month / shop / item.

    For every ``lag`` in ``lags`` the values of ``col`` are re-keyed to
    ``date_block_num + lag`` and stored in a column named
    ``col + '_lag_' + str(lag)``. Merging the result back onto the source
    frame on ``['date_block_num', 'shop_id', 'item_id']`` therefore places
    the value observed ``lag`` months earlier on each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id', 'item_id' and ``col``.
    lags : sequence of int
        Month offsets to generate, one output column per offset.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        The three key columns plus one lagged column per entry in ``lags``.
        With an empty ``lags`` an empty frame holding only the key columns
        is returned, so a left merge of the result adds nothing.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    merged = None
    for lag in lags:
        shifted = base.copy()
        shifted.columns = keys + [col + '_lag_' + str(lag)]
        shifted['date_block_num'] += lag
        if merged is None:
            merged = shifted
        else:
            # Outer join: a (month, shop, item) key may exist for this lag but
            # not for the first one. A left join anchored on the first lag
            # would silently discard those values.
            merged = pd.merge(merged, shifted, on=keys, how='outer')
    if merged is None:
        return df[keys].iloc[:0].copy()
    return merged

import numpy as np
def downcast_df(data):
    """Shrink numeric columns of ``data`` in place to save memory.

    Integer columns are cast to int16 and floating columns to float16,
    except for the columns named in ``cols_not_to_downcast``, which are cast
    to int32 / float32. A cast is only applied when the column's finite
    minimum and maximum fit the target type; otherwise the next wider type
    is tried, and if none fits the column is left unchanged. This prevents
    integer wrap-around and float16 overflow to ``inf``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    # NOTE(review): 'price_' is compared by exact name, so it never matches the
    # 'price_*' feature columns - confirm whether a prefix match was intended.
    cols_not_to_downcast = ['item_id', 'ID', 'item_cnt_month', 'sales_sum_by_month', 'cluster_code', 'price_']
    for col in data.columns:
        series = data[col]
        col_type = series.dtype
        keep_wide = col in cols_not_to_downcast
        if np.issubdtype(col_type, np.integer):
            candidates = [np.int32] if keep_wide else [np.int16, np.int32]
            limits_of = np.iinfo
        elif np.issubdtype(col_type, np.floating):
            # float16 makes df.describe() mean/std come out as NaN, but is the
            # smallest option when memory is tight.
            candidates = [np.float32] if keep_wide else [np.float16, np.float32]
            limits_of = np.finfo
        else:
            continue
        lowest, highest = series.min(), series.max()
        for target in candidates:
            limits = limits_of(target)
            # NaN (empty / all-NaN column) and +-inf are representable in any
            # float type, so only finite extremes are range-checked.
            fits_low = (not np.isfinite(lowest)) or limits.min <= lowest
            fits_high = (not np.isfinite(highest)) or highest <= limits.max
            if fits_low and fits_high:
                data[col] = series.astype(target)
                break
    return data

def upcast_df(data):
    """Widen numeric columns of ``data`` in place and return it.

    Every integer column becomes int32 and every floating column becomes
    float64; columns of any other dtype are left untouched.
    """
    for name, current in data.dtypes.items():
        if np.issubdtype(current, np.integer):
            widened = np.int32
        elif np.issubdtype(current, np.floating):
            widened = np.float64
        else:
            continue
        data[name] = data[name].astype(widened)
    return data


def infer_variable_types(data):
    """Group column names of ``data`` by dtype family.

    Integer-typed columns are reported under 'categorical', floating-typed
    columns under 'numerical'; columns of any other dtype are omitted.
    """
    categorical, numerical = [], []
    for name in data.columns:
        kind = data[name].dtype
        if np.issubdtype(kind, np.integer):
            categorical.append(name)
        elif np.issubdtype(kind, np.floating):
            numerical.append(name)
    return {'categorical': categorical, 'numerical': numerical}

def sort_variable_types(data, categorical_cols, numerical_cols):
    """Cast the listed columns of ``data`` in place and return it.

    Columns named in ``categorical_cols`` become int8, except 'item_id' and
    'ID', which become int32. Columns named in ``numerical_cols`` become
    float16. A column present in both lists is treated as categorical;
    columns in neither list are left unchanged.
    """
    wide_int_cols = ('item_id', 'ID')
    for name in data.columns:
        if name in categorical_cols:
            target = np.int32 if name in wide_int_cols else np.int8
        elif name in numerical_cols:
            target = np.float16
        else:
            continue
        data[name] = data[name].astype(target)
    return data

# Data Preparation




Make monthly table ("matrix") of Shop-Item pairs (using cartesian product of date_block_num, shop_id, item_id)

In [None]:
# For each of the 34 months (date_block_num 0..33) build every (month, shop, item) combination
# from the shops and the items that appear in sales_train for that month.
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = sales_train[sales_train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)

# Store the key columns in narrow integer types, then order rows by month, shop, item.
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
print("monthly table is")
matrix.head()

Include the test set and its test IDs (of shop-item pairs) given in test.csv

In [None]:
# Tag the test rows as month 34 (the month after the last training month, 33) and match the key dtypes of matrix.
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

# NOTE(review): keys=cols is passed together with ignore_index=True - confirm it has any effect here.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=cols)
# Training-grid rows have no ID, so this fill sets their ID to 0.
matrix.fillna(0, inplace=True) # 34 month
matrix['ID'] = matrix['ID'].astype(np.int32)

matrix

Add monthly item_count (sales) to the monthly table

In [None]:
ts = time.time()
# Sum the daily counts into one monthly count per (month, shop, item).
monthly_sales = sales_train.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': ['sum']})
monthly_sales.columns = ['item_cnt_month']
monthly_sales.reset_index(inplace=True)

# Grid rows without any sales (and the month-34 test rows) get a target of 0.
matrix = pd.merge(matrix, monthly_sales, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float32))
matrix.tail()

In [None]:
# Attach item_category_id to matrix (the table the grouped statistics are merged into later)
matrix = pd.merge(matrix, items[['item_id', 'item_category_id']], on=['item_id'], how='left')
# ...and to sales_train, so it can be grouped by category later
sales_train = pd.merge(sales_train, items[['item_id', 'item_category_id']], on=['item_id'], how='left')

In [None]:
#Replace NaNs with 0
matrix.fillna(0, inplace=True)

#Downcast variables (smaller memory size)
matrix = downcast_df(matrix)
# Print the resulting column dtypes and memory usage.
matrix.info()

Understanding dataframe created ("matrix")



In [None]:
#NOTE: For item_category_features we have a very small percentage of null values (maybe a few item ids present in matrix, not mapped to an item category during analysis)
# Build a summary table: describe() output plus extra rows for dtype, row count, null share and zero count per column.
df = matrix
df1 = df.describe(include = 'all')

df1.loc['dtype'] = df.dtypes
df1.loc['size'] = len(df)
df1.loc['% null count'] = df.isnull().mean()
# Non-null entries minus non-zero entries; NaNs count as non-zero in np.count_nonzero.
df1.loc['count of 0s'] = df.apply(lambda col: (col.count() - np.count_nonzero(col)))
df1

### **Feature Generation/Engineering**

Time series features
*   Statistics of previous months (e.g. mean of item_price for a specific item/shop in previous months)
*   Trends of previous months - rate of change of the above statistics based features (e.g. rate of change of mean item_price from today to the past 3 months for a specific shop/item)




Stage 1: Statistics based features

> 1st step: Compute their Values


In [None]:
#----------------------------------------Features of TS&Stats v1 (v1.4) --------------------------------------------------
#Compute mean, std and median of item price, and sum, mean, std and median of item cnt, for the groupings below.
#Each grouped table is left-merged into matrix on its grouping keys, so every matrix row carries
#the statistics of its own month; they are lagged in the following cells.


#1)
#per date_block_num and item_id
monthly_sales_per_item = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean', 'std', 'median'],
                                                              'item_cnt_day': ['sum', 'mean', 'std', 'median']
                                                              })
monthly_sales_per_item.columns = ['price_mean_by_item', 'price_std_by_item', 'price_median_by_item',
                'sales_sum_by_item', 'sales_mean_by_item', 'sales_std_by_item', 'sales_median_by_item']
monthly_sales_per_item.reset_index(inplace=True)
matrix = pd.merge(matrix, monthly_sales_per_item, on=['date_block_num','item_id'], how='left')

#2)
#per date_block_num and shop_id
monthly_sales_per_shop = sales_train.groupby(['date_block_num','shop_id']).agg({'item_price': ['mean', 'std', 'median'],
                                                              'item_cnt_day': ['sum', 'mean', 'std', 'median']})
monthly_sales_per_shop.columns = ['price_mean_by_shop', 'price_std_by_shop', 'price_median_by_shop',
                'sales_sum_by_shop', 'sales_mean_by_shop', 'sales_std_by_shop', 'sales_median_by_shop']
monthly_sales_per_shop.reset_index(inplace=True)
matrix = pd.merge(matrix, monthly_sales_per_shop, on=['date_block_num','shop_id'], how='left')

#3)
#per date_block_num and item_category
monthly_sales_per_item_category = sales_train.groupby(['date_block_num','item_category_id']).agg({'item_price': ['mean', 'std', 'median'],
                                                              'item_cnt_day': ['sum', 'mean', 'std', 'median']})
monthly_sales_per_item_category.columns = ['price_mean_by_item_category', 'price_std_by_item_category', 'price_median_by_item_category',
                'sales_sum_by_item_category', 'sales_mean_by_item_category', 'sales_std_by_item_category', 'sales_median_by_item_category']

monthly_sales_per_item_category.reset_index(inplace=True)
matrix = pd.merge(matrix, monthly_sales_per_item_category, on=['date_block_num','item_category_id'], how='left')

#4)
#per date_block_num, item_id, and shop_id
monthly_sales_per_shop_and_item = sales_train.groupby(['date_block_num','item_id', 'shop_id']).agg({'item_price': ['mean', 'std', 'median'],
                                                              'item_cnt_day': ['sum', 'mean', 'std', 'median']})
monthly_sales_per_shop_and_item.columns = ['price_mean_by_item_and_shop', 'price_std_by_item_and_shop', 'price_median_by_item_and_shop',
                'sales_sum_by_item_and_shop', 'sales_mean_by_item_and_shop', 'sales_std_by_item_and_shop', 'sales_median_by_item_and_shop']

monthly_sales_per_shop_and_item.reset_index(inplace=True)
matrix = pd.merge(matrix, monthly_sales_per_shop_and_item, on=['date_block_num','item_id', 'shop_id'], how='left')

#5)
#per date_block_num
monthly_sales_per_date_block = sales_train.groupby(['date_block_num']).agg({'item_price': ['mean', 'std', 'median'],
                                                              'item_cnt_day': ['sum', 'mean', 'std', 'median']})
monthly_sales_per_date_block.columns = ['price_mean_by_month', 'price_std_by_month', 'price_median_by_month',
                'sales_sum_by_month', 'sales_mean_by_month', 'sales_std_by_month', 'sales_median_by_month']

monthly_sales_per_date_block.reset_index(inplace=True)
matrix = pd.merge(matrix, monthly_sales_per_date_block, on=['date_block_num'], how='left')

# Rows with no matching group (including all month-34 rows, which have no sales data) get 0 for every statistic.
matrix.fillna(0, inplace=True) # 34 month
matrix = downcast_df(matrix)

matrix

# Month offsets at which the time-series features are lagged.
TS_lags = [
          1, 3, 6, 12,
          2,           #Need 2,4,7,13 lags to calculate 1m, 3m, 6m, 12m trends 
          #4, 7, 13
          ]

# Features lagged at every offset in TS_lags (the mean price and the sales sum of each grouping).
TS_features = ['price_mean_by_item', 'sales_sum_by_item', 
              'price_mean_by_shop', 'sales_sum_by_shop',
              'price_mean_by_item_category', 'sales_sum_by_item_category',
              'price_mean_by_item_and_shop', 'sales_sum_by_item_and_shop',
              'price_mean_by_month', 'sales_sum_by_month']

# The remaining statistics are lagged by the previous month only.
Stats_lags = [1]

Stats_features = [
                  'price_std_by_item', 'price_median_by_item', 'sales_std_by_item', 'sales_median_by_item', 'sales_mean_by_item',
                  'price_std_by_shop', 'price_median_by_shop', 'sales_std_by_shop', 'sales_median_by_shop', 'sales_mean_by_shop',
                  'price_std_by_item_category', 'price_median_by_item_category', 'sales_std_by_item_category', 'sales_median_by_item_category', 'sales_mean_by_item_category',
                  'price_std_by_item_and_shop', 'price_median_by_item_and_shop',  'sales_std_by_item_and_shop', 'sales_median_by_item_and_shop', 'sales_mean_by_item_and_shop',
                  'price_std_by_month', 'price_median_by_month', 'sales_std_by_month', 'sales_median_by_month',  'sales_mean_by_month'
                ]
# Lags compared against lag 1 when the trend features are computed; each must also appear in TS_lags.
trend_lags = [2, 
              #4, 7, 13
              ]

matrix.head()

> 2nd step: Lag them (put them in the same row/month as the one you'll be using them to predict - e.g. if you are going to use the mean of item_price from 6 months ago to predict next month's item_cnt, put that 6-month-old mean in the same row as the current month's values, which are used to predict next month)




In [None]:
#---------------------------Create Time series based features--------------------------
# Time-series features: the per-group price mean / sales sum observed at each of the
# previous-month offsets in TS_lags, attached to matrix as <feature>_lag_<n> columns.
# TODO: parallelize this loop and the stats-feature loops in the following cells.
ts = time.time()

for ts_feature in TS_features:
  matrix_lagged = lag_feature(matrix, TS_lags, ts_feature)
  # Downcast after every merge to keep the growing table small.
  matrix = downcast_df(
      pd.merge(matrix, matrix_lagged, on=['date_block_num','shop_id','item_id'], how='left')
  )
del matrix_lagged
print(time.time() - ts)
matrix.tail()

In [None]:
#---------------------------Create Stats based features--------------------------------
# Stats features: the remaining per-group statistics, lagged by the previous month only.
# The feature list is processed in three slices (this cell and the next two) as a quick
# workaround for the session running out of RAM (if GPU still crashes, use TPU - has more RAM).
length = len(Stats_features)
index = length // 3
Stats_features_first = Stats_features[:index]
Stats_features_second = Stats_features[index:(index * 2)]
Stats_features_third = Stats_features[(index * 2):]

for stats_feature in Stats_features_first:
  matrix_lagged = lag_feature(matrix, Stats_lags, stats_feature)
  matrix = pd.merge(matrix, matrix_lagged, on=['date_block_num','shop_id','item_id'], how='left')
del matrix_lagged
print(Stats_features_first)

In [None]:
# Second slice of the stats features: lag each one and attach it to matrix.
for stats_feature in Stats_features_second:
  matrix_lagged = lag_feature(matrix, Stats_lags, stats_feature)
  matrix = pd.merge(matrix, matrix_lagged, on=['date_block_num','shop_id','item_id'], how='left')
del matrix_lagged
print(Stats_features_second)

In [None]:
# Third slice of the stats features: lag each one and attach it to matrix.
for stats_feature in Stats_features_third:
  matrix_lagged = lag_feature(matrix, Stats_lags, stats_feature)
  matrix = pd.merge(matrix, matrix_lagged, on=['date_block_num','shop_id','item_id'], how='left')
del matrix_lagged
print(Stats_features_third)
# The un-lagged statistics describe the row's own month; only their lagged copies
# (added as new columns by lag_feature) are kept.
fetures_to_drop = TS_features + Stats_features
matrix = matrix.drop(fetures_to_drop, axis=1).fillna(0)
matrix[matrix['date_block_num'] == 13].head()

2nd Stage: Trend based features


> Rate of change of Time series based features (mean of price or item count at past lags/months). Rates of change are calculated for the past 1m



In [None]:
ts = time.time()

# For each time-series feature and each lag i in trend_lags, store
# (value at lag i - value at lag 1) / value at lag 1 in a column named trend_<feature>_lag_<i-1>.
# A zero lag-1 value yields inf or NaN here; the next cell replaces both with 0.
for TS_feature in TS_features:
  for i in trend_lags:
    matrix['trend_' + TS_feature + '_lag_'+str(i-1)] = \
        (matrix[TS_feature +'_lag_'+str(i)] - matrix[TS_feature + '_lag_1']) / matrix[TS_feature + '_lag_1']
print(time.time()-ts)
matrix.tail()


In [None]:
import numpy as np
# Turn +-inf into NaN, then fill every NaN with 0.
matrix = matrix.replace([np.inf, -np.inf], np.nan)
matrix.fillna(0, inplace=True)
matrix[matrix['date_block_num'] ==  14].head()

# Load/Save Preprocessed Data

In [None]:
# Directory where the preprocessed feature table is written to and read from as a .feather file.
file_dir_path_not_in_git = '/content/drive/My Drive/Colab Notebooks/Datasets/Kaggle_Coursera/'

Write

In [None]:
#only use if intending to write data
filename_to_write = 'TS&Stats_features_Submission'
# Keep months 14 and later only.
data = matrix[matrix['date_block_num'] >= 14]
# reset_index() turns the row index into an 'index' column; every column is stored as float32.
data.reset_index().astype('float32').to_feather(file_dir_path_not_in_git + filename_to_write + '.feather')

Read

In [None]:
filename_to_read = 'TS&Stats_features_Submission'
matrix = pd.read_feather(file_dir_path_not_in_git + filename_to_read + '.feather', columns=None, use_threads=True)
# The file stores everything as float32: restore integer key/ID columns and the original row index.
matrix = matrix.astype({'index': np.int32, 'shop_id':np.int32,'item_id':np.int32, 'date_block_num':np.int32, 'ID':np.int32}).set_index('index')
matrix = matrix.replace([np.inf, -np.inf], np.nan)
matrix.fillna(0, inplace=True)
matrix.tail()

Merge read features tables with any created features

In [None]:
# Join the per-month calendar table; its 'month' column is renamed so it can serve as the date_block_num key.
matrix = pd.merge(matrix, days_by_month.rename(columns= {'month': 'date_block_num'}), on=['date_block_num'], how='left')
matrix.head()

In [None]:
#Bad mean encoding
# Mean target per 'MoY' value, computed over every row currently in matrix. That includes the
# rows later used for validation and the month-34 test rows, whose item_cnt_month was filled with 0.
moy_mean = matrix.groupby(['MoY']).agg({'item_cnt_month':'mean'})
moy_mean.columns = ['mean_sales_by_MoY']
moy_mean.reset_index(inplace=True)
matrix = pd.merge(matrix,moy_mean , on=['MoY'], how='left')
matrix.head()

# Modelling



*   Train/Val/Test split
*   Model Fit & Validate
*   Test/Submission Results





**Train/Test split**

In [None]:
# Split matrix by month: training months up to train_final_index, validation months after
# that up to 33, and month 34 as the test set.
data = matrix

use_toy_data = False #to be used just for code to run quicker when tests are needed to be made
# Skip the first 13 months by default: they were used to calculate the time series features.
train_start_index = 28 if use_toy_data else 14
train_final_index = 28 #set to 28 usually: makes validation set to be 20% of the non-test data (threshold is surely debatable)

data = data[data['date_block_num'] >= train_start_index]

non_feature_cols = ['item_cnt_month', 'ID']
train_rows = data.date_block_num <= train_final_index
val_rows = (data.date_block_num > train_final_index) & (data.date_block_num <= 33)

X_train = data[train_rows].drop(non_feature_cols, axis=1)
y_train = data[train_rows]['item_cnt_month'].values
X_val = data[val_rows].drop(non_feature_cols, axis=1)
y_val = data[val_rows]['item_cnt_month'].values
X_test = data[data.date_block_num == 34].drop(non_feature_cols, axis=1)

# Re-key the test features onto `test` so rows follow the submission file order.
# `test` only has date_block_num if the cell that adds it was run in this session,
# so join on whichever of the key columns it actually has.
merge_keys = [key for key in ['date_block_num', 'item_id', 'shop_id'] if key in test.columns]
X_test = pd.merge(test, X_test, on=merge_keys).drop(['ID'], axis=1)
# The merge puts test's columns first (shop_id, item_id, date_block_num), which differs
# from the training column order. Restore it, because the features are later converted
# to bare numpy arrays and matched to the model by position.
X_test = X_test[X_train.columns]
del data

Features Normalised

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Identifier columns are excluded from the scaled feature set.
LinRegFeaturesToDrop= ['date_block_num', 'shop_id', 'item_id', 'item_category_id']
# feature_range is given as a tuple: recent scikit-learn releases validate this parameter's type.
scaler =  MinMaxScaler(feature_range=(-1, 1))

# Fit the scaler on the training features only, then apply the same scaling to validation and test.
X_train_LinReg = scaler.fit_transform(X_train.drop(LinRegFeaturesToDrop, axis = 1))
X_val_LinReg = scaler.transform(X_val.drop(LinRegFeaturesToDrop, axis = 1))
X_test_LinReg = scaler.transform(X_test.drop(LinRegFeaturesToDrop, axis = 1))
feature_names_LinReg = X_train.drop(LinRegFeaturesToDrop, axis = 1).columns

Features in numpy

In [None]:
# Keep the column names for the feature-importance plot; the arrays below no longer carry them.
feature_names = X_train.columns
# Convert each split to a float32 array and drop the DataFrame right away to free memory.
X_train_np = X_train.to_numpy(dtype = np.float32)
del X_train
X_val_np = X_val.to_numpy(dtype = np.float32)
del X_val
X_test_np = X_test.to_numpy(dtype = np.float32)
del X_test
# Size of the training array in megabytes.
X_train_np.nbytes/(10**6)

**Model Fit & Validate**

LightGBM

In [None]:
# False makes the next cell fit a new LightGBM model instead of unpickling a saved one.
load_model = False

In [None]:
# `import sklearn` alone does not guarantee that the metrics submodule is loaded.
import sklearn.metrics

model_name = '/LGBM_submission_model' 
X_train_model, X_val_model, X_test_model = X_train_np, X_val_np, X_test_np #X_train_np, X_val_np, X_test_np
y_train_model = y_train

start = time.time()
if load_model:
  # NOTE: unpickling executes code stored in the file - only load model files you created yourself.
  with open(file_dir_path + model_name + '.sav', 'rb') as model_file:
    model = pickle.load(model_file)
else:
  model = LGBMRegressor(n_estimators=50, max_depth = 8, importance_type = 'gain')
  # NOTE(review): the `verbose` fit argument is not accepted by every LightGBM release -
  # confirm against the installed version.
  model.fit(X_train_model, y_train_model, eval_set=[(X_val_model, y_val), (X_train_model, y_train_model)], verbose=0)

# Predictions are clipped to [0, 20], the same range the target was clipped to.
y_pred_train = model.predict(X_train_model).clip(0,20)
y_pred_val = model.predict(X_val_model).clip(0,20)
y_pred_test = model.predict(X_test_model).clip(0,20)
train_score, val_score = sklearn.metrics.r2_score(y_train_model, y_pred_train), sklearn.metrics.r2_score(y_val, y_pred_val)
train_rmse, val_rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_train_model, y_pred_train)), np.sqrt(sklearn.metrics.mean_squared_error(y_val, y_pred_val))
print('R^2 train_score is ' + str(train_score) + ' R^2 val_score is ' + str(val_score))
print('RMSE train_score is ' + str(train_rmse) + ' RMSE val_score is ' + str(val_rmse))
print(time.time()-start)

# Plot the metric recorded on the eval sets during fitting.
lgb.plot_metric(model)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw three diagnostic plots for an estimator: score against training-set
    size, fit time against training-set size, and score against fit time.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.

    title : string
        Title of the first (learning-curve) plot.

    X : array-like
        Feature data handed to ``learning_curve``.

    y : array-like
        Target data handed to ``learning_curve``.

    axes : sequence of 3 axes, optional (default=None)
        Axes to draw on. When None, a new 1x3 figure of size (20, 5) is
        created.

    ylim : tuple (ymin, ymax), optional
        Y-axis limits of the first plot.

    cv : optional
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.

    n_jobs : int or None, optional (default=None)
        Parallelism setting, forwarded unchanged to ``learning_curve``.

    train_sizes : array-like
        Training-set sizes to evaluate, forwarded unchanged to
        ``learning_curve`` (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can show or save the
    figure.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        return_times=True)

    def _mean_and_std(per_split_values):
        # Aggregate across the cross-validation splits (axis 1).
        return np.mean(per_split_values, axis=1), np.std(per_split_values, axis=1)

    train_mean, train_std = _mean_and_std(train_scores)
    test_mean, test_std = _mean_and_std(test_scores)
    time_mean, time_std = _mean_and_std(fit_times)

    # Learning curve: mean score with a one-standard-deviation band
    score_ax.grid()
    score_ax.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                          alpha=0.1, color="r")
    score_ax.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                          alpha=0.1, color="g")
    score_ax.plot(sizes, train_mean, 'o-', color="r", label="Training score")
    score_ax.plot(sizes, test_mean, 'o-', color="g",
                  label="Cross-validation score")
    score_ax.legend(loc="best")

    # Number of training samples against fit time
    scaling_ax = axes[1]
    scaling_ax.grid()
    scaling_ax.plot(sizes, time_mean, 'o-')
    scaling_ax.fill_between(sizes, time_mean - time_std, time_mean + time_std,
                            alpha=0.1)
    scaling_ax.set_xlabel("Training examples")
    scaling_ax.set_ylabel("fit_times")
    scaling_ax.set_title("Scalability of the model")

    # Fit time against cross-validation score
    performance_ax = axes[2]
    performance_ax.grid()
    performance_ax.plot(time_mean, test_mean, 'o-')
    performance_ax.fill_between(time_mean, test_mean - test_std,
                                test_mean + test_std, alpha=0.1)
    performance_ax.set_xlabel("fit_times")
    performance_ax.set_ylabel("Score")
    performance_ax.set_title("Performance of the model")

    return plt


fig, axes = plt.subplots(3, 2, figsize=(10, 15))

# Demo data for the two classifiers below: the scikit-learn digits dataset.
# (X and y were previously undefined in this cell.)
X, y = load_digits(return_X_y=True)

title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, axes=axes[:, 0], ylim=(0.7, 1.01),
                    cv=cv, n_jobs=4)

title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, X, y, axes=axes[:, 1], ylim=(0.7, 1.01),
                    cv=cv, n_jobs=4)

In [None]:
import matplotlib.pyplot as plt
# Plot feature importance - Results Visualization
feature_importance = model.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important feature is drawn at the top of the horizontal bar chart.
sorted_idx = np.argsort(feature_importance)
# Bar positions, offset by 0.5 from the integer ticks.
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(13,30)) 
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, feature_names[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.tick_params(axis='y', which='major', labelsize = 13)
plt.show()
#plt.savefig('gbt_feature_importance.png')

In [None]:
# Quick sanity check: average of the clipped test-set predictions.
np.mean(y_pred_test)

Submission prep

In [None]:
# Build the submission table. The IDs come from the ID column of test.csv rather than
# the positional row index, so the file stays correct even if `test` has been
# re-indexed or filtered. X_test was merged onto `test`, so predictions follow its row order.
submission = pd.DataFrame({
    "ID": test['ID'].values,
    "item_cnt_month": np.ravel(y_pred_test)  # flatten in case the model returns an (n, 1) array
})
submission.head()

Save submission

In [None]:
#save submission file
submission.to_csv(file_dir_path + model_name + '_submission.csv', index=False)

# save the model to disk; the context manager closes the file even if pickling fails
with open(file_dir_path + model_name + '.sav', 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
import os
# Show the current working directory, where the relative-path CSV in the next cell is written.
os.getcwd()

In [None]:
# Write a copy of the submission to the current working directory.
submission.to_csv('testing_submission.csv', index=False)


NN

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.metrics import RootMeanSquaredError

# The network is trained on the min-max scaled features (identifier columns dropped), not the raw numpy arrays.
X_train_model, X_val_model, X_test_model = X_train_LinReg, X_val_LinReg, X_test_LinReg
y_train_model = y_train
# Disabled experiment (kept as a string literal): PCA keeping 95% of the variance.
'''
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.95)
pca.fit(X_train_model)
X_train_model, X_val_model, X_test_model = pca.transform(X_train_model), pca.transform(X_val_model), pca.transform(X_test_model)
'''

#subsample
# Disabled experiment (kept as a string literal): train on a random subsample of the rows.
'''
sampling_perc = 0.1 #percentage to sample out of total population of data points. 10**-4 takes 18 secs
random_indices = np.random.choice(len(X_train_model), int(len(X_train_model)*sampling_perc), replace = False)
X_train_model, y_train_model = X_train_model[random_indices], y_train[random_indices]
'''

#Train
# Three Dense layers, all with linear activations. Note that this rebinds `model`,
# replacing the LightGBM model fitted earlier.
model = Sequential()
model.add(Dense(20, input_dim=X_train_model.shape[1], activation='linear'))
model.add(Dense(20, activation='linear'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer=Adam(learning_rate=0.01), metrics=[RootMeanSquaredError(name='rmse')])
model_history = model.fit(X_train_model, y_train_model, epochs=2, batch_size=3200, validation_data=(X_val_model, y_val), shuffle = True)

#Predict
# Predictions are clipped to [0, 20], the same range the target was clipped to.
y_pred_train, y_pred_val, y_pred_test =  model.predict(X_train_model).clip(0,20), model.predict(X_val_model).clip(0,20), model.predict(X_test_model).clip(0,20)
train_score, val_score = sklearn.metrics.r2_score(y_train_model, y_pred_train), sklearn.metrics.r2_score(y_val, y_pred_val)
train_rmse, val_rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_train_model, y_pred_train)), np.sqrt(sklearn.metrics.mean_squared_error(y_val, y_pred_val))
print('R^2 train_score is ' + str(train_score) + ' R^2 val_score is ' + str(val_score))
print('RMSE train_score is ' + str(train_rmse) + ' RMSE val_score is ' + str(val_rmse))
# NOTE: plotKerasLearningCurves is defined in the cell after this one - run that cell first.
plotKerasLearningCurves(model_history)

In [None]:
def plotKerasLearningCurves(history):
  """Plot one train/validation curve per quantity recorded during training.

  Parameters
  ----------
  history : object with a ``history`` dict attribute
      Maps a quantity name (e.g. 'loss') to its per-epoch values; validation
      values are expected under the same name prefixed with 'val_'.

  One figure is drawn for each non-'val_' entry. Any extra metrics come
  first and 'loss' comes last. The validation curve is added only when the
  matching 'val_' entry exists, so a run without validation data still plots.
  """
  curves = history.history
  train_names = [name for name in curves if not name.startswith('val_')]
  # Stable sort: metrics keep their recorded order, 'loss' is moved to the end.
  train_names.sort(key=lambda name: name == 'loss')
  for name in train_names:
    plt.plot(curves[name])
    legend_labels = ['train']
    val_name = 'val_' + name
    if val_name in curves:
      plt.plot(curves[val_name])
      legend_labels.append('test')
    plt.title('model ' + name)
    plt.ylabel(name)
    plt.xlabel('epoch')
    plt.legend(legend_labels, loc='upper left')
    plt.show()