<a href="https://colab.research.google.com/github/FutureAndroidLearn/coursera-kaggle-project/blob/master/04_Modelling_lag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [0]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

from itertools import product
import time

def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [0]:
# Import data from csv files
sales_data = pd.read_csv('/content/drive/My Drive/coursera-kaggle-project/df_train_f1.zip') # Sales that include only shops included in the test data
shops_data = pd.read_csv('/content/drive/My Drive/coursera-kaggle-project/shops.csv')
categories_data = pd.read_csv('/content/drive/My Drive/coursera-kaggle-project/item_categories.csv')
items_data = pd.read_csv('/content/drive/My Drive/coursera-kaggle-project/items.csv')
test_data = pd.read_csv('/content/drive/My Drive/coursera-kaggle-project/test.csv').set_index('ID')

In [4]:
test_data.head()

Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [0]:
# Conditions 
df = sales_data[sales_data['date_block_num'] > 5]  # Max lag 6 months, first 6 months are not considered
items_up = sales_data[sales_data['date_block_num']>=28].item_id.unique() # Use products solds in the last 5 months
df = df[df['item_id'].isin(items_up)] 

In [6]:
df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,months_open
483231,22.07.2013,6,28,51,249.0,1.0,34
483232,27.07.2013,6,28,51,249.0,1.0,34
483233,28.07.2013,6,28,59,249.0,1.0,34
483234,01.07.2013,6,28,53,299.0,1.0,34
483235,25.07.2013,6,28,45,299.0,1.0,34


In [0]:
## Initial Feature Matrix

# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in df['date_block_num'].unique():
    cur_shops = df.loc[df['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = df.loc[df['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates
gb = df.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'})

gb.rename({'item_cnt_day':'target'}, axis = 1, inplace =True)
all_data = pd.merge(grid, gb, how='left', on=index_cols)

all_data['target'] = (all_data['target']
                            .fillna(0)
                            .clip(0,20))

## Integrate test data with train data

test_data['date_block_num'] = 34
all_data = pd.concat([all_data, test_data], ignore_index=True, sort=False, keys=index_cols)
all_data.fillna(0, inplace=True) # 34 month

In [8]:
all_data.tail()

Unnamed: 0,shop_id,item_id,date_block_num,target
4706615,45,18454,34,0.0
4706616,45,16188,34,0.0
4706617,45,15757,34,0.0
4706618,45,19648,34,0.0
4706619,45,969,34,0.0


In [0]:
## Other Features

# Same as above but with shop-month aggregates
gb = sales_data.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':'sum'})
gb.rename({'item_cnt_day':'target_shop'}, axis = 1, inplace =True)
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales_data.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':'sum'})
gb.rename({'item_cnt_day':'target_item'}, axis = 1, inplace =True)
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb 
gc.collect();

# List of columns that we will use to create lags
#target, traget_item, taget_shop
cols_to_rename = list(all_data.columns.difference(index_cols))

In [10]:
all_data.tail()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item
4706615,45,18454,34,0.0,0.0,0.0
4706616,45,16188,34,0.0,0.0,0.0
4706617,45,15757,34,0.0,0.0,0.0
4706618,45,19648,34,0.0,0.0,0.0
4706619,45,969,34,0.0,0.0,0.0


In [11]:
## Time lag features

shift_range = [1, 2, 3, 4, 5, 6]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = items_data[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [12]:
all_data.tail()

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_shop_lag_2,target_lag_3,target_item_lag_3,target_shop_lag_3,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_6,target_item_lag_6,target_shop_lag_6,item_category_id
4706615,45,18454,34,0.0,0.0,0.0,1.0,2.0,702.0,0.0,1.0,654.0,0.0,3.0,710.0,0.0,11.0,675.0,0.0,18.0,622.0,0.0,24.0,762.0,55
4706616,45,16188,34,0.0,0.0,0.0,0.0,1.0,702.0,0.0,3.0,654.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64
4706617,45,15757,34,0.0,0.0,0.0,0.0,5.0,702.0,0.0,3.0,654.0,0.0,4.0,710.0,0.0,4.0,675.0,0.0,8.0,622.0,0.0,10.0,762.0,55
4706618,45,19648,34,0.0,0.0,0.0,0.0,2.0,702.0,0.0,3.0,654.0,0.0,6.0,710.0,0.0,2.0,675.0,0.0,4.0,622.0,0.0,4.0,762.0,40
4706619,45,969,34,0.0,0.0,0.0,0.0,3.0,702.0,0.0,5.0,654.0,0.0,1.0,710.0,0.0,2.0,675.0,0.0,1.0,622.0,0.0,2.0,762.0,37


In [0]:
dates = all_data['date_block_num']

last_block = dates.max()

dates_train = dates[dates <  last_block-1]
dates_val  = dates[dates == last_block-1]
dates_test  = dates[dates == last_block]

X_train = all_data.loc[dates <  last_block-1].drop(to_drop_cols, axis=1)
X_val =  all_data.loc[dates == last_block-1].drop(to_drop_cols, axis=1)
X_test = all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)

y_train = all_data.loc[dates <  last_block-1, 'target'].values
y_val =  all_data.loc[dates == last_block-1, 'target'].values
y_test =  all_data.loc[dates == last_block, 'target'].values

In [14]:
ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=50,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_val, y_val)], 
    verbose=True, 
    early_stopping_rounds = 5)

time.time() - ts

[0]	validation_0-rmse:1.33892	validation_1-rmse:1.0621
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:1.29378	validation_1-rmse:1.02717
[2]	validation_0-rmse:1.25754	validation_1-rmse:0.997743
[3]	validation_0-rmse:1.22063	validation_1-rmse:0.973777
[4]	validation_0-rmse:1.18927	validation_1-rmse:0.953998
[5]	validation_0-rmse:1.16353	validation_1-rmse:0.934426
[6]	validation_0-rmse:1.14321	validation_1-rmse:0.919648
[7]	validation_0-rmse:1.12442	validation_1-rmse:0.906617
[8]	validation_0-rmse:1.11084	validation_1-rmse:0.895506
[9]	validation_0-rmse:1.10232	validation_1-rmse:0.888633
[10]	validation_0-rmse:1.09126	validation_1-rmse:0.881992
[11]	validation_0-rmse:1.08092	validation_1-rmse:0.875128
[12]	validation_0-rmse:1.07337	validation_1-rmse:0.870822
[13]	validation_0-rmse:1.0658	validation_1-rmse:0.865596
[14]	validation_0-rmse:1.06031	validation_1-

1138.824010848999

In [20]:
pred_xgb = model.predict(X_val)

rmse_cl = np.sqrt(mean_squared_error(y_val, pred_xgb))
print(rmse_cl)

0.8416248


In [0]:
y_test = model.predict(X_test)

submission = pd.DataFrame({
    "ID": test_data.index, 
    "item_cnt_month": y_test
})
submission.to_csv('/content/drive/My Drive/coursera-kaggle-project/xgb_submission.csv', index=False)