In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

from math import sqrt

from IPython.core import display as ICD

%matplotlib inline 

In [45]:
transactions_result = pd.read_csv('transactions_result_new1.csv')

In [46]:
transactions_result.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [47]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df.head()

In [48]:
downcast_dtypes(transactions_result)

Unnamed: 0,item_categories,item_id,shop_id,ID,date_block_num,item_price,target,item_price_minmax,item_price_std,target_shop_categories,...,target_item_categories_lag_26,target_shop_lag_26,target_lag_27,target_item_lag_27,target_item_categories_lag_27,target_shop_lag_27,target_lag_28,target_item_lag_28,target_item_categories_lag_28,target_shop_lag_28
0,19,5037,5,0,0,1960.580444,0,0.03311,0.519336,0,...,0,0,0,0,0,0,0,0,0,0
1,55,5320,5,1,0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,5233,5,2,0,844.515991,0,0.014257,-0.098206,0,...,0,0,0,0,0,0,0,0,0,0
3,23,5232,5,3,0,792.52771,0,0.013379,-0.126972,0,...,0,0,0,0,0,0,0,0,0,0
4,20,5268,5,4,0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
transactions_group = transactions_result.groupby(['shop_id','date_block_num'])['target'].sum().to_frame()

In [50]:
table = pd.pivot_table(transactions_group, values='target', index=['shop_id'],
                    columns=['date_block_num'])
table.drop([34], axis = 1, inplace = True)

In [51]:
colum = table.columns
for column in colum:
    table.rename(columns={column: 'data'+str(column)}, inplace=True)

In [52]:
table.head()

date_block_num,data0,data1,data2,data3,data4,data5,data6,data7,data8,data9,...,data24,data25,data26,data27,data28,data29,data30,data31,data32,data33
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,248,135,237,168,159,255,242,257,403,345,...,806,765,609,672,732,704,718,877,785,715
3,187,221,272,180,213,266,260,317,469,337,...,788,581,622,656,634,604,505,627,649,606
4,459,496,563,63,303,387,415,521,652,594,...,956,788,792,763,771,700,771,889,705,811
5,0,187,362,252,296,403,359,447,446,492,...,1073,837,881,791,894,856,872,1218,1041,1032
6,867,1004,1299,751,672,1054,962,1170,1425,1420,...,1768,1788,1538,1573,1272,1305,1286,1455,1607,1733


In [53]:
group_items_shop = transactions_result.groupby(['shop_id','item_id'])['target'].sum().to_frame()

In [54]:
table_items_shop = pd.pivot_table(group_items_shop, values='target', index=['shop_id'],
                    columns=['item_id'])

In [55]:
table_items_shop.head()

item_id,30,31,32,33,38,42,45,51,53,57,...,22118,22137,22139,22145,22154,22162,22163,22164,22166,22167
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,4,8,11,11,0,0,0,0,1,0,...,0,0,0,2,0,2,0,0,0,18
3,20,9,15,7,0,0,0,0,0,0,...,1,0,0,1,0,4,0,7,0,5
4,11,11,23,6,2,0,0,1,0,0,...,0,0,6,1,0,4,0,3,0,4
5,19,11,30,17,0,0,0,0,0,0,...,0,0,2,4,0,12,2,10,0,4
6,51,48,58,42,1,4,3,10,10,1,...,2,0,6,7,2,10,1,14,0,24


In [56]:
table = pd.concat([table, table_items_shop], axis = 1)

In [57]:
del table_items_shop
table.head()

Unnamed: 0_level_0,data0,data1,data2,data3,data4,data5,data6,data7,data8,data9,...,22118,22137,22139,22145,22154,22162,22163,22164,22166,22167
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,248,135,237,168,159,255,242,257,403,345,...,0,0,0,2,0,2,0,0,0,18
3,187,221,272,180,213,266,260,317,469,337,...,1,0,0,1,0,4,0,7,0,5
4,459,496,563,63,303,387,415,521,652,594,...,0,0,6,1,0,4,0,3,0,4
5,0,187,362,252,296,403,359,447,446,492,...,0,0,2,4,0,12,2,10,0,4
6,867,1004,1299,751,672,1054,962,1170,1425,1420,...,2,0,6,7,2,10,1,14,0,24


In [58]:
X = table.values
X = normalize(X)

kmeans = KMeans(n_clusters=2, random_state=241)
kmeans.fit(X)
labels = pd.DataFrame(kmeans.labels_, columns = ['labels'])

In [59]:
labels['labels'].value_counts()

1    35
0     7
Name: labels, dtype: int64

In [60]:
table['cluster_shop'] = kmeans.labels_
map_shop_cluster = table['cluster_shop']

In [61]:
transactions_result['cluster_shop'] = transactions_result['shop_id'].map(map_shop_cluster)
downcast_dtypes(transactions_result)

Unnamed: 0,item_categories,item_id,shop_id,ID,date_block_num,item_price,target,item_price_minmax,item_price_std,target_shop_categories,...,target_shop_lag_26,target_lag_27,target_item_lag_27,target_item_categories_lag_27,target_shop_lag_27,target_lag_28,target_item_lag_28,target_item_categories_lag_28,target_shop_lag_28,cluster_shop
0,19,5037,5,0,0,1960.580444,0,0.03311,0.519336,0,...,0,0,0,0,0,0,0,0,0,1
1,55,5320,5,1,0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1
2,19,5233,5,2,0,844.515991,0,0.014257,-0.098206,0,...,0,0,0,0,0,0,0,0,0,1
3,23,5232,5,3,0,792.52771,0,0.013379,-0.126972,0,...,0,0,0,0,0,0,0,0,0,1
4,20,5268,5,4,0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,1


In [62]:
transactions_group_item = transactions_result.groupby(['item_id','date_block_num'])['target'].sum().to_frame()

In [63]:
transactions_group_item.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,date_block_num,Unnamed: 2_level_1
30,0,0
30,1,599
30,2,394
30,3,105
30,4,45


In [64]:
table_item = pd.pivot_table(transactions_group_item, values='target', index=['item_id'],
                    columns=['date_block_num'], margins_name = ['date_block_num'])
table_item.drop([34], axis = 1, inplace = True)

In [65]:
table_item.head()

date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30,0,599,394,105,45,46,30,16,12,20,...,12,13,4,4,5,4,4,6,2,1
31,0,466,155,45,25,21,13,20,18,20,...,22,11,10,13,4,10,6,52,9,18
32,225,156,143,80,53,65,79,65,45,46,...,35,25,34,19,19,25,21,30,19,22
33,42,29,26,13,11,40,38,28,37,30,...,17,21,20,12,11,11,15,14,16,16
38,0,0,0,0,0,0,0,0,0,0,...,3,4,1,0,3,2,4,7,2,0


In [66]:
item_shop = transactions_result.groupby(['item_id','shop_id'])['target'].sum().to_frame()

In [67]:
item_shop.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,shop_id,Unnamed: 2_level_1
30,2,4
30,3,20
30,4,11
30,5,19
30,6,51


In [68]:
table_item_shop = pd.pivot_table(item_shop, values='target', index=['item_id'],
                    columns=['shop_id'])

In [69]:
colum = table_item_shop.columns

In [70]:
for column in colum:
    table_item_shop.rename(columns={column: 'id'+str(column)}, inplace=True)

In [71]:
table_item_shop.head()

shop_id,id2,id3,id4,id5,id6,id7,id10,id12,id14,id15,...,id48,id49,id50,id52,id53,id55,id56,id57,id58,id59
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30,4,20,11,19,51,23,9,17,13,33,...,2,0,56,15,23,0,27,50,29,36
31,8,9,11,11,48,15,7,13,15,28,...,4,5,36,11,13,0,24,24,9,20
32,11,15,23,30,58,52,8,1,19,33,...,5,5,49,28,32,0,62,77,38,20
33,11,7,6,17,42,15,7,16,9,19,...,3,3,36,10,4,0,13,35,9,8
38,0,0,2,0,1,0,1,2,0,1,...,1,0,0,1,0,0,2,1,2,1


In [72]:
item_price = transactions_result.groupby(['item_id','date_block_num'])['item_price_minmax'].mean().to_frame()

In [73]:
item_price.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_price_minmax
item_id,date_block_num,Unnamed: 2_level_1
30,0,0.004426
30,1,0.006596
30,2,0.006628
30,3,0.006671
30,4,0.006537


In [74]:
table_item_price = pd.pivot_table(item_price, values='item_price_minmax', index=['item_id'],
                    columns=['date_block_num'])
table_item_price.drop([34], axis = 1, inplace = True)

In [75]:
colum = table_item_price.columns
for column in colum:
    table_item_price.rename(columns={column: 'data'+str(column)}, inplace=True)

In [76]:
table_item_price.head()

date_block_num,data0,data1,data2,data3,data4,data5,data6,data7,data8,data9,...,data24,data25,data26,data27,data28,data29,data30,data31,data32,data33
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30,0.004426,0.006596,0.006628,0.006671,0.006537,0.006731,0.006619,0.006627,0.006731,0.003473,...,0.002846,0.002846,0.002846,0.002846,0.002846,0.002171,0.002171,0.002171,0.00201,0.002171
31,0.008748,0.011384,0.011607,0.011557,0.011799,0.011799,0.011799,0.011493,0.011799,0.009363,...,0.006784,0.006799,0.007083,0.006935,0.00677,0.006209,0.007013,0.003595,0.00328,0.00649
32,0.005801,0.005803,0.005791,0.005846,0.005862,0.005716,0.005846,0.005812,0.005715,0.003381,...,0.002413,0.002462,0.002472,0.002508,0.002508,0.002508,0.002508,0.002501,0.002508,0.002508
33,0.008339,0.008282,0.008231,0.008253,0.008421,0.003363,0.003263,0.003328,0.003338,0.003278,...,0.003353,0.003353,0.00326,0.003353,0.003353,0.003353,0.00326,0.003353,0.003305,0.003301
38,0.039401,0.039401,0.039401,0.039401,0.039401,0.039401,0.039401,0.039401,0.039401,0.039401,...,0.038236,0.040516,0.040516,0.039401,0.040516,0.040516,0.037704,0.037067,0.038826,0.039401


In [77]:
table_item = pd.concat([table_item, table_item_shop, table_item_price], axis = 1)

In [78]:
del table_item_shop
del table_item_price
table_item.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,data24,data25,data26,data27,data28,data29,data30,data31,data32,data33
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30,0,599,394,105,45,46,30,16,12,20,...,0.002846,0.002846,0.002846,0.002846,0.002846,0.002171,0.002171,0.002171,0.00201,0.002171
31,0,466,155,45,25,21,13,20,18,20,...,0.006784,0.006799,0.007083,0.006935,0.00677,0.006209,0.007013,0.003595,0.00328,0.00649
32,225,156,143,80,53,65,79,65,45,46,...,0.002413,0.002462,0.002472,0.002508,0.002508,0.002508,0.002508,0.002501,0.002508,0.002508
33,42,29,26,13,11,40,38,28,37,30,...,0.003353,0.003353,0.00326,0.003353,0.003353,0.003353,0.00326,0.003353,0.003305,0.003301
38,0,0,0,0,0,0,0,0,0,0,...,0.038236,0.040516,0.040516,0.039401,0.040516,0.040516,0.037704,0.037067,0.038826,0.039401


In [79]:
X = table_item.iloc[:,1:]
X = normalize(X)

kmeans = KMeans(n_clusters=3, random_state=241)
kmeans.fit(X)
labels = pd.DataFrame(kmeans.labels_, columns = ['labels'])

In [80]:
labels['labels'].value_counts()

2    2773
1    1928
0     399
Name: labels, dtype: int64

In [81]:
table_item['cluster_item'] = kmeans.labels_

In [82]:
map_cluster = table_item['cluster_item']

In [83]:
transactions_result['cluster_item'] = transactions_result['item_id'].map(map_cluster)
downcast_dtypes(transactions_result)

Unnamed: 0,item_categories,item_id,shop_id,ID,date_block_num,item_price,target,item_price_minmax,item_price_std,target_shop_categories,...,target_lag_27,target_item_lag_27,target_item_categories_lag_27,target_shop_lag_27,target_lag_28,target_item_lag_28,target_item_categories_lag_28,target_shop_lag_28,cluster_shop,cluster_item
0,19,5037,5,0,0,1960.580444,0,0.03311,0.519336,0,...,0,0,0,0,0,0,0,0,1,2
1,55,5320,5,1,0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,1,1
2,19,5233,5,2,0,844.515991,0,0.014257,-0.098206,0,...,0,0,0,0,0,0,0,0,1,1
3,23,5232,5,3,0,792.52771,0,0.013379,-0.126972,0,...,0,0,0,0,0,0,0,0,1,1
4,20,5268,5,4,0,0.0,0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,1,1


In [84]:
cols = transactions_result.columns.tolist() 

In [85]:
print(cols)

['item_categories', 'item_id', 'shop_id', 'ID', 'date_block_num', 'item_price', 'target', 'item_price_minmax', 'item_price_std', 'target_shop_categories', 'target_shop', 'target_item', 'target_item_categories', 'item_target_enc', 'target_item_categories_lag_1', 'target_shop_lag_1', 'target_lag_2', 'target_item_lag_2', 'target_item_categories_lag_2', 'target_shop_lag_2', 'target_lag_3', 'target_item_lag_3', 'target_item_categories_lag_3', 'target_shop_lag_3', 'target_lag_4', 'target_item_lag_4', 'target_item_categories_lag_4', 'target_shop_lag_4', 'target_lag_5', 'target_item_lag_5', 'target_item_categories_lag_5', 'target_shop_lag_5', 'target_lag_12', 'target_item_lag_12', 'target_item_categories_lag_12', 'target_shop_lag_12', 'target_lag_13', 'target_item_lag_13', 'target_item_categories_lag_13', 'target_shop_lag_13', 'target_lag_14', 'target_item_lag_14', 'target_item_categories_lag_14', 'target_shop_lag_14', 'target_lag_15', 'target_item_lag_15', 'target_item_categories_lag_15', 'ta

In [86]:
cols = cols[3:4] + cols[2:3] + cols[1:2]+ cols[4:5] + cols[0:1] + cols[7:8] + cols[9:] + cols[6:7]

In [87]:
df_train = transactions_result[cols]
downcast_dtypes(df_train)

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_categories,item_price_minmax,target_shop_categories,target_shop,target_item,target_item_categories,...,target_item_lag_27,target_item_categories_lag_27,target_shop_lag_27,target_lag_28,target_item_lag_28,target_item_categories_lag_28,target_shop_lag_28,cluster_shop,cluster_item,target
0,0,5,5037,0,19,0.03311,0,0,0,1013,...,0,0,0,0,0,0,0,1,2,0
1,1,5,5320,0,55,0.0,0,0,0,3237,...,0,0,0,0,0,0,0,1,1,0
2,2,5,5233,0,19,0.014257,0,0,0,1013,...,0,0,0,0,0,0,0,1,1,0
3,3,5,5232,0,23,0.013379,0,0,0,944,...,0,0,0,0,0,0,0,1,1,0
4,4,5,5268,0,20,0.0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [88]:
rng = np.random.RandomState(1)
regr_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=20, random_state=rng)

y_pred_list = []
shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

for shop_id in shop_list:
    for clus_id in clus_list:
        df_train_pred = df_train.loc[(df_train['shop_id'] == shop_id)&(df_train['cluster_item'] == clus_id)]
        df_train_X = df_train_pred.drop(['target'],axis = 1)
    
        X_train = df_train_X.loc[(df_train_X['date_block_num']>12)&(df_train_X['date_block_num']<33)].values
        #X_train = df_train_X.loc[df_train_X['date_block_num']<33].values
        X_val = df_train_X.loc[df_train_X['date_block_num']==33].values
        y_train = df_train_pred.loc[(df_train_pred['date_block_num']>12)&(df_train_pred['date_block_num']<33)]['target'].values
        #y_train = df_train_pred.loc[df_train_pred['date_block_num']<33]['target'].values
        y_val = df_train_pred.loc[df_train_pred['date_block_num']==33]['target'].values
    
        regr_model.fit(X_train, y_train)
    
        y_pred_shop_id = regr_model.predict(X_val)
        print(f'Test rmse for {shop_id} AdaBoost is {sqrt(mean_squared_error(y_val, y_pred_shop_id))}')
        y_pred_list.append(y_pred_shop_id)

Test rmse for 5 AdaBoost is 0.5178655710963765
Test rmse for 5 AdaBoost is 0.6476014658769736
Test rmse for 5 AdaBoost is 0.0
Test rmse for 4 AdaBoost is 0.5238433944310503
Test rmse for 4 AdaBoost is 0.539694707981558
Test rmse for 4 AdaBoost is 0.0
Test rmse for 6 AdaBoost is 0.5759198810120463
Test rmse for 6 AdaBoost is 0.9615149705509138
Test rmse for 6 AdaBoost is 0.0
Test rmse for 3 AdaBoost is 0.41140414191466773
Test rmse for 3 AdaBoost is 0.4873501499363599
Test rmse for 3 AdaBoost is 0.0
Test rmse for 2 AdaBoost is 0.44836973398290375
Test rmse for 2 AdaBoost is 0.6365528731192618
Test rmse for 2 AdaBoost is 0.0
Test rmse for 7 AdaBoost is 0.6152510543791232
Test rmse for 7 AdaBoost is 0.7646047425391842
Test rmse for 7 AdaBoost is 0.0
Test rmse for 10 AdaBoost is 0.44138636924322844
Test rmse for 10 AdaBoost is 0.42244767660709054
Test rmse for 10 AdaBoost is 0.0
Test rmse for 12 AdaBoost is 0.6046085079063109
Test rmse for 12 AdaBoost is 2.526771024000957
Test rmse for 12 

In [89]:
test = pd.read_csv('test.csv')
test['cluster_item'] = test['item_id'].map(map_cluster)
test_clus_list = []


i = 0
for shop_id in shop_list:
    for clus_id in clus_list:
        test_clus = test.loc[(test['shop_id']==shop_id)&(test['cluster_item']==clus_id)]
        test_clus['y_pred'] = y_pred_list[i]
        test_clus_list.append(test_clus)
        i += 1
    
for i in range(len(test_clus_list)):
    test_clus_list[i] = np.where(test_clus_list[i]<0, 0, test_clus_list[i])

y_pred = np.vstack(test_clus_list)
y_pred = pd.DataFrame(y_pred, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
y_pred = y_pred.sort_values(by=['ID'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


NameError: name 'y_pred_lr' is not defined

In [91]:
y_pred.head()

Unnamed: 0,ID,shop_id,item_id,cluster_item,y_pred
0,0.0,5.0,5037.0,2.0,0.51665
2773,1.0,5.0,5320.0,1.0,0.028176
2774,2.0,5.0,5233.0,1.0,1.275862
2775,3.0,5.0,5232.0,1.0,1.062743
2776,4.0,5.0,5268.0,1.0,0.028176


In [92]:
y_val = df_train.loc[df_train['date_block_num']==33]['target'].values
print('Test rmse for LightGBM is %f' % sqrt(mean_squared_error(y_val, y_pred['y_pred'])))

Test rmse for LightGBM is 0.834915


In [None]:
rng = np.random.RandomState(1)
regr_model_all_data = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=20, random_state=rng)

df_train_pred = df_train.loc[(df_train['shop_id'] == shop_id)&(df_train['cluster_item'] == clus_id)]
df_train_X = df_train_pred.drop(['target'],axis = 1)
    
X_train = df_train_X.loc[(df_train_X['date_block_num']>12)&(df_train_X['date_block_num']<33)].values
#X_train = df_train_X.loc[df_train_X['date_block_num']<33].values
X_val = df_train_X.loc[df_train_X['date_block_num']==33].values
y_train = df_train_pred.loc[(df_train_pred['date_block_num']>12)&(df_train_pred['date_block_num']<33)]['target'].values
#y_train = df_train_pred.loc[df_train_pred['date_block_num']<33]['target'].values
y_val = df_train_pred.loc[df_train_pred['date_block_num']==33]['target'].values

regr_model_all_data.fit(X_train, y_train)
