In [2]:
__imp

The libraries have been loaded!


In [98]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
import math

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np

# Sklearn Pipelines

In [4]:
class clean_train(BaseEstimator, TransformerMixin):
    """Takes in train dataframe, cleans train data frame """

    def __init__(self):
        pass

    def clean_train(self,df):
        df['item_price_whole'] = df['item_price'].apply(lambda x : math.modf(x)[1])
        df['item_price_fraction'] = df['item_price'].apply(lambda x : math.modf(x)[0])*100
        df['date_new']=pd.to_datetime(df['date'], format = "%d.%m.%Y")
        df = df.drop_duplicates()
        df['Month'] = df['date_new'].dt.month
        df['Year'] = df['date_new'].dt.year
        df['Week']=df['date_new'].dt.week       
        return df  
    
    def transform(self, df, y=None):
        """ Transform function to apply clean_digits function above """
        return self.clean_train(df)

    def fit(self, df, y=None):
        """Returns `self` """
        return self

In [5]:
class create_pivot_from_raw_train(BaseEstimator, TransformerMixin):
    """Takes in raw dataframe, creates a pivot table ready for Machine learning """
    def __init__(self):
        self.week_level=False

    def create_pivot(self, df, y=None):
        Temp_Agg1 = df.groupby(['shop_id', 'item_id','Year','Month', 'Week']).agg(['sum'])['item_cnt_day']
        Temp_Agg1= Temp_Agg1.reset_index()
        if(not self.week_level):
            Temp_Pivot1 = pd.pivot_table(Temp_Agg1, values='sum', columns=['Year','Month'], index =['shop_id','item_id'],fill_value=0)
            Temp_Pivot1.reset_index(inplace=True)
            Temp_Pivot1.columns = [str(col[0])+"_"+str(col[1]) for col in Temp_Pivot1.columns.values]
            Temp_Pivot1.rename(columns={'shop_id_':'shop_id', 'item_id_':'item_id'}, inplace=True)            
#             Temp_Pivot1.to_hdf("./Shop_Item_Month_Level.h5", 'table', complelvel=9, complib='zlib')  

        else:
            Temp_Pivot1 = pd.pivot_table(Temp_Agg1, values='sum', columns=['Year','Month','Week'], index =['shop_id','item_id'],fill_value=0)
            Temp_Pivot1.reset_index(inplace=True)
            Temp_Pivot1.columns = [str(col[0])+"_"+str(col[1])+"_"+str(col[2]) for col in Temp_Pivot2.columns.values]
            Temp_Pivot1.rename(columns={'shop_id_':'shop_id', 'item_id_':'item_id'}, inplace=True)
#             Temp_Pivot1.to_hdf("./Shop_Item_Week_Level.h5", 'table', complelvel=9, complib='zlib')  
        return Temp_Pivot1
    
    def transform(self, df, y=None):
        """ Transform function to apply clean_digits function above """
        return self.create_pivot(df)

    def fit(self, df, y=None):
        """Returns `self` """
        return self

In [6]:
class create_item_level_grp(BaseEstimator, TransformerMixin):
    ''' creates a item level pivot ignores shop_id '''
    def __init__(self):
        self.week_level=False
        self.derive_new_columns= False

    def create_item_pivot(self,df):     
        Item_Agg_Week = df.groupby(['item_id','Year','Month','Week']).agg(['sum'])['item_cnt_day']
        Item_Agg_Week = Item_Agg_Week.reset_index()
        
        if(self.week_level):
            Pivot1 = pd.pivot_table(Item_Agg_Week, values='sum', columns=['Year','Month','Week'], index =['item_id'],fill_value=0)
            Pivot1.reset_index(inplace=True)
            Pivot1.columns = [str(col[0])+"_"+str(col[1])+"_"+str(col[2]) for col in Pivot1.columns.values]
            Pivot1.rename(columns={'item_id__':'item_id'}, inplace=True)
#             Pivot1.to_hdf("./Item_Agg_Week.h5", 'table', complevel=9, complib='zlib')
            if(self.derive_new_columns):
                Pivot1= derive_new_week(Pivot1)
            
        else:
            Pivot1 = pd.pivot_table(Item_Agg_Week, values='sum', columns=['Year','Month'], index =['item_id'],fill_value=0)
            Pivot1.reset_index(inplace=True)
            Pivot1.columns = [str(col[0])+"_"+str(col[1]) for col in Pivot1.columns.values]
            Pivot1.rename(columns={'item_id_':'item_id'}, inplace=True)
#             Pivot1.to_hdf("./Item_Agg_Month.h5", 'table', complevel=9, complib='zlib')
            if(self.derive_new_columns):
                Pivot1= derive_new_week(Pivot1)
        return Pivot1
    
    def transform(self, df, y=None):
        """ Transform function to apply clean_digits function above """
        return self.create_item_pivot(df)

    def fit(self, df, y=None):
        """Returns `self` """
        return self


In [90]:
class create_df_for_train(BaseEstimator, TransformerMixin):
    "Pipeline class for selecting required columns"
    def __init__(self,start_year = None,start_month = None, target_year = None, target_month=None, reqd_cols = None):
        self.target_year =target_year
        self.target_month = target_month
        self.reqd_cols=reqd_cols
        self.start_year = start_year
        self.start_month = start_month

    def select_cols(self,df):
        import re
        
        month_dictionary = {'Jan':1,'Feb':2 , 'Mar':3,  'Jul':7, 'Aug':8, 'Sep':9,
                           'Apr':4, 'May':5, 'Jun':6,   'Oct':10, 'Nov':11, 'Dec':12}
        if(self.reqd_cols ==None):
            print("Required columns missing -- please check")
        else:
            if(self.target_year ==None or self.target_month ==None):
                print("Pls check target year and month - they are missing")
            else:
                Start_Month = month_dictionary[self.start_month]
                Start_Col = str(self.start_year)+"_"+str(Start_Month)
                
                pattern1 = Start_Col
                indices1 = [i for i, x in enumerate(df.columns) if re.search(pattern1, x)]

                
                Target_Month = month_dictionary[self.target_month]
                Target_Col = str(self.target_year)+"_"+str(Target_Month)
                pattern2 = Target_Col
                indices2 = [i for i, x in enumerate(df.columns) if re.search(pattern2, x)]
                tmp = df[df.columns[indices1[0]:indices2[0]]]
                col_list = ["month_"+str(i) for i in range(tmp.shape[1],0,-1)]
                tmp.columns= col_list
#                 x_cols = self.reqd_cols +[df.columns[0,1,indices1[0]:indices2[0]]]
#                 y_cols = self.reqd_cols +[df.columns[indices2[0]]]
#                 print(x_cols)
#                 print(df.columns[0,1,indices1[0]:indices2[0]])
                print(pattern1, pattern2, indices1, indices2)
                return (pd.concat([df[self.reqd_cols],tmp ], axis=1),
                        pd.concat([df[self.reqd_cols], df[df.columns[indices2[0] ]]], axis=1)) 

    def transform(self, df, y=None):
        """ Transform function to apply clean_digits function above """
        return self.select_cols(df)

    def fit(self, df, y=None):
        """Returns `self` """
        return self
            
        
        

In [211]:
class create_month_cumulative(create_df_for_train):
    def transform(self, df,y=None):
        tmp,_  = self.select_cols(df)
        tmp_cols = ["cum_"+col for col in tmp.iloc[:,::-1].columns[:-2] ]
        new_names = list(tmp.columns[:2])+list(tmp_cols)
        print(new_names)
        tmp2 = pd.concat([ tmp.iloc[:,:2]   , tmp.iloc[:, ::-1].cumsum(axis=1).iloc[:,:-2]], axis=1)
        tmp2.columns = new_names
        return(tmp2)

In [83]:
pwd()

'/home/a115505/predicti'

# Functions

In [73]:
def run_prediction(start_year, start_month, target_year, target_month, RFModel, traindata):
    pipe4 = create_df_for_train(start_year = start_year, start_month=start_month, target_year=target_year, target_month=target_month,reqd_cols=['shop_id','item_id'])
    features,sales = pipe4.transform(traindata)
    pred = RFModel.predict(features.drop(['shop_id','item_id'],axis=1))
    print(np.sqrt(metrics.mean_squared_error(sales.iloc[:,-1],pred)))

In [32]:
def fit_and_cal(X_train,y_train,n_trees=40):
    regr = RandomForestRegressor(n_estimators=n_trees, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
    regr.fit(X_train,y_train)
    predictions = regr.predict(X_train)
    RF_metric1= np.sqrt(metrics.mean_squared_error(y_train,predictions))
    print(RF_metric1)
    return(regr)

In [122]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

In [136]:
def oob_regression_r2_score(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) R^2 for a scikit-learn random forest
    regressor. We learned the guts of scikit's RF from the BSD licensed
    code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    X = X_train.values
    y = y_train.values

    n_samples = len(X)
    predictions = np.zeros(n_samples)
    n_predictions = np.zeros(n_samples)
    for tree in rf.estimators_:
        unsampled_indices = _generate_unsampled_indices(tree.random_state, n_samples)
        tree_preds = tree.predict(X[unsampled_indices, :])
        predictions[unsampled_indices] += tree_preds
        n_predictions[unsampled_indices] += 1

    if (n_predictions == 0).any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        n_predictions[n_predictions == 0] = 1

    predictions /= n_predictions

    oob_score = r2_score(y, predictions)
    return oob_score

In [137]:
def permutation_importances(rf, X_train, y_train, metric):
    baseline = metric(rf, X_train, y_train)
    imp = []
    for col in X_train.columns:
        save = X_train[col].copy()
        X_train[col] = np.random.permutation(X_train[col])
        m = metric(rf, X_train, y_train)
        X_train[col] = save
        imp.append(baseline - m)
    return np.array(imp)

# Read data files

In [9]:
item_categories = pd.read_csv("./item_categories.csv")
items = pd.read_csv("./items.csv")
shops= pd.read_csv("./shops.csv")
train = pd.read_csv('./sales_train.csv.gz', compression='gzip')
test = pd.read_csv("./test.csv.gz", compression = "gzip")

In [None]:
## Clearning train data frame

pipeline1 = Pipeline([
            ('clean_train', clean_train()),
        ])
tmp1 = pipeline1.fit_transform(train)

In [11]:
tmp1.to_hdf("./new/New_train.h5", 'table', complevel=9, complib='zlib')

# Creating ML ready data frame

In [12]:
pipeline2 = Pipeline([
            ('create_pivot2', create_pivot_from_raw_train()),
        ])
train_pivot = pipeline2.fit_transform(tmp1)

In [13]:
train_pivot.to_hdf("./new/train_pivot_month.h5", 'table', complevel=9, complib='zlib')

In [14]:
train_pivot.head()

Unnamed: 0,shop_id,item_id,2013_1,2013_2,2013_3,2013_4,2013_5,2013_6,2013_7,2013_8,...,2015_1,2015_2,2015_3,2015_4,2015_5,2015_6,2015_7,2015_8,2015_9,2015_10
0,0,30,0.0,10.333333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,31,0.0,3.666667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,32,2.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,35,1.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Creating item id level summary

In [15]:
pipe3 = create_item_level_grp()
item_grp = pipe3.transform(tmp1)

In [16]:
item_grp.head()

Unnamed: 0,item_id,2013_1,2013_2,2013_3,2013_4,2013_5,2013_6,2013_7,2013_8,2013_9,...,2015_1,2015_2,2015_3,2015_4,2015_5,2015_6,2015_7,2015_8,2015_9,2015_10
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
item_grp.to_hdf("./new/item_grp_month.h5", 'table', complevel=9, complib='zlib')

# Creating data frame given target month and year - For 2015 July

In [123]:
pipe4 = create_df_for_train(start_year = 2013, start_month='Jan', target_year=2015, target_month='Jul',reqd_cols=['shop_id','item_id'])
X,y = pipe4.transform(train_pivot)

2013_1 2015_7 [2, 11, 12, 13] [32]


In [124]:
drop_columns = ['shop_id','item_id']
JulyX = X.drop(drop_columns, axis=1)
JulyY = y.drop(drop_columns, axis=1)

In [118]:
# for i in range(5):
#     print(" fitting model number -- " , (i+1), fit_and_cal(Xnew,ynew.values.ravel()))

In [125]:
Model1 = fit_and_cal(JulyX,JulyY.values.ravel())

0.272171224818


# Generating predictions using above model for remaining months

In [120]:
run_prediction(2013,'Feb', 2015,'Aug', Model1,train_pivot)

2013_2 2015_8 [3] [33]
0.364491713901


In [121]:
run_prediction(2013,'Mar', 2015,'Sep', Model1,train_pivot)

2013_3 2015_9 [4] [34]
1.03562787621


Model that was built using data from 2013-Jan to 2015-June to predict sales in July works well for August predictions. But RMSE value increases while predicting September sales. Model built using most recent data might be working well.


So, lets check if this hypothesis is valid by building a model with data from 2013-jan to 2015-Aug to predict September sales.



# Creating dataset from 2013-Jan to 2015-Aug

In [126]:
pipe4 = create_df_for_train(start_year = 2013, start_month='Jan', target_year=2015, target_month='Aug',reqd_cols=['shop_id','item_id'])
X,y = pipe4.transform(train_pivot)

2013_1 2015_8 [2, 11, 12, 13] [33]


In [127]:
drop_columns = ['shop_id','item_id']
AugX = X.drop(drop_columns, axis=1)
AugY = y.drop(drop_columns, axis=1)

In [128]:
# for i in range(5):
#     print(" fitting model number -- " , (i+1), fit_and_cal(Xnew,ynew))

In [129]:
Model2 = fit_and_cal(AugX,AugY.values.ravel())

0.310654233327


In [115]:
run_prediction(2013,'Feb', 2015,'Sep', Model2,train_pivot)

2013_2 2015_9 [3] [34]
1.03351413883


There seems to be something else going on here. RMSE value decreased very mariginally from 1.035 to 1.033. Difference in RMSE decreas is not siginificant.

# Lets look at Feature improtance between these two models

In [134]:
Feat_Imp_Jul= rf_feat_importance(Model1,JulyX)
#As expected, most recent month as higher predictive power compared to other months. But some of the older months also have higher predictive power
# Feat_Imp_Jul.plot(x='cols',y='imp')
# Feat_Imp_Jul.sort_values('cols',ascending=False).plot(x='cols',y='imp')
Feat_Imp_Jul['Rank_RF'] = Feat_Imp_Jul['imp'].rank(ascending=False)

Unnamed: 0,cols,imp
29,month_1,0.34596
23,month_7,0.090728
28,month_2,0.087069
11,month_19,0.04719
24,month_6,0.043998
25,month_5,0.043755
27,month_3,0.043259
19,month_11,0.039756
20,month_10,0.035134
17,month_13,0.028259


In [140]:
from sklearn.ensemble.forest import _generate_unsampled_indices
from sklearn.metrics import r2_score
imp = permutation_importances(Model1, JulyX, JulyY,oob_regression_r2_score)

In [142]:
Imp_Df = pd.DataFrame({'colname':JulyX.columns, 'Imp_Val':imp}).sort_values(by= 'Imp_Val',ascending=False)
Imp_Df['Rank_New'] = Imp_Df['Imp_Val'].rank(ascending=False)

In [143]:
pd.merge(Feat_Imp_Jul, Imp_Df, how = 'inner',left_on = 'cols', right_on='colname'); Imp_Df.head(n=10)

Unnamed: 0,cols,imp,Rank_RF,Imp_Val,colname,Rank_New
0,month_1,0.34596,1.0,0.250355,month_1,1.0
1,month_7,0.090728,2.0,0.023662,month_7,4.0
2,month_2,0.087069,3.0,0.035185,month_2,2.0
3,month_19,0.04719,4.0,0.010635,month_19,8.0
4,month_6,0.043998,5.0,0.023241,month_6,5.0
5,month_5,0.043755,6.0,0.00729,month_5,13.0
6,month_3,0.043259,7.0,0.024335,month_3,3.0
7,month_11,0.039756,8.0,0.00145,month_11,30.0
8,month_10,0.035134,9.0,0.007731,month_10,12.0
9,month_13,0.028259,10.0,0.006425,month_13,17.0


Unnamed: 0,Imp_Val,colname,Rank_New
29,0.250355,month_1,1.0
28,0.035185,month_2,2.0
27,0.024335,month_3,3.0
23,0.023662,month_7,4.0
24,0.023241,month_6,5.0
0,0.012479,month_30,6.0
1,0.012137,month_29,7.0
11,0.010635,month_19,8.0
2,0.009831,month_28,9.0
6,0.009554,month_24,10.0


In [144]:
Feat_Imp_Aug= rf_feat_importance(Model2,AugX)
#As expected, most recent month as higher predictive power compared to other months. But some of the older months also have higher predictive power
# Feat_Imp_Jul.plot(x='cols',y='imp')
# Feat_Imp_Jul.sort_values('cols',ascending=False).plot(x='cols',y='imp')
Feat_Imp_Aug['Rank_RF'] = Feat_Imp_Aug['imp'].rank(ascending=False)

In [145]:
imp2 = permutation_importances(Model2, AugX, AugY,oob_regression_r2_score)

In [148]:
Imp_Df2 = pd.DataFrame({'colname':AugX.columns, 'Imp_Val':imp2}).sort_values(by= 'Imp_Val',ascending=False)
Imp_Df2['Rank_New'] = Imp_Df2['Imp_Val'].rank(ascending=False)

In [149]:
pd.merge(Feat_Imp_Aug, Imp_Df2, how = 'inner',left_on = 'cols', right_on='colname'); Imp_Df2.head(n=10)

Unnamed: 0,cols,imp,Rank_RF,Imp_Val,colname,Rank_New
0,month_1,0.277774,1.0,0.159239,month_1,1.0
1,month_2,0.183287,2.0,0.062917,month_2,2.0
2,month_8,0.053957,3.0,0.017061,month_8,8.0
3,month_3,0.049185,4.0,0.01138,month_3,17.0
4,month_7,0.044138,5.0,0.008252,month_7,23.0
5,month_4,0.040884,6.0,0.00883,month_4,21.0
6,month_14,0.038017,7.0,0.013806,month_14,11.0
7,month_12,0.030095,8.0,0.005818,month_12,31.0
8,month_5,0.02917,9.0,0.011826,month_5,14.0
9,month_11,0.028996,10.0,0.005976,month_11,30.0


Unnamed: 0,Imp_Val,colname,Rank_New
30,0.159239,month_1,1.0
29,0.062917,month_2,2.0
0,0.025514,month_31,3.0
1,0.021452,month_30,4.0
2,0.01973,month_29,5.0
5,0.017314,month_26,6.0
4,0.017285,month_27,7.0
23,0.017061,month_8,8.0
21,0.015811,month_10,9.0
3,0.01403,month_28,10.0


# New features

In [212]:
pipe4 = create_month_cumulative(start_year = 2013, start_month='Jan', target_year=2015, target_month='Aug',reqd_cols=['shop_id','item_id'])
cum_feat = pipe4.transform(train_pivot)

2013_1 2015_8 [2, 11, 12, 13] [33]
['shop_id', 'item_id', 'cum_month_1', 'cum_month_2', 'cum_month_3', 'cum_month_4', 'cum_month_5', 'cum_month_6', 'cum_month_7', 'cum_month_8', 'cum_month_9', 'cum_month_10', 'cum_month_11', 'cum_month_12', 'cum_month_13', 'cum_month_14', 'cum_month_15', 'cum_month_16', 'cum_month_17', 'cum_month_18', 'cum_month_19', 'cum_month_20', 'cum_month_21', 'cum_month_22', 'cum_month_23', 'cum_month_24', 'cum_month_25', 'cum_month_26', 'cum_month_27', 'cum_month_28', 'cum_month_29', 'cum_month_30', 'cum_month_31']


In [214]:
cum_feat.head()

Unnamed: 0,shop_id,item_id,cum_month_1,cum_month_2,cum_month_3,cum_month_4,cum_month_5,cum_month_6,cum_month_7,cum_month_8,...,cum_month_22,cum_month_23,cum_month_24,cum_month_25,cum_month_26,cum_month_27,cum_month_28,cum_month_29,cum_month_30,cum_month_31
0,0,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.333333,10.333333
1,0,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.666667,3.666667
2,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,4.5
3,0,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
4,0,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,4.5


In [215]:
# print(list(train_pivot.columns[:2])+(list(train_pivot.columns[-2:])))
train_pivot.columns[:4:]

Index(['shop_id', 'item_id', '2013_1', '2013_2'], dtype='object')

In [177]:
cum_feat.head()

Unnamed: 0,shop_id,item_id,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,...,month_22,month_23,month_24,month_25,month_26,month_27,month_28,month_29,month_30,month_31
0,0,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.333333,10.333333
1,0,31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.666667,3.666667
2,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,4.5
3,0,33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
4,0,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,4.5


In [169]:
cum_feat.columns[::-1]

Index(['item_id', 'shop_id', 'month_31', 'month_30', 'month_29', 'month_28',
       'month_27', 'month_26', 'month_25', 'month_24', 'month_23', 'month_22',
       'month_21', 'month_20', 'month_19', 'month_18', 'month_17', 'month_16',
       'month_15', 'month_14', 'month_13', 'month_12', 'month_11', 'month_10',
       'month_9', 'month_8', 'month_7', 'month_6', 'month_5', 'month_4',
       'month_3', 'month_2', 'month_1'],
      dtype='object')

In [159]:
# train_pivot.cumsum(axis=1)
train_pivot.iloc[:, ::-1].cumsum(axis=1).iloc[:,:-2]

Unnamed: 0,2015_10,2015_9,2015_8,2015_7,2015_6,2015_5,2015_4,2015_3,2015_2,2015_1,...,2013_10,2013_9,2013_8,2013_7,2013_6,2013_5,2013_4,2013_3,2013_2,2013_1
0,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,10.333333,10.333333
1,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.666667,3.666667
2,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.500000,4.500000
3,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,2.000000
4,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.500000,4.500000
5,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000
6,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000
7,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000
8,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000
9,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000


In [126]:
pipe4 = create_df_for_train(start_year = 2013, start_month='Jan', target_year=2015, target_month='Aug',reqd_cols=['shop_id','item_id'])
X,y = pipe4.transform(train_pivot)

2013_1 2015_8 [2, 11, 12, 13] [33]


# Generating item_id level features

In [41]:
pipe5 = create_df_for_train(start_year=2013, start_month='Jan',target_year=2015, target_month='Jul',reqd_cols=['item_id'])
X_item,y_item = pipe5.transform(item_grp)

2013_1 2015_7 [1, 10, 11, 12] [31]


In [42]:
X_item.head()

Unnamed: 0,item_id,2013_1,2013_2,2013_3,2013_4,2013_5,2013_6,2013_7,2013_8,2013_9,...,2014_9,2014_10,2014_11,2014_12,2015_1,2015_2,2015_3,2015_4,2015_5,2015_6
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
y_item.head()

Unnamed: 0,item_id,2015_7
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
