### Importing required libraries

In [1018]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
from lightgbm import LGBMRegressor
import time
from sklearn.model_selection import KFold

### Load Data

In [1019]:
# Read the competition files; the 'Date' column is parsed to datetime on load.
train = pd.read_csv('Train.csv', parse_dates=['Date'])
test = pd.read_csv('Test.csv', parse_dates=['Date'])
# Submission template that is filled with predictions further down.
sub = pd.read_excel('sample_submission.xlsx')

### Basic Understanding of Data

In [1020]:
# Peek at the first rows of the training data.
train.head()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date,Selling_Price
0,P-2610,B-659,bags wallets belts,bags,hand bags,4.3,2017-02-03,291.0
1,P-2453,B-3078,clothing,women s clothing,western wear,3.1,2015-07-01,897.0
2,P-6802,B-1810,home decor festive needs,showpieces,ethnic,3.5,2019-01-12,792.0
3,P-4452,B-3078,beauty and personal care,eye care,h2o plus eye care,4.0,2014-12-12,837.0
4,P-8454,B-3078,clothing,men s clothing,t shirts,4.3,2013-12-12,470.0


In [1021]:
# Peek at the last rows of the training data.
train.tail()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date,Selling_Price
2447,P-8870,B-2292,clothing,kids clothing,girls wear,2.3,2017-11-03,741.0
2448,P-10672,B-3078,footwear,women s footwear,casual shoes,1.9,2016-04-12,1590.0
2449,P-2134,B-479,clothing,men s clothing,t shirts,1.9,2019-04-19,995.0
2450,P-724,B-133,automotive,accessories spare parts,car interior exterior,2.7,2014-01-12,1598.0
2451,P-1154,B-174,bags wallets belts,bags,pouches and potlis,4.1,2019-03-03,397.0


In [1022]:
# Peek at the first rows of the test data.
test.head()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date
0,P-11284,B-2984,computers,network components,routers,4.3,2018-01-12
1,P-6580,B-1732,jewellery,bangles bracelets armlets,bracelets,3.0,2012-12-20
2,P-5843,B-3078,clothing,women s clothing,western wear,1.5,2014-01-12
3,P-5334,B-1421,jewellery,necklaces chains,necklaces,3.9,2019-01-12
4,P-5586,B-3078,clothing,women s clothing,western wear,1.4,2017-01-12


In [1023]:
# Peek at the last rows of the test data.
test.tail()

Unnamed: 0,Product,Product_Brand,Item_Category,Subcategory_1,Subcategory_2,Item_Rating,Date
1046,P-9758,B-2543,sports fitness,outdoor adventure,camping hiking,2.0,2014-02-28
1047,P-11898,B-3197,jewellery,necklaces chains,necklaces,4.7,2019-01-12
1048,P-904,B-133,automotive,accessories spare parts,car interior exterior,3.5,2014-01-12
1049,P-1714,B-358,kitchen dining,lighting,bulbs,2.4,2013-06-23
1050,P-620,B-133,automotive,accessories spare parts,car interior exterior,3.1,2012-01-12


In [1024]:
# Column dtypes; 'Date' is datetime64 because it was parsed while loading.
train.dtypes

Product                  object
Product_Brand            object
Item_Category            object
Subcategory_1            object
Subcategory_2            object
Item_Rating             float64
Date             datetime64[ns]
Selling_Price           float64
dtype: object

In [1025]:
# Summary statistics of the numeric training columns.
train.describe()

Unnamed: 0,Item_Rating,Selling_Price
count,2452.0,2452.0
mean,3.078467,2494.375612
std,1.187137,7115.256516
min,1.0,33.0
25%,2.0,371.0
50%,3.1,596.0
75%,4.1,1195.25
max,5.0,116289.0


In [1026]:
# Tag each row with its origin so the combined frame can be split again later.
train['train_or_test'] = 'train'
test['train_or_test'] = 'test'
# Stack train and test so every feature transform is applied to both at once.
# `test` has no 'Selling_Price' column, so the two frames are not
# column-aligned; sort=True pins the alphabetical column order this notebook
# was executed with (see the FutureWarning in the recorded output) instead of
# letting it depend on the installed pandas version.
df = pd.concat([train, test], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


### Time Based Features

In [1027]:
def create_date_featues(df):
    """Add calendar features derived from the ``Date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with four new columns:
        ``Month``, ``DayOfyear``, ``Quarter`` and ``Is_month_start``.
    """
    # Parse once and reuse the accessor instead of re-parsing for each feature.
    dates = pd.to_datetime(df['Date']).dt

    df['Month'] = dates.month
    df['DayOfyear'] = dates.dayofyear
    df['Quarter'] = dates.quarter
    df['Is_month_start'] = dates.is_month_start

    return df

In [1028]:
# Adds Month, DayOfyear, Quarter and Is_month_start to the combined frame.
df=create_date_featues(df)

#### Replaced 'unknown' with NaN and created a boolean feature per affected column that is True wherever the value is missing. Imputing the NaNs of Subcategory_1 didn't give useful results, so they are filled back with 'unknown'.

In [1029]:
# Treat the literal placeholder 'unknown' as a real missing value.
df['Subcategory_1'] = df['Subcategory_1'].replace('unknown', np.nan)
df['Subcategory_2'] = df['Subcategory_2'].replace('unknown', np.nan)

# One boolean indicator per column that contains any NaN. On the combined
# frame this also flags 'Selling_Price' (absent for the test rows); that
# indicator is dropped again before modelling.
cols_with_missing = [col for col in df.columns if df[col].isnull().any()]
for col in cols_with_missing:
    df[col + '_was_missing'] = df[col].isnull()

# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form mutates an intermediate object and is not
# guaranteed to update `df` (pandas warns about it, and it has no effect under
# copy-on-write).
df['Subcategory_1'] = df['Subcategory_1'].fillna('unknown')
# Filling Subcategory_1 from Item_Category (and Subcategory_2 from
# Subcategory_1) was tried and abandoned; 'unknown' is kept instead.

#### Replaced the majority of NaNs in Subcategory_2 with the mode of Subcategory_2 grouped by Subcategory_1; the few remaining NaN values are left as 'unknown'.

In [1030]:
# Train and test were concatenated with their own row labels, so labels repeat.
# Rebuild a unique index (the old labels move to an 'index' column) so the
# index-aligned .loc assignment further down in this cell is unambiguous.
df.reset_index(inplace=True)
def fast_mode(df, key_cols, value_col):
    """
    Calculate a column mode, by group, ignoring null values.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame over which to calculate the mode.
    key_cols : list of str
        Columns to group by for the calculation of the mode.
    value_col : str
        Column for which to calculate the mode.

    Return
    ------
    pandas.DataFrame
        One row for the mode of value_col per key_cols group. If ties,
        returns the one which is sorted first.
    """
    # groupby emits rows ordered by (key_cols, value_col). A *stable* sort on
    # the counts keeps that order among equal counts, which is what makes the
    # "sorted first" tie-break above hold; the default sort kind is not stable.
    counts = (df.groupby(key_cols + [value_col]).size()
                .to_frame('counts').reset_index())
    ranked = counts.sort_values('counts', ascending=False, kind='mergesort')
    # The first row per key group is now its most frequent value.
    return ranked.drop_duplicates(subset=key_cols).drop(columns='counts')


# Most frequent Subcategory_2 within each Subcategory_1, as a lookup Series.
mode_by_subcat1 = (fast_mode(df, ['Subcategory_1'], 'Subcategory_2')
                   .set_index('Subcategory_1')['Subcategory_2'])
# Fill only the rows whose Subcategory_2 is missing.
missing_subcat2 = df['Subcategory_2'].isnull()
df.loc[missing_subcat2, 'Subcategory_2'] = (
    df.loc[missing_subcat2, 'Subcategory_1'].map(mode_by_subcat1))
# Remove the helper column left behind by reset_index().
del df['index']
# Groups without any observed Subcategory_2 have no mode; keep 'unknown'.
# Assigned back rather than filled in place on the selected column, which is
# not guaranteed to update `df`.
df['Subcategory_2'] = df['Subcategory_2'].fillna('unknown')

### Interaction Features

In [1031]:
# Pairwise interaction features: two categorical columns joined with '_'.
# Listed in the order the new columns are appended to the frame.
interaction_specs = [
    ('PB_S1', 'Product_Brand', 'Subcategory_1'),
    ('PB_S2', 'Product_Brand', 'Subcategory_2'),
    ('IC_S1', 'Item_Category', 'Subcategory_1'),
    ('IC_S2', 'Item_Category', 'Subcategory_2'),
    ('S1_S2', 'Subcategory_1', 'Subcategory_2'),
]
for new_col, left_col, right_col in interaction_specs:
    df[new_col] = df[left_col] + '_' + df[right_col]


### Label Encoding

In [1032]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes.
label_encoded_cols = ['Item_Category', 'Product', 'Subcategory_1',
                      'PB_S2', 'IC_S1', 'S1_S2']

le = LabelEncoder()
for col in label_encoded_cols:
    # fit_transform refits on every call, so a single encoder can be reused.
    df[col] = le.fit_transform(df[col])


### Frequency Encoding

In [1033]:
# Share of all rows (train + test together) that carry each brand.
fe_pol = df.groupby('Product_Brand').size() / len(df)
df['Product_Brand_fe'] = df['Product_Brand'].map(fe_pol)

In [1034]:
# Share of all rows (train + test together) per Subcategory_1 value.
fe_pol = df.groupby('Subcategory_1').size() / len(df)
df['Subcategory_1_fe'] = df['Subcategory_1'].map(fe_pol)

In [1035]:
# Share of all rows (train + test together) per Subcategory_2 value.
fe_pol = df.groupby('Subcategory_2').size() / len(df)
df['Subcategory_2_fe'] = df['Subcategory_2'].map(fe_pol)

In [1036]:
# Share of all rows (train + test together) per brand/Subcategory_2 pair.
fe_pol = df.groupby('PB_S2').size() / len(df)
df['PB_S2_fe'] = df['PB_S2'].map(fe_pol)

### Binary Encoding

In [1037]:
import category_encoders as ce

# Binary-encode Product_Brand and swap the encoded columns in for the raw one.
encoder = ce.BinaryEncoder(cols=['Product_Brand'])
dfbin = encoder.fit_transform(df['Product_Brand'])
df = pd.concat([df.drop(columns='Product_Brand'), dfbin], axis=1)

In [1038]:
# Binary-encode Subcategory_2 and swap the encoded columns in for the raw one.
encoder = ce.BinaryEncoder(cols=['Subcategory_2'])
dfbin = encoder.fit_transform(df['Subcategory_2'])
df = pd.concat([df.drop(columns='Subcategory_2'), dfbin], axis=1)

In [1039]:
import category_encoders as ce

# Binary-encode the brand/Subcategory_1 interaction and drop the raw column.
encoder = ce.BinaryEncoder(cols=['PB_S1'])
dfbin = encoder.fit_transform(df['PB_S1'])
df = pd.concat([df.drop(columns='PB_S1'), dfbin], axis=1)

In [1040]:
import category_encoders as ce

# Binary-encode the brand/Subcategory_2 interaction and drop the source column
# (PB_S2 already holds label-encoded integers at this point).
encoder = ce.BinaryEncoder(cols=['PB_S2'])
dfbin = encoder.fit_transform(df['PB_S2'])
df = pd.concat([df.drop(columns='PB_S2'), dfbin], axis=1)

In [1041]:
import category_encoders as ce

# Binary-encode the category/Subcategory_2 interaction and drop the raw column.
encoder = ce.BinaryEncoder(cols=['IC_S2'])
dfbin = encoder.fit_transform(df['IC_S2'])
df = pd.concat([df.drop(columns='IC_S2'), dfbin], axis=1)

### Getting back train and test

In [1042]:
# Split the combined frame back into its two parts using the origin tag.
# drop() without inplace returns a new DataFrame, so `train` and `test` are
# independent frames rather than slices of `df` mutated in place (the cause of
# the SettingWithCopyWarning recorded below this cell).
train = df.loc[df['train_or_test'] == 'train'].drop(columns='train_or_test')
test = df.loc[df['train_or_test'] == 'test'].drop(columns='train_or_test')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


### Log transforming target variable

In [1043]:
# Model log(price): the target is strongly right-skewed (mean ~2494 vs.
# median 596 in train.describe()).
# assign() builds a new frame, so nothing is written into a slice of `df`
# (the source of the SettingWithCopyWarning recorded below this cell).
# NOTE: not idempotent - re-running this cell applies the log a second time.
train = train.assign(Selling_Price=np.log(train['Selling_Price']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [1044]:
# List the columns available after feature engineering.
train.columns

Index(['Date', 'Item_Category', 'Item_Rating', 'Product', 'Selling_Price',
       'Subcategory_1', 'Month', 'DayOfyear', 'Quarter', 'Is_month_start',
       'Selling_Price_was_missing', 'Subcategory_1_was_missing',
       'Subcategory_2_was_missing', 'IC_S1', 'S1_S2', 'Product_Brand_fe',
       'Subcategory_1_fe', 'Subcategory_2_fe', 'PB_S2_fe', 'Product_Brand_0',
       'Product_Brand_1', 'Product_Brand_2', 'Product_Brand_3',
       'Product_Brand_4', 'Product_Brand_5', 'Product_Brand_6',
       'Product_Brand_7', 'Product_Brand_8', 'Product_Brand_9',
       'Product_Brand_10', 'Product_Brand_11', 'Subcategory_2_0',
       'Subcategory_2_1', 'Subcategory_2_2', 'Subcategory_2_3',
       'Subcategory_2_4', 'Subcategory_2_5', 'Subcategory_2_6',
       'Subcategory_2_7', 'Subcategory_2_8', 'Subcategory_2_9', 'PB_S1_0',
       'PB_S1_1', 'PB_S1_2', 'PB_S1_3', 'PB_S1_4', 'PB_S1_5', 'PB_S1_6',
       'PB_S1_7', 'PB_S1_8', 'PB_S1_9', 'PB_S1_10', 'PB_S1_11', 'PB_S2_0',
       'PB_S2_1', 'PB_S2

### Here I have also dropped features with low importance

In [1045]:
# Columns excluded from the feature matrix: the target, the raw date, the
# target-missing indicator, and encoded bits that were dropped for low
# importance. One shared list keeps train and test features identical.
drop_cols = ['Selling_Price', 'Date', 'Selling_Price_was_missing',
             'Subcategory_2_0', 'PB_S1_0', 'Product_Brand_1', 'PB_S2_0',
             'Product_Brand_0', 'IC_S2_0']

x = train.drop(columns=drop_cols)
y = train['Selling_Price']
test = test.drop(columns=drop_cols)

In [1046]:
from sklearn.preprocessing import StandardScaler

col = x.columns
st = StandardScaler()
# Scaling statistics are learned from the training features only and then
# reused for the test features; both come back as DataFrames with the
# original column names (and a fresh default index).
x = pd.DataFrame(st.fit_transform(x), columns=col)
test = pd.DataFrame(st.transform(test), columns=col)

### Since the target variable is log-transformed, computing RMSE on it gives the RMSLE of the original prices (here with `log`, not `log1p`)

In [941]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Both arguments are passed straight to ``mean_squared_error``; the square
    root of its result is returned.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

### K-fold cross-validation

In [1048]:
err = []              # validation RMSE of each fold
y_pred_tot_lgbm = []  # test-set predictions of each fold's model

fold = KFold(n_splits=5, shuffle=True, random_state=2020)

for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    m = LGBMRegressor(n_estimators=1000, num_leaves=127, max_depth=8, min_child_samples=4,
                      learning_rate=0.02, colsample_bytree=0.4, reg_alpha=0.5, reg_lambda=2)
    # Stop once the validation fold has not improved for 100 rounds and log
    # every 200 rounds.
    # NOTE(review): newer LightGBM releases expect these two options as
    # callbacks rather than fit() keywords - confirm against the installed
    # version before upgrading.
    m.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)],
          early_stopping_rounds=100, verbose=200)

    pred_y = m.predict(x_val)
    # Score the fold once and reuse the value for both the log and the list.
    fold_err = rmse(y_val, pred_y)
    print(i, " err_lgbm: ", fold_err)
    err.append(fold_err)

    pred_test = m.predict(test)
    y_pred_tot_lgbm.append(pred_test)

# Mean validation error; divides by the actual number of folds rather than a
# hard-coded 5 so it stays correct if n_splits changes.
sum(err) / len(err)

Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.149351	valid_1's l2: 0.460698
[400]	training's l2: 0.0805028	valid_1's l2: 0.454747
Early stopping, best iteration is:
[321]	training's l2: 0.101046	valid_1's l2: 0.453721
1  err_lgbm:  0.6735882994837199
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.155561	valid_1's l2: 0.349065
[400]	training's l2: 0.0754913	valid_1's l2: 0.343882
Early stopping, best iteration is:
[325]	training's l2: 0.0954031	valid_1's l2: 0.343256
2  err_lgbm:  0.5858808382218244
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.148606	valid_1's l2: 0.424671
Early stopping, best iteration is:
[278]	training's l2: 0.107175	valid_1's l2: 0.42208
3  err_lgbm:  0.6496770246450424
Training until validation scores don't improve for 100 rounds
[200]	training's l2: 0.146011	valid_1's l2: 0.433763
[400]	training's l2: 0.0675067	valid_1's l2: 0.425623
Early stopping,

0.6333882703343926

In [1049]:
# Scratch note: the mean CV error printed by the cell above, rounded.
0.633

0.633

### Feature Importance

In [1050]:
# Importance scores of `m`, indexed by feature name. As executed here, `m` is
# the model fitted in the last cross-validation fold.
feat_imp = pd.DataFrame(m.feature_importances_, index=x.columns)

In [1051]:
# Least important features first (column 0 holds the importance scores).
feat_imp[0].sort_values(ascending=True)

Subcategory_1_was_missing        5
PB_S1_1                         60
Subcategory_2_1                 85
IC_S2_1                        116
Subcategory_2_was_missing      124
Subcategory_2_6                126
IC_S2_2                        140
Subcategory_2_2                150
Is_month_start                 154
PB_S2_5                        157
PB_S2_3                        171
Product_Brand_2                172
Product_Brand_9                185
IC_S2_8                        189
Subcategory_2_7                204
PB_S1_3                        205
PB_S2_1                        213
PB_S1_6                        223
PB_S1_2                        224
IC_S2_4                        227
IC_S2_9                        241
PB_S1_7                        249
PB_S2_4                        262
Product_Brand_11               265
Product_Brand_3                269
Subcategory_2_9                272
Subcategory_2_5                275
Product_Brand_7                281
IC_S2_3             

### Final Model on All train data

In [948]:
# Refit on the complete training set with a fixed number of trees, then
# predict the test set (predictions are on the log-price scale).
m = LGBMRegressor(
    n_estimators=325,
    num_leaves=127,
    max_depth=8,
    min_child_samples=4,
    learning_rate=0.02,
    colsample_bytree=0.4,
    reg_alpha=0.5,
    reg_lambda=2,
    random_state=2020,
)
m.fit(x, y)
lgbpred = m.predict(test)

In [949]:
#sub['Selling_Price']=np.exp(lgbpred)
# Predictions are stored on the log scale; the blending cell further down
# applies np.exp after averaging.
sub['Selling_Price']=lgbpred
# NOTE(review): the export below is disabled, yet a later cell reads
# 'lgbm.xlsx'. Presumably it must be re-enabled for a fresh-kernel run -
# confirm that the file is meant to hold exactly these predictions.
#sub.to_excel('lgbm.xlsx',index=False)

In [707]:
# Previously exported per-model predictions that are blended below.
lgb = pd.read_excel('lgbm.xlsx')
rf = pd.read_excel('randomforest.xlsx')  # Random Forest code is in another file.

### Blending LGBM and RF

In [708]:
# Weighted average of the two models' predictions (45% LGBM, 55% RF),
# exponentiated afterwards to undo the log transform of the target.
log_blend = 0.45 * lgb['Selling_Price'] + 0.55 * rf['Selling_Price']
average = np.exp(log_blend)

In [710]:
# Put the blended prices into the submission template and export it.
sub['Selling_Price'] = average
sub.to_excel('blending.xlsx', index=False)

### K-fold validation of the blended LGBM and RF predictions

In [666]:
from sklearn.ensemble import RandomForestRegressor

err = []             # validation RMSE of the blend, one entry per fold
y_pred_tot_xgb = []  # kept for compatibility; nothing is appended in this cell

fold = KFold(n_splits=5, shuffle=True, random_state=2020)

for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): both models' validation predictions are scaled by 0.999
    # here, while the submission blend built from the Excel files applies no
    # such factor - confirm which variant is intended.
    m = RandomForestRegressor(bootstrap=True, max_features=0.45, min_samples_leaf=1,
                              min_samples_split=5, n_estimators=100, random_state=2020)
    m.fit(x_train, y_train)
    rfpred = m.predict(x_val) * 0.999

    m = LGBMRegressor(n_estimators=442, num_leaves=127, max_depth=8, min_child_samples=4,
                      learning_rate=0.02, colsample_bytree=0.4, reg_alpha=0.5, reg_lambda=2,
                      random_state=2020)
    m.fit(x_train, y_train)
    lgbpred = m.predict(x_val) * 0.999

    # Same 55% RF / 45% LGBM weights as the submission blend, on the log scale.
    average = 0.55 * rfpred + 0.45 * lgbpred

    # Score the fold once and reuse the value for both the log and the list.
    # The "err_xgb" label is historical: the blend is RF + LightGBM.
    fold_err = rmse(y_val, average)
    print(i, " err_xgb: ", fold_err)
    err.append(fold_err)

# Mean validation error over however many folds were run, instead of summing
# five hard-coded list positions.
sum(err) / len(err)

1  err_xgb:  0.6744392062002934
2  err_xgb:  0.5830066895429288
3  err_xgb:  0.6439375050185021
4  err_xgb:  0.6536547390285575
5  err_xgb:  0.6097304120455125


0.6329537103671589

In [None]:
# Scratch note: a recorded blend CV score (the run shown above printed 0.63295).
0.6326