In [1]:
import featuretools as ft
import pandas as pd
import numpy as np

In [2]:
train=pd.read_csv('train_kOBLwZA.csv')

In [3]:
test=pd.read_csv('test_t02dQwI.csv')

In [4]:
test_Item_Identifier=test['Item_Identifier']
test_Outlet_Identifier=test['Outlet_Identifier']
sales=train['Item_Outlet_Sales']
train.drop('Item_Outlet_Sales',axis=1,inplace=True)

In [5]:
combi=train.append(test,ignore_index=True)

In [6]:
combi.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [8]:
#we add the missing data using 'fillna'

combi['Item_Weight'].fillna(combi['Item_Weight'].mean(),inplace=True)
combi['Outlet_Size'].fillna("missing",inplace=True)

In [9]:
#now into data preprocessing operations

combi['Item_Fat_Content'].value_counts()

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

In [10]:
#convert to binary since there are only two types

fat_content_dict={'Low Fat':0,'Regular':1,'LF':0,'reg':1,'low fat':0}

In [11]:
combi['Item_Fat_Content']=combi['Item_Fat_Content'].replace(fat_content_dict,regex=True)

In [12]:
combi['id']=combi['Item_Identifier']+combi['Outlet_Identifier']
combi.drop(['Item_Identifier'],axis=1,inplace=True)

In [13]:
#now we go on and create an entityset that we will use

es=ft.EntitySet(id='sales')

#then proceed to add a dataframe
es.entity_from_dataframe(entity_id='bigmart',dataframe=combi,index='id')

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [18]:
es.normalize_entity(base_entity_id='bigmart',new_entity_id='outlet',index='Outlet_Identifier',
                   additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])




Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [20]:
#now we use Deep Feature Synthesis which utilizes Feature Primitives 

feature_matrix,feature_names=ft.dfs(entityset=es,
                                   target_entity='bigmart',
                                    max_depth=2,
                                   verbose=1,
                                   n_jobs=3)

Built 37 features
EntitySet scattered to workers in 8.894 seconds
Elapsed: 00:02 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [21]:
#newly created features

feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.SUM(bigmart.Item_Weight)',
       'outlet.SUM(bigmart.Item_Fat_Content)',
       'outlet.SUM(bigmart.Item_Visibility)', 'outlet.SUM(bigmart.Item_MRP)',
       'outlet.STD(bigmart.Item_Weight)',
       'outlet.STD(bigmart.Item_Fat_Content)',
       'outlet.STD(bigmart.Item_Visibility)', 'outlet.STD(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MAX(bigmart.Item_Fat_Content)',
       'outlet.MAX(bigmart.Item_Visibility)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.SKEW(bigmart.Item_Weight)',
       'outlet.SKEW(bigmart.Item_Fat_Content)',
       'outlet.SKEW(bigmart.Item_Visibility)', 'outlet.SKEW(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Fat_Content)',
       

In [22]:
feature_matrix.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,...,outlet.MIN(bigmart.Item_Fat_Content),outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Fat_Content),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Type)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DRA12OUT010,11.6,0,0.068535,Soft Drinks,143.0154,OUT010,1998,missing,Tier 3,Grocery Store,...,0,0.0,32.6558,12.72287,0.356757,0.101939,141.159742,925,16,Fruits and Vegetables
DRA12OUT013,11.6,0,0.040912,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,...,0,0.0,31.49,12.788139,0.353509,0.060242,141.128428,1553,16,Fruits and Vegetables
DRA12OUT017,11.6,0,0.041178,Soft Drinks,140.3154,OUT017,2007,missing,Tier 2,Supermarket Type1,...,0,0.0,32.09,12.78208,0.35256,0.061142,140.998931,1543,16,Snack Foods
DRA12OUT018,11.6,0,0.041113,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,0,0.0,31.89,12.803638,0.353816,0.059976,141.000899,1546,16,Fruits and Vegetables
DRA12OUT027,12.792854,0,0.040748,Soft Drinks,140.0154,OUT027,1985,Medium,Tier 3,Supermarket Type3,...,0,0.0,31.29,12.792854,0.353432,0.060344,141.012347,1559,16,Fruits and Vegetables


In [23]:
feature_matrix=feature_matrix.reindex(index=combi['id'])
feature_matrix=feature_matrix.reset_index()

In [24]:
#now into model building and testing

from catboost import CatBoostRegressor

In [26]:
categorical_features=np.where(feature_matrix.dtypes == 'object')[0]

for i in categorical_features:
    feature_matrix.iloc[:,i]=feature_matrix.iloc[:,i].astype('str')

In [27]:
feature_matrix.drop(['id'],axis=1,inplace=True)

train=feature_matrix[:8523]
test=feature_matrix[8523:]

In [28]:
train.drop(['Outlet_Identifier'],axis=1,inplace=True)
test.drop(['Outlet_Identifier'],axis=1,inplace=True)

In [29]:
categorical_features=np.where(train.dtypes=='object')[0]

In [30]:
#here we now split the train data 

from sklearn.model_selection import train_test_split

xtrain,xvalid,ytrain,yvalid=train_test_split(train,sales,test_size=0.25,random_state=11)

In [32]:
#TRAIN THE MODEL USING RMSE-Root Mean Squared Error

model_here=CatBoostRegressor(iterations=100,learning_rate=0.3,depth=6,eval_metric='RMSE',random_seed=7)

model_here.fit(xtrain,ytrain,cat_features=categorical_features,use_best_model=True)

You should provide test set for use best model. use_best_model parameter swiched to false value.


0:	learn: 2133.7136483	total: 153ms	remaining: 15.2s
1:	learn: 1695.0942350	total: 224ms	remaining: 11s
2:	learn: 1436.4124328	total: 286ms	remaining: 9.25s
3:	learn: 1270.3166166	total: 338ms	remaining: 8.11s
4:	learn: 1177.4997780	total: 396ms	remaining: 7.52s
5:	learn: 1126.0729289	total: 452ms	remaining: 7.08s
6:	learn: 1102.1684501	total: 487ms	remaining: 6.47s
7:	learn: 1087.2534720	total: 536ms	remaining: 6.17s
8:	learn: 1079.2502819	total: 581ms	remaining: 5.88s
9:	learn: 1074.6337098	total: 640ms	remaining: 5.76s
10:	learn: 1071.4046295	total: 684ms	remaining: 5.53s
11:	learn: 1069.4368347	total: 745ms	remaining: 5.46s
12:	learn: 1069.1625432	total: 785ms	remaining: 5.25s
13:	learn: 1067.6957846	total: 911ms	remaining: 5.6s
14:	learn: 1067.4406361	total: 970ms	remaining: 5.5s
15:	learn: 1066.9816381	total: 1.02s	remaining: 5.34s
16:	learn: 1064.9973104	total: 1.06s	remaining: 5.2s
17:	learn: 1063.0762860	total: 1.12s	remaining: 5.11s
18:	learn: 1063.0552578	total: 1.15s	remain

<catboost.core.CatBoostRegressor at 0x158f4a22108>

In [33]:
model_here.score(xvalid,yvalid)

1092.201745531876