In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing 

In [2]:
# Load the input table and print its column summary.
# Presumably this CSV is the output of an earlier missing-value step (judging by the file name) - confirm.
df=pd.read_csv('missing_value_handled.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   StoreType                  1017209 non-null  object 
 2   Assortment                 1017209 non-null  object 
 3   CompetitionDistance        1017209 non-null  float64
 4   CompetitionOpenSinceMonth  1017209 non-null  float64
 5   CompetitionOpenSinceYear   1017209 non-null  float64
 6   Promo2                     1017209 non-null  int64  
 7   Promo2SinceWeek            1017209 non-null  float64
 8   Promo2SinceYear            1017209 non-null  float64
 9   PromoInterval              1017209 non-null  object 
 10  DayOfWeek                  1017209 non-null  int64  
 11  Date                       1017209 non-null  object 
 12  Sales                      1017209 non-null  int64  
 13  Customers   

In [3]:
# Cast the low-cardinality columns to pandas' category dtype.
for column in ['Assortment', 'StoreType', 'PromoInterval',
               'Promo2SinceWeek', 'Promo2SinceYear', 'StateHoliday']:
    df[column] = df[column].astype('category')

# Parse Date as datetime. The unit is spelled out because pandas >= 2.0 rejects
# the unit-less 'datetime64' dtype in astype() with a TypeError.
df['Date'] = df['Date'].astype('datetime64[ns]')

In [4]:
# Re-inspect the column dtypes after the casts in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1017209 non-null  int64         
 1   StoreType                  1017209 non-null  category      
 2   Assortment                 1017209 non-null  category      
 3   CompetitionDistance        1017209 non-null  float64       
 4   CompetitionOpenSinceMonth  1017209 non-null  float64       
 5   CompetitionOpenSinceYear   1017209 non-null  float64       
 6   Promo2                     1017209 non-null  int64         
 7   Promo2SinceWeek            1017209 non-null  category      
 8   Promo2SinceYear            1017209 non-null  category      
 9   PromoInterval              1017209 non-null  category      
 10  DayOfWeek                  1017209 non-null  int64         
 11  Date                       1017209 no

# Lists of Columns by Data Type

In [5]:
# Column names grouped by the kind of variable they hold.
# NOTE(review): the frame also has a 'Promo' column that appears in none of
# these lists - confirm whether it belongs in categorical_binary.

# Continuous / count-like columns.
numerical = [
    'Store',
    'CompetitionDistance',
    'Customers',
]

# Two-valued flags.
categorical_binary = [
    'Open',
    'Promo2',
    'SchoolHoliday',
]

# Unordered categories.
categorical_nominal = [
    'StoreType',
    'CompetitionOpenSinceMonth',
    'Date',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'PromoInterval',
    'StateHoliday',
]

# Categories treated as ordered.
categorical_ordinal = [
    'Assortment',
    'DayOfWeek',
]

# Prediction target.
target = [
    'Sales',
]


# Label Encoding

In [6]:
# Replace Assortment with integer codes (the output below shows 0, 1 and 2).
# The encoder is kept in `label_encoder` because the next cell re-fits the same object for DayOfWeek.
label_encoder = preprocessing.LabelEncoder() 
df['Assortment']= label_encoder.fit_transform(df['Assortment']) 
df['Assortment'].unique()

array([0, 2, 1])

In [7]:
# Re-fit the shared encoder on DayOfWeek; fit_transform replaces whatever it learned from Assortment.
# The output below shows the resulting codes 0-6.
df['DayOfWeek']= label_encoder.fit_transform(df['DayOfWeek']) 
df['DayOfWeek'].unique()

array([4, 3, 2, 1, 0, 6, 5])

# One-Hot Encoding

## Encoding for Binary data 

In [8]:
# Show the distinct values of each binary flag before encoding it.
for col in ('Open', 'Promo2', 'SchoolHoliday'):
    print(df[col].unique())

[1 0]
[0 1]
[1 0]


In [9]:
# One-hot encode the binary flags into <column>_0 / <column>_1 indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce True/False, which also changes
# what gets written by the final to_csv() call.
# NOTE(review): the inputs are already 0/1, so with drop_first=False each pair of
# indicators is redundant with its source column - confirm this is intended.
encoded1 = pd.get_dummies(
    df[['Open', 'Promo2', 'SchoolHoliday']],
    columns=['Open', 'Promo2', 'SchoolHoliday'],
    drop_first=False,
    dtype=np.uint8,
)
encoded1

Unnamed: 0,Open_0,Open_1,Promo2_0,Promo2_1,SchoolHoliday_0,SchoolHoliday_1
0,0,1,1,0,0,1
1,0,1,1,0,0,1
2,0,1,1,0,0,1
3,0,1,1,0,0,1
4,0,1,1,0,0,1
...,...,...,...,...,...,...
1017204,0,1,0,1,0,1
1017205,0,1,0,1,0,1
1017206,0,1,0,1,0,1
1017207,0,1,0,1,0,1


In [10]:
# Append the binary indicator columns to the frame; the source columns are kept.
# Not idempotent: running this cell again appends the same columns a second time.
df=pd.concat([df,encoded1],axis=1)

In [11]:
# Preview two rows to check that the indicator columns were appended.
df.head(2)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,Open,Promo,StateHoliday,SchoolHoliday,Open_0,Open_1,Promo2_0,Promo2_1,SchoolHoliday_0,SchoolHoliday_1
0,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,1,0,1,0,1,1,0,0,1
1,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,1,0,1,0,1,1,0,0,1


## Encoding for nominal data

In [12]:
# Show the distinct labels of each nominal column before encoding it.
for col in ('StoreType', 'PromoInterval', 'StateHoliday'):
    print(df[col].unique())

['c', 'a', 'd', 'b']
Categories (4, object): ['c', 'a', 'd', 'b']
['0', 'Jan,Apr,Jul,Oct', 'Feb,May,Aug,Nov', 'Mar,Jun,Sept,Dec']
Categories (4, object): ['0', 'Jan,Apr,Jul,Oct', 'Feb,May,Aug,Nov', 'Mar,Jun,Sept,Dec']
['0', 'a', 'b', 'c']
Categories (4, object): ['0', 'a', 'b', 'c']


In [13]:
# Swap the commas in the PromoInterval labels for underscores; '0' is left as is.
PromoInterval_mapping = {
    '0': '0',
    'Jan,Apr,Jul,Oct': 'Jan_Apr_Jul_Oct',
    'Feb,May,Aug,Nov': 'Feb_May_Aug_Nov',
    'Mar,Jun,Sept,Dec': 'Mar_Jun_Sept_Dec',
}

df['PromoInterval'] = df['PromoInterval'].map(PromoInterval_mapping)

In [14]:
# Confirm the PromoInterval labels after the relabelling in the previous cell.
print(df['PromoInterval'].unique())

['0', 'Jan_Apr_Jul_Oct', 'Feb_May_Aug_Nov', 'Mar_Jun_Sept_Dec']
Categories (4, object): ['0', 'Jan_Apr_Jul_Oct', 'Feb_May_Aug_Nov', 'Mar_Jun_Sept_Dec']


In [15]:
# One-hot encode the nominal columns into one indicator column per label.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce True/False, which also changes
# what gets written by the final to_csv() call.
encoded2 = pd.get_dummies(
    df[['StoreType', 'PromoInterval', 'StateHoliday']],
    columns=['StoreType', 'PromoInterval', 'StateHoliday'],
    dtype=np.uint8,
)
encoded2

Unnamed: 0,StoreType_a,StoreType_b,StoreType_c,StoreType_d,PromoInterval_0,PromoInterval_Feb_May_Aug_Nov,PromoInterval_Jan_Apr_Jul_Oct,PromoInterval_Mar_Jun_Sept_Dec,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,0,0,1,0,1,0,0,0,1,0,0,0
1,0,0,1,0,1,0,0,0,1,0,0,0
2,0,0,1,0,1,0,0,0,1,0,0,0
3,0,0,1,0,1,0,0,0,1,0,0,0
4,0,0,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1017204,0,0,0,1,0,0,0,1,1,0,0,0
1017205,0,0,0,1,0,0,0,1,1,0,0,0
1017206,0,0,0,1,0,0,0,1,1,0,0,0
1017207,0,0,0,1,0,0,0,1,1,0,0,0


In [16]:
# Append the nominal indicator columns to the frame; the source columns are kept.
# Not idempotent: running this cell again appends the same columns a second time.
df=pd.concat([df,encoded2],axis=1)
df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,StoreType_c,StoreType_d,PromoInterval_0,PromoInterval_Feb_May_Aug_Nov,PromoInterval_Jan_Apr_Jul_Oct,PromoInterval_Mar_Jun_Sept_Dec,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c
0,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,0,1,0,0,0,1,0,0,0
1,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,0,1,0,0,0,1,0,0,0
2,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,0,1,0,0,0,1,0,0,0
3,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,0,1,0,0,0,1,0,0,0
4,1,c,0,1270.0,9.0,2008.0,0,0.0,0.0,0,...,1,0,1,0,0,0,1,0,0,0


In [17]:
# List the labels of the nominal columns again before mapping them to integers.
for col in ('StoreType', 'PromoInterval', 'StateHoliday'):
    print(df[col].unique())

['c', 'a', 'd', 'b']
Categories (4, object): ['c', 'a', 'd', 'b']
['0', 'Jan_Apr_Jul_Oct', 'Feb_May_Aug_Nov', 'Mar_Jun_Sept_Dec']
Categories (4, object): ['0', 'Jan_Apr_Jul_Oct', 'Feb_May_Aug_Nov', 'Mar_Jun_Sept_Dec']
['0', 'a', 'b', 'c']
Categories (4, object): ['0', 'a', 'b', 'c']


In [18]:
# Integer code for every label of the three nominal columns.
StoreType_mapping = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
PromoInterval_mapping = {
    '0': 0,
    'Jan_Apr_Jul_Oct': 1,
    'Feb_May_Aug_Nov': 2,
    'Mar_Jun_Sept_Dec': 3,
}
StateHoliday_mapping = {'0': 0, 'a': 1, 'b': 2, 'c': 3}

# Overwrite each column with its integer codes, in the same order as before.
for column, codes in (
    ('StoreType', StoreType_mapping),
    ('PromoInterval', PromoInterval_mapping),
    ('StateHoliday', StateHoliday_mapping),
):
    df[column] = df[column].map(codes)

# Convert/Manage Date column

In [19]:
def extend_date_feature(my_df):
    """Add calendar features derived from the ``Date`` column.

    The frame is modified in place and also returned, so callers that ignore
    the return value still see the new columns. ``Date`` itself is kept.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame with a ``Date`` column that can be cast to ``datetime64[ns]``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``my_df``, with ``Date`` cast to datetime and the
        columns ``date_year``, ``date_weekofyear`` (ISO week number),
        ``date_month``, ``date_dayofweek`` (Monday = 0) and ``date_day`` added.
    """
    my_df['Date'] = my_df['Date'].astype('datetime64[ns]')

    # The vectorized .dt accessor replaces one Python-level lambda call per row
    # and per feature, which matters on frames with around a million rows.
    dates = my_df['Date'].dt

    my_df['date_year'] = dates.year

    # isocalendar() returns the nullable UInt32 dtype; convert to a plain NumPy
    # dtype like the other features (float only if missing dates produce NA).
    iso_week = dates.isocalendar().week
    my_df['date_weekofyear'] = iso_week.astype(
        'float64' if iso_week.isna().any() else 'int64'
    )

    my_df['date_month'] = dates.month
    my_df['date_dayofweek'] = dates.dayofweek
    my_df['date_day'] = dates.day
    return my_df

# extend_date_feature modifies df in place and returns it, so `f` and `df` refer to the same frame.
f=extend_date_feature(df)

In [20]:
# Summarise the frame after encoding and date expansion (the output below reports 41 columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 41 columns):
 #   Column                          Non-Null Count    Dtype         
---  ------                          --------------    -----         
 0   Store                           1017209 non-null  int64         
 1   StoreType                       1017209 non-null  category      
 2   Assortment                      1017209 non-null  int64         
 3   CompetitionDistance             1017209 non-null  float64       
 4   CompetitionOpenSinceMonth       1017209 non-null  float64       
 5   CompetitionOpenSinceYear        1017209 non-null  float64       
 6   Promo2                          1017209 non-null  int64         
 7   Promo2SinceWeek                 1017209 non-null  category      
 8   Promo2SinceYear                 1017209 non-null  category      
 9   PromoInterval                   1017209 non-null  category      
 10  DayOfWeek                       1017209 no

In [21]:
# Write the engineered frame to CSV without the index column.
# NOTE(review): the file name is spelled 'engneering'; left unchanged because other code may read this exact path - confirm before renaming.
df.to_csv('feature_engneering.csv',index=False)