In [1]:
import numpy as np 
import pandas as pd
import scipy 
from scipy import stats
import seaborn as sns

# Feature Engineering
* Feature engineering covers a range of techniques, such as selecting relevant features, handling missing data, encoding the data, and normalizing it. It is one of the most crucial tasks and plays a major role in determining the outcome of a model.

In [2]:
# Load the dataset straight from the hosted CSV (needs network access on every run).
# NOTE(review): the URL path suggests this is the training split -- confirm.
sales=pd.read_csv('https://datahack-prod.s3.amazonaws.com/train_file/train_v9rqX0R.csv')

In [3]:
# Display the freshly loaded frame to inspect its columns and a sample of rows.
sales

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [4]:
# List the distinct fat-content labels; the output shows the same two
# categories spelled several ways ('Low Fat', 'low fat', 'LF', 'Regular', 'reg').
sales.Item_Fat_Content.unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [5]:
# This column needs the same clean-up we already applied in the other files: its labels are spelled inconsistently.

In [6]:
# Collapse the alternative spellings into the two canonical labels.
# The result is assigned back to the column rather than using
# `replace(..., inplace=True)` on `sales.Item_Fat_Content`: an in-place method
# on a selected column is chained assignment, which raises a FutureWarning on
# pandas 2.x and leaves `sales` unchanged once Copy-on-Write is active.
sales['Item_Fat_Content'] = sales['Item_Fat_Content'].replace(
    to_replace=['LF', 'low fat', 'reg'],
    value=['Low Fat', 'Low Fat', 'Regular'],
)

In [7]:
# Re-check the distinct labels: only 'Low Fat' and 'Regular' should remain.
sales.Item_Fat_Content.unique()

array(['Low Fat', 'Regular'], dtype=object)

# ID_Item_Cat

In [14]:
# Inspect the raw identifiers; the values shown start with a two-letter
# prefix (FD, DR, NC) followed by further characters.
sales.Item_Identifier

0       FDA15
1       DRC01
2       FDN15
3       FDX07
4       NCD19
        ...  
8518    FDF22
8519    FDS36
8520    NCJ29
8521    FDN46
8522    DRG01
Name: Item_Identifier, Length: 8523, dtype: object

In [15]:
# Create a new column from the first two characters of Item_Identifier

In [16]:
# Keep only the leading two characters of each identifier as a coarse item category.
sales['Id_Item_Cat'] = sales['Item_Identifier'].str.slice(0, 2)

In [17]:
# Check the derived two-letter prefix column.
sales.Id_Item_Cat

0       FD
1       DR
2       FD
3       FD
4       NC
        ..
8518    FD
8519    FD
8520    NC
8521    FD
8522    DR
Name: Id_Item_Cat, Length: 8523, dtype: object

In [18]:
# Items whose identifier prefix is NC need their own Item_Fat_Content category

In [19]:
# Rows with the 'NC' prefix get a dedicated fat-content label, overwriting
# whatever 'Low Fat' / 'Regular' value they carried before.
sales.loc[sales['Id_Item_Cat'].eq('NC'), 'Item_Fat_Content'] = 'Non Edible'

In [20]:
# Create two categories: perishable and non-perishable

In [37]:
# Item_Type values that are treated as perishable; every other type is
# labelled non-perishable by `perished`.
perish = [
    'Dairy',
    'Meat',
    'Fruits and Vegetables',
    'Breakfast',
    'Breads',
    'Starchy Foods',
    'Seafood',
]

In [38]:
def perished(x):
    """Label an item type as 'Perishable' or 'Non Perishable'.

    Parameters
    ----------
    x : object
        A single value, checked for membership in the module-level
        `perish` list.

    Returns
    -------
    str
        'Perishable' when `x` is in `perish`, otherwise 'Non Perishable'.
    """
    return 'Perishable' if x in perish else 'Non Perishable'

In [39]:
# Tag every row as perishable / non-perishable based on its Item_Type.
sales['Item_cat'] = sales['Item_Type'].apply(perished)

In [40]:
# Check the perishable / non-perishable labels just created.
sales.Item_cat

0           Perishable
1       Non Perishable
2           Perishable
3           Perishable
4       Non Perishable
             ...      
8518    Non Perishable
8519    Non Perishable
8520    Non Perishable
8521    Non Perishable
8522    Non Perishable
Name: Item_cat, Length: 8523, dtype: object

In [41]:
# Let's look at the outlet ID:
# bin the outlet IDs into three categories

In [42]:
# Outlet identifiers assigned by hand to the lowest and highest bins;
# any identifier in neither list falls into the average bin.
lowes = ["OUT010", "OUT019"]
high = ["OUT013", "OUT027"]


def outlets(x):
    """Bin an outlet identifier into one of three performance groups.

    Parameters
    ----------
    x : object
        A single outlet identifier, tested for membership in the
        module-level `lowes` and `high` lists.

    Returns
    -------
    str
        "Low_performers" if `x` is in `lowes`, "high_performers" if it is
        in `high`, and "Avg_performers" otherwise.
    """
    # Guard clauses, checked in the same order as the membership lists above.
    if x in lowes:
        return "Low_performers"
    if x in high:
        return "high_performers"
    return "Avg_performers"
    
    
# Attach the performance bin of each row's outlet as a new column.
sales["Out_Cat"] = sales["Outlet_Identifier"].apply(outlets)

In [43]:
# Display the frame with the engineered columns appended
# (Id_Item_Cat, Item_cat, Out_Cat).
sales

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Id_Item_Cat,Item_cat,Out_Cat
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,FD,Perishable,Avg_performers
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,DR,Non Perishable,Avg_performers
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,FD,Perishable,Avg_performers
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,FD,Perishable,Low_performers
4,NCD19,8.930,Non Edible,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,NC,Non Perishable,high_performers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,FD,Non Perishable,high_performers
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,FD,Non Perishable,Avg_performers
8520,NCJ29,10.600,Non Edible,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,NC,Non Perishable,Avg_performers
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,FD,Non Perishable,Avg_performers


# Establishment Year

In [45]:
# The data runs through 2013,
# so we create a new feature, `vintage`: 2013 minus the outlet's establishment year

In [46]:
# Outlet age relative to 2013, the reference year this notebook uses for the data.
sales['vintage'] = 2013 - sales['Outlet_Establishment_Year']

In [47]:
# Display the frame again; `vintage` now appears as the last column.
sales

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Id_Item_Cat,Item_cat,Out_Cat,vintage
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,FD,Perishable,Avg_performers,14
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,DR,Non Perishable,Avg_performers,4
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,FD,Perishable,Avg_performers,14
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,FD,Perishable,Low_performers,15
4,NCD19,8.930,Non Edible,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,NC,Non Perishable,high_performers,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,FD,Non Perishable,high_performers,26
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,FD,Non Perishable,Avg_performers,11
8520,NCJ29,10.600,Non Edible,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,NC,Non Perishable,Avg_performers,9
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,FD,Non Perishable,Avg_performers,4
