In [11]:
import pandas as pd
import os
import category_encoders as ce

In [2]:
os.chdir("../")

In [8]:
df = pd.read_csv("data/raw/train.csv")

### Item Identifier 

In [10]:
df['Item_Identifier']

0       FDA15
1       DRC01
2       FDN15
3       FDX07
4       NCD19
        ...  
8518    FDF22
8519    FDS36
8520    NCJ29
8521    FDN46
8522    DRG01
Name: Item_Identifier, Length: 8523, dtype: object

In [12]:
encoder = ce.TargetEncoder(cols=['Item_Identifier'])
df['Item_Identifier_encoded'] = encoder.fit_transform(df['Item_Identifier'], df['Item_Outlet_Sales'])

In [14]:
df['Item_Identifier_encoded'] = df['Item_Identifier_encoded'].astype('int')

In [15]:
df[['Item_Identifier', 'Item_Identifier_encoded']]

Unnamed: 0,Item_Identifier,Item_Identifier_encoded
0,FDA15,2879
1,DRC01,1920
2,FDN15,2030
3,FDX07,2347
4,NCD19,1879
...,...,...
8518,FDF22,2424
8519,FDS36,2271
8520,NCJ29,1991
8521,FDN46,2158


### Item Type

In [16]:
src_column = 'Item_Type'
target_column = f'{src_column}_encoded'
encoder = ce.TargetEncoder(cols=[src_column])
df[target_column] = encoder.fit_transform(df[src_column], df['Item_Outlet_Sales'])
df[target_column] = df[target_column].astype('int')
df[[src_column, target_column]]

Unnamed: 0,Item_Type,Item_Type_encoded
0,Dairy,2232
1,Soft Drinks,2006
2,Meat,2158
3,Fruits and Vegetables,2289
4,Household,2258
...,...,...
8518,Snack Foods,2277
8519,Baking Goods,1952
8520,Health and Hygiene,2010
8521,Snack Foods,2277


### Item_Fat_Content

In [21]:
src_column = 'Item_Fat_Content'
target_column = f'{src_column}_encoded'
df[target_column] = df[src_column].map({
    'Low Fat': "LF",
    'Regular': "REG",
    'reg': "REG",
    'low fat': "LF"
})

df = pd.get_dummies(df, columns=[target_column], dtype=int)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Identifier_encoded,Item_Type_encoded,Item_Fat_Content_encoded_LF,Item_Fat_Content_encoded_REG,Item_Fat_Content_encoded_LF.1,Item_Fat_Content_encoded_REG.1
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2879,2232,True,False,1,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,1920,2006,False,True,0,1
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,2030,2158,True,False,1,0
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,2347,2289,False,True,0,1
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,1879,2258,True,False,1,0


### Outlet_Location_Type

In [22]:
src_column = 'Outlet_Location_Type'
df = pd.get_dummies(df, columns=[src_column], dtype=int)
df.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Type,Item_Outlet_Sales,Item_Identifier_encoded,Item_Type_encoded,Item_Fat_Content_encoded_LF,Item_Fat_Content_encoded_REG,Item_Fat_Content_encoded_LF.1,Item_Fat_Content_encoded_REG.1,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Supermarket Type1,3735.138,2879,2232,True,False,1,0,1,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Supermarket Type2,443.4228,1920,2006,False,True,0,1,0,0,1


### Outlet_Size

In [23]:
src_column = 'Outlet_Size'
df = pd.get_dummies(df, columns=[src_column], dtype=int)
df.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Type,Item_Outlet_Sales,...,Item_Fat_Content_encoded_LF,Item_Fat_Content_encoded_REG,Item_Fat_Content_encoded_LF.1,Item_Fat_Content_encoded_REG.1,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Supermarket Type1,3735.138,...,True,False,1,0,1,0,0,0,1,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Supermarket Type2,443.4228,...,False,True,0,1,0,0,1,0,1,0


### Outlet_Type

In [24]:
src_column = 'Outlet_Type'
df = pd.get_dummies(df, columns=[src_column], dtype=int)
df.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Identifier_encoded,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,3735.138,2879,...,1,0,0,0,1,0,0,1,0,0
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,443.4228,1920,...,0,0,1,0,1,0,0,0,1,0
