In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Read both Black Friday splits: `df` is the test file, `df1` the train file.
df, df1 = (
    pd.read_csv(csv_name)
    for csv_name in ('blackFriday_test.csv', 'blackFriday_train.csv')
)

In [3]:
# Silence every warning category for the rest of the session.
# NOTE: this also hides pandas deprecation and copy warnings raised further down.
import warnings
warnings.simplefilter('ignore')

In [4]:
# Stack the train rows (df1) on top of the test rows (df) in a single frame.
# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat is
# the supported equivalent and, like append, keeps each frame's original row labels.
Df = pd.concat([df1, df])

In [5]:
# Column labels of the combined frame.
Df.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [6]:
# Split the column names by dtype: object ('O') columns are treated as
# categorical, every other dtype as numeric. Column order is preserved.
column_dtypes = Df.dtypes
cat_col = [name for name, kind in column_dtypes.items() if kind == 'O']
num_col = [name for name, kind in column_dtypes.items() if kind != 'O']

In [7]:
# Take independent copies of the numeric and categorical column groups.
# Without .copy(), pandas tracks these selections as derived from Df, and the
# in-place edits applied to df_cat later on raise SettingWithCopyWarning.
df_num = Df[num_col].copy()
df_cat = Df[cat_col].copy()

In [8]:
# Only the categorical columns (df_cat) are needed for feature encoding.
df_cat

Unnamed: 0,Product_ID,Gender,Age,City_Category,Stay_In_Current_City_Years
0,P00069042,F,0-17,A,2
1,P00248942,F,0-17,A,2
2,P00087842,F,0-17,A,2
3,P00085442,F,0-17,A,2
4,P00285442,M,55+,C,4+
...,...,...,...,...,...
233594,P00118942,F,26-35,B,4+
233595,P00254642,F,26-35,B,4+
233596,P00031842,F,26-35,B,4+
233597,P00124742,F,46-50,C,4+


In [9]:
# Inspect the unique values of each categorical column to decide which encoding technique to use.

df_cat['Product_ID'].unique()

array(['P00069042', 'P00248942', 'P00087842', ..., 'P00030342',
       'P00074942', 'P00253842'], dtype=object)

In [10]:
# Distinct values of Gender.
df_cat['Gender'].unique()

array(['F', 'M'], dtype=object)

In [11]:
# Distinct values of Age (age brackets stored as strings).
df_cat['Age'].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [12]:
# Distinct values of City_Category.
df_cat['City_Category'].unique()

array(['A', 'C', 'B'], dtype=object)

In [13]:
# Distinct values of Stay_In_Current_City_Years (strings, including '4+').
df_cat['Stay_In_Current_City_Years'].unique()

array(['2', '4+', '3', '1', '0'], dtype=object)

In [14]:
# From the above, Product_ID has a very large number of unique values and is of no use in deciding the sale, so we will drop it.
# Gender and City_Category are nominal categories, so they will be converted with nominal (one-hot) encoding.
# Age is an ordinal category, so it will be converted with label encoding.
# For Stay_In_Current_City_Years we will simply remove the '+' sign and convert the data type.

In [15]:
# Drop the Product_ID column.
# The result is reassigned instead of using inplace=True, so a frame derived
# from Df is never mutated in place. errors='ignore' lets the cell be re-run
# after the column is already gone.
df_cat = df_cat.drop(columns=['Product_ID'], errors='ignore')

In [16]:
# Check the frame after dropping Product_ID.
df_cat

Unnamed: 0,Gender,Age,City_Category,Stay_In_Current_City_Years
0,F,0-17,A,2
1,F,0-17,A,2
2,F,0-17,A,2
3,F,0-17,A,2
4,M,55+,C,4+
...,...,...,...,...
233594,F,26-35,B,4+
233595,F,26-35,B,4+
233596,F,26-35,B,4+
233597,F,46-50,C,4+


In [17]:
# One-hot (nominal) encode both Gender and City_Category.
# dtype is pinned to uint8 because pandas 2.0 changed the get_dummies default
# from uint8 to bool; pinning keeps the 0/1 indicator columns on every version.
one_hot_encoded_df = pd.get_dummies(
    df_cat,
    columns=['Gender', 'City_Category'],
    dtype=np.uint8,
)
print(one_hot_encoded_df)

          Age Stay_In_Current_City_Years  Gender_F  Gender_M  City_Category_A  \
0        0-17                          2         1         0                1   
1        0-17                          2         1         0                1   
2        0-17                          2         1         0                1   
3        0-17                          2         1         0                1   
4         55+                         4+         0         1                0   
...       ...                        ...       ...       ...              ...   
233594  26-35                         4+         1         0                0   
233595  26-35                         4+         1         0                0   
233596  26-35                         4+         1         0                0   
233597  46-50                         4+         1         0                0   
233598  46-50                         4+         1         0                0   

        City_Category_B  Ci

In [18]:
# Rebind df_cat to the one-hot encoded frame (same object; no copy is made).
df_cat = one_hot_encoded_df

In [19]:
# Encoded frame: Gender and City_Category are now indicator columns.
df_cat

Unnamed: 0,Age,Stay_In_Current_City_Years,Gender_F,Gender_M,City_Category_A,City_Category_B,City_Category_C
0,0-17,2,1,0,1,0,0
1,0-17,2,1,0,1,0,0
2,0-17,2,1,0,1,0,0
3,0-17,2,1,0,1,0,0
4,55+,4+,0,1,0,0,1
...,...,...,...,...,...,...,...
233594,26-35,4+,1,0,0,1,0
233595,26-35,4+,1,0,0,1,0
233596,26-35,4+,1,0,0,1,0
233597,46-50,4+,1,0,0,0,1


In [20]:
# Age brackets present before they are mapped to integer labels.
df_cat['Age'].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [21]:
# Label-encode the ordinal Age column: each bracket gets an integer rank,
# from the youngest bracket (1) to the oldest (7).
# NOTE: Series.map turns any value missing from the dict into NaN, so running
# this cell a second time on the already-mapped column would blank it out.
age_rank = {
    '0-17': 1,
    '18-25': 2,
    '26-35': 3,
    '36-45': 4,
    '46-50': 5,
    '51-55': 6,
    '55+': 7,
}
df_cat['Age'] = df_cat['Age'].map(age_rank)

In [22]:
# Age now holds the integer labels.
df_cat

Unnamed: 0,Age,Stay_In_Current_City_Years,Gender_F,Gender_M,City_Category_A,City_Category_B,City_Category_C
0,1,2,1,0,1,0,0
1,1,2,1,0,1,0,0
2,1,2,1,0,1,0,0
3,1,2,1,0,1,0,0
4,7,4+,0,1,0,0,1
...,...,...,...,...,...,...,...
233594,3,4+,1,0,0,1,0
233595,3,4+,1,0,0,1,0
233596,3,4+,1,0,0,1,0
233597,5,4+,1,0,0,0,1


In [23]:
# Among the unique values of Stay_In_Current_City_Years, only '4+' prevents the
# column from being numeric, so strip the '+' (turning '4+' into '4').
# regex=False treats '+' as a literal character. As a regular expression a bare
# '+' is not a valid pattern, and the default of `regex` differs between
# pandas 1.x and 2.x, so it is set explicitly.
df_cat['Stay_In_Current_City_Years'] = df_cat['Stay_In_Current_City_Years'].str.replace('+', '', regex=False)

In [24]:
# Confirm that no value contains '+' any more.
df_cat.Stay_In_Current_City_Years.unique()

array(['2', '4', '3', '1', '0'], dtype=object)

In [25]:
df_cat

# Every column now holds numeric values, although Stay_In_Current_City_Years is
# still stored as strings (object dtype) at this point.

Unnamed: 0,Age,Stay_In_Current_City_Years,Gender_F,Gender_M,City_Category_A,City_Category_B,City_Category_C
0,1,2,1,0,1,0,0
1,1,2,1,0,1,0,0
2,1,2,1,0,1,0,0
3,1,2,1,0,1,0,0
4,7,4,0,1,0,0,1
...,...,...,...,...,...,...,...
233594,3,4,1,0,0,1,0
233595,3,4,1,0,0,1,0
233596,3,4,1,0,0,1,0
233597,5,4,1,0,0,0,1


In [26]:
# Check the data type of each column.

df_cat.dtypes

Age                            int64
Stay_In_Current_City_Years    object
Gender_F                       uint8
Gender_M                       uint8
City_Category_A                uint8
City_Category_B                uint8
City_Category_C                uint8
dtype: object

In [27]:
# Convert Stay_In_Current_City_Years from strings to integers.
# 'int' is the platform default integer type, so the resulting width may vary
# between systems.
stay_years = df_cat['Stay_In_Current_City_Years']
df_cat['Stay_In_Current_City_Years'] = stay_years.astype('int')

In [28]:
# Confirm the dtypes after the cast.
df_cat.dtypes

Age                           int64
Stay_In_Current_City_Years    int32
Gender_F                      uint8
Gender_M                      uint8
City_Category_A               uint8
City_Category_B               uint8
City_Category_C               uint8
dtype: object

In [29]:
# The data is not spread over a very wide range, so feature scaling will not be performed.

In [31]:
# Numeric columns, untouched by the encoding steps above.
df_num

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,10,0,3,,,8370.0
1,1000001,10,0,1,6.0,14.0,15200.0
2,1000001,10,0,12,,,1422.0
3,1000001,10,0,12,14.0,,1057.0
4,1000002,16,0,8,,,7969.0
...,...,...,...,...,...,...,...
233594,1006036,15,1,8,,,
233595,1006036,15,1,5,8.0,,
233596,1006036,15,1,1,5.0,12.0,
233597,1006037,1,0,10,16.0,,


In [32]:
# Categorical features after encoding.
df_cat

Unnamed: 0,Age,Stay_In_Current_City_Years,Gender_F,Gender_M,City_Category_A,City_Category_B,City_Category_C
0,1,2,1,0,1,0,0
1,1,2,1,0,1,0,0
2,1,2,1,0,1,0,0
3,1,2,1,0,1,0,0
4,7,4,0,1,0,0,1
...,...,...,...,...,...,...,...
233594,3,4,1,0,0,1,0
233595,3,4,1,0,0,1,0
233596,3,4,1,0,0,1,0
233597,5,4,1,0,0,0,1
