In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [38]:
df = pd.read_csv('Black Friday Sale.csv')

# Data Cleaning and Preprocessing

In [39]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [40]:
df. isna(). sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [41]:
df.shape

(550068, 12)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [43]:
df.describe

<bound method NDFrame.describe of         User_ID Product_ID Gender    Age  Occupation City_Category  \
0       1000001  P00069042      F   0-17          10             A   
1       1000001  P00248942      F   0-17          10             A   
2       1000001  P00087842      F   0-17          10             A   
3       1000001  P00085442      F   0-17          10             A   
4       1000002  P00285442      M    55+          16             C   
...         ...        ...    ...    ...         ...           ...   
550063  1006033  P00372445      M  51-55          13             B   
550064  1006035  P00375436      F  26-35           1             C   
550065  1006036  P00375436      F  26-35          15             B   
550066  1006038  P00375436      F    55+           1             C   
550067  1006039  P00371644      F  46-50           0             B   

       Stay_In_Current_City_Years  Marital_Status  Product_Category_1  \
0                               2               0   

In [44]:
#encode the gender for easier analysis
df['Gender']=df['Gender'].map({'M':0,'F':1})

In [45]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,1,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,1,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,1,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,1,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,0,55+,16,C,4+,0,8,,,7969


In [46]:
df['City_Category'].unique()

array(['A', 'C', 'B'], dtype=object)

In [47]:
#encode the City_Category for easier analysis
df['City_Category']=df['City_Category'].map({'A':0,'B':1,'C':2})

In [48]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,1,0-17,10,0,2,0,3,,,8370
1,1000001,P00248942,1,0-17,10,0,2,0,1,6.0,14.0,15200
2,1000001,P00087842,1,0-17,10,0,2,0,12,,,1422
3,1000001,P00085442,1,0-17,10,0,2,0,12,14.0,,1057
4,1000002,P00285442,0,55+,16,2,4+,0,8,,,7969


In [49]:
df.dtypes

User_ID                         int64
Product_ID                     object
Gender                          int64
Age                            object
Occupation                      int64
City_Category                   int64
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [50]:
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].replace('4+',"4")
df['Stay_In_Current_City_Years']=df['Stay_In_Current_City_Years'].astype(int)
df.dtypes

User_ID                         int64
Product_ID                     object
Gender                          int64
Age                            object
Occupation                      int64
City_Category                   int64
Stay_In_Current_City_Years      int32
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [51]:
df['Age'].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [52]:
from sklearn.preprocessing import OrdinalEncoder

#Since age has an order encode these age ranges in ordinal format
encode= OrdinalEncoder(categories=[['0-17','18-25','26-35','36-45','46-50','51-55','55+']])
encode_reshape=df.Age.values.reshape(-1,1)
df['Age']=encode.fit_transform(encode_reshape)

In [53]:
df['Age']

0         0.0
1         0.0
2         0.0
3         0.0
4         6.0
         ... 
550063    5.0
550064    2.0
550065    2.0
550066    6.0
550067    4.0
Name: Age, Length: 550068, dtype: float64

In [55]:
#array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'], dtype=object)
df['Age'].unique()

array([0., 6., 2., 4., 5., 3., 1.])

In [56]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,1,0.0,10,0,2,0,3,,,8370
1,1000001,P00248942,1,0.0,10,0,2,0,1,6.0,14.0,15200
2,1000001,P00087842,1,0.0,10,0,2,0,12,,,1422
3,1000001,P00085442,1,0.0,10,0,2,0,12,14.0,,1057
4,1000002,P00285442,0,6.0,16,2,4,0,8,,,7969
