In [48]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRFRegressor

# Fixed seed shared by every stochastic estimator so that repeated runs
# (Restart Kernel -> Run All) produce the same scores.
RANDOM_STATE = 42

# Candidate regressors, keyed by the short label printed during evaluation.
models = {
    "LR": LinearRegression(),
    "KNN": KNeighborsRegressor(),
    "DT": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RF": RandomForestRegressor(random_state=RANDOM_STATE),
    "SVM": SVR(),
    # NOTE(review): XGBRFRegressor is XGBoost's random-forest estimator, not the
    # gradient-boosted XGBRegressor that the "XGBR" label suggests - confirm
    # which one was intended.
    "XGBR": XGBRFRegressor(random_state=RANDOM_STATE),
}

In [22]:
# Load the assignment dataset from CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Assignment Datasets/data.csv")
df.head(5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [23]:
# Remove the identifier and residence columns; they are not used as features.
# The result is assigned back (rather than mutated with inplace=True) and
# errors="ignore" skips columns that are already gone, so re-running this cell
# on the already-trimmed frame does not raise a KeyError.
df = df.drop(
    columns=["User_ID", "Product_ID", "Stay_In_Current_City_Years", "City_Category"],
    errors="ignore",
)
df.head()

Unnamed: 0,Gender,Age,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10,0,3,,,8370
1,F,0-17,10,0,1,6.0,14.0,15200
2,F,0-17,10,0,12,,,1422
3,F,0-17,10,0,12,14.0,,1057
4,M,55+,16,0,8,,,7969


In [24]:
# Show the frame's dimensions as (rows, columns).
df.shape

(22894, 8)

In [25]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Gender              22894 non-null  object 
 1   Age                 22894 non-null  object 
 2   Occupation          22894 non-null  int64  
 3   Marital_Status      22894 non-null  int64  
 4   Product_Category_1  22894 non-null  int64  
 5   Product_Category_2  12804 non-null  float64
 6   Product_Category_3  5576 non-null   float64
 7   Purchase            22894 non-null  int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 1.4+ MB


In [26]:
# One-hot encode the two string-valued columns. drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear.
df = pd.get_dummies(
    df,
    columns=["Gender", "Age"],
    drop_first=True,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Occupation          22894 non-null  int64  
 1   Marital_Status      22894 non-null  int64  
 2   Product_Category_1  22894 non-null  int64  
 3   Product_Category_2  12804 non-null  float64
 4   Product_Category_3  5576 non-null   float64
 5   Purchase            22894 non-null  int64  
 6   Gender_M            22894 non-null  uint8  
 7   Age_18-25           22894 non-null  uint8  
 8   Age_26-35           22894 non-null  uint8  
 9   Age_36-45           22894 non-null  uint8  
 10  Age_46-50           22894 non-null  uint8  
 11  Age_51-55           22894 non-null  uint8  
 12  Age_55+             22894 non-null  uint8  
dtypes: float64(2), int64(4), uint8(7)
memory usage: 1.2 MB


In [27]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Occupation                0
Marital_Status            0
Product_Category_1        0
Product_Category_2    10090
Product_Category_3    17318
Purchase                  0
Gender_M                  0
Age_18-25                 0
Age_26-35                 0
Age_36-45                 0
Age_46-50                 0
Age_51-55                 0
Age_55+                   0
dtype: int64

In [38]:
# Fill the gaps in the secondary/tertiary product categories with each column's
# most frequent value. The filled Series is assigned back to the column instead
# of calling fillna(inplace=True) on a column selection: that chained in-place
# form emits a FutureWarning in pandas 2.x and silently leaves df unchanged
# under Copy-on-Write.
for col in ("Product_Category_2", "Product_Category_3"):
    df[col] = df[col].fillna(df[col].mode()[0])
# Confirm that no missing values remain.
df.isnull().sum()

Occupation            0
Marital_Status        0
Product_Category_1    0
Product_Category_2    0
Product_Category_3    0
Purchase              0
Gender_M              0
Age_18-25             0
Age_26-35             0
Age_36-45             0
Age_46-50             0
Age_51-55             0
Age_55+               0
dtype: int64

In [40]:
# Preview the first five rows of the encoded, imputed frame.
df.head(5)

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+
0,10,0,3,8.0,16.0,8370,0,0,0,0,0,0,0
1,10,0,1,6.0,14.0,15200,0,0,0,0,0,0,0
2,10,0,12,8.0,16.0,1422,0,0,0,0,0,0,0
3,10,0,12,14.0,16.0,1057,0,0,0,0,0,0,0
4,16,0,8,8.0,16.0,7969,1,0,0,0,0,0,1


In [43]:
# Target: the purchase amount. Features: every remaining column.
y = df["Purchase"]
x = df.drop(columns=["Purchase"])

In [46]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [47]:
# Standardise the features. The scaler learns its statistics from the training
# split only and then applies that same transform to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)
x_train

array([[ 0.43720326,  1.19799061, -0.61061016, ...,  3.35422661,
        -0.28005602, -0.21594267],
       [-0.32997801,  1.19799061,  1.69923062, ..., -0.29813132,
        -0.28005602, -0.21594267],
       [-0.94372302, -0.83473109, -0.45662078, ..., -0.29813132,
        -0.28005602, -0.21594267],
       ...,
       [-0.17654175,  1.19799061, -0.91858894, ...,  3.35422661,
        -0.28005602, -0.21594267],
       [ 1.35782079, -0.83473109, -1.07257832, ..., -0.29813132,
        -0.28005602, -0.21594267],
       [ 1.81812955, -0.83473109, -0.45662078, ..., -0.29813132,
        -0.28005602, -0.21594267]])

In [49]:
# Fit every candidate regressor and report how well it fits each split.
# For regressors, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed labels say R^2.
for name, model in models.items():
    print(f"Using : {name} Algorithm: ")
    model.fit(x_train, y_train)
    print(f'Train R^2: {model.score(x_train, y_train)}')
    print(f'Test R^2: {model.score(x_test, y_test)}')
    print('-'*50,'\n')

Using : LR Algorithm: 
Train Accuracy: 0.43398900351575564
Test Accuracy: 0.39803583038039814
-------------------------------------------------- 

Using : KNN Algorithm: 
Train Accuracy: 0.7121155644892692
Test Accuracy: 0.5343888924581828
-------------------------------------------------- 

Using : DT Algorithm: 
Train Accuracy: 0.8957093034313139
Test Accuracy: 0.6246800332275801
-------------------------------------------------- 

Using : RF Algorithm: 
Train Accuracy: 0.8821860903986516
Test Accuracy: 0.717645285029278
-------------------------------------------------- 

Using : SVM Algorithm: 
Train Accuracy: 0.055588537051791675
Test Accuracy: 0.04788685051366692
-------------------------------------------------- 

Using : XGBR Algorithm: 
Train Accuracy: 0.6854412480813947
Test Accuracy: 0.6741965672145299
-------------------------------------------------- 

