In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Loading the dataset
social_data = pd.read_csv('/content/Social_Network_Ads.csv')

In [3]:
social_data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
social_data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [5]:
social_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [6]:
# Checking for null values
social_data.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [7]:
# Dropping the 'User ID' column
social_data = social_data.drop('User ID', axis=1)

In [8]:
social_data

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


In [9]:
# Converting categorical variables into indicator variables(either 0 or 1)
social_data = pd.get_dummies(social_data,dtype=int)

In [10]:
social_data

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,19,19000,0,0,1
1,35,20000,0,0,1
2,26,43000,0,1,0
3,27,57000,0,1,0
4,19,76000,0,0,1
...,...,...,...,...,...
395,46,41000,1,1,0
396,51,23000,1,0,1
397,50,20000,1,1,0
398,36,33000,0,0,1


In [11]:
# Split the data into features (X) and target variable (y)
y=social_data['Purchased']
X=social_data.drop('Purchased', axis=1)

In [12]:
# Importing libraries for Train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

## Finding out which classification model gives the best result

### 1. Logistic Regression

In [13]:
# Importing Confusion Matrix Libraries
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [14]:
# Importing LogisticRegression Libraries
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [15]:
#  Trying the Logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
# Predicting
log_pred = log_reg.predict(X_test)
accuracy_score(log_pred, y_test)

0.65

### 2. K-Nearest Neighbors

In [16]:
# Importing KNeighborsClassifier Libraries
from sklearn.neighbors import KNeighborsClassifier

In [17]:
#  Trying the KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3,metric='minkowski',p=2)
knn.fit(X_train, y_train)
# Predicting
knn_pred = knn.predict(X_test)
accuracy_score(knn_pred, y_test)

0.8

### 3. Support Vector Classifier

In [18]:
# Importing Support Vector Classifier Libraries
from sklearn.svm import SVC

In [19]:
#  Trying the Support Vector Classifier
svc = SVC()
svc.fit(X_train, y_train)
# Predicting
svc_pred = svc.predict(X_test)
accuracy_score(svc_pred, y_test)

0.7375

### 4. Decision Tree Classifier

In [20]:
# Importing Decision Tree Libraries
from sklearn.tree import DecisionTreeClassifier

In [21]:
#  Trying the Decision Tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
# Predicting
dt_pred = dt.predict(X_test)
accuracy_score(dt_pred, y_test)

0.8625

### 5. Random Forest Classifier

In [22]:
# Importing Random Forest Libraries
from sklearn.ensemble import RandomForestClassifier

In [23]:
#  Trying the Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
# Predicting
rf_pred = rf.predict(X_test)
accuracy_score(rf_pred, y_test)

0.9125

### 6. XGBoost Classifier

In [24]:
# Importing XGBoost Classifier Libraries
from xgboost import XGBClassifier

In [25]:
#  Trying the XGBoost Classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
# Predicting
xgb_pred = xgb.predict(X_test)
accuracy_score(xgb_pred, y_test)

0.875

In [26]:
#    Displaying the accuracy scores altogether
print('Accuracy Scores of models',
      '\n Logistic Regression       : ',accuracy_score(log_pred, y_test),
      '\n K-Nearest Neighbors       : ',accuracy_score(knn_pred, y_test),
      '\n Support Vector Classifier : ',accuracy_score(svc_pred, y_test),
      '\n Decision Tree Classifier  : ',accuracy_score(dt_pred, y_test),
      '\n Random Forest Classifier  : ',accuracy_score(rf_pred, y_test),
      '\n XGBoost Classifier        : ',accuracy_score(xgb_pred, y_test))

Accuracy Scores of models 
 Logistic Regression       :  0.65 
 K-Nearest Neighbors       :  0.8 
 Support Vector Classifier :  0.7375 
 Decision Tree Classifier  :  0.8625 
 Random Forest Classifier  :  0.9125 
 XGBoost Classifier        :  0.875


### Inferences : Random forest Classifier gives best accuracy here. So We can select Random Forest based on the inferences

In [27]:
# Fetching the values from the rfmodel corresponding to
#   Age	= 46
#   EstimatedSalary	= 41000
#   Gender_Female	= 1 and
#   Gender_Male = 0
rf.predict([[46,41000,1,0]])



array([1])

In [28]:
import pickle
pickle.dump(rf,open('rfmodel.pkl','wb'))
pickled_model = pickle.load(open('rfmodel.pkl','rb'))

In [29]:
# Fetching the values from the pickled model corresponding to
#   Age	= 46
#   EstimatedSalary	= 41000
#   Gender_Female	= 1 and
#   Gender_Male = 0
pickled_model.predict([[46,41000,1,0]])



array([1])

### Inference : Both the rfmodel and picked model returns the same result. So pickling is done accurately.

In [30]:
# Fetching the values from the pickled model corresponding to
#   Age	= 19
#   EstimatedSalary	= 19000
#   Gender_Female	= 0 and
#   Gender_Male = 1
pickled_model.predict([[19,19000,0,1]])



array([0])

In [31]:
pickled_model