## Classification 

### predict whether the customer will purchase a product

In [1]:
# pre-requisities

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# import required packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load the datset

df=pd.read_csv('Social_Network_Ads.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


### EDA

In [4]:
# get the general info about the dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
# remove the user ID  as it is an unwanted column

df.drop('User ID',axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


### data cleansing

In [7]:
from sklearn.preprocessing import LabelEncoder

# convert the gender column into a numeric data
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])

In [8]:
# find the correction
df.corr()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
Gender,1.0,-0.073741,-0.060435,-0.042469
Age,-0.073741,1.0,0.155238,0.622454
EstimatedSalary,-0.060435,0.155238,1.0,0.362083
Purchased,-0.042469,0.622454,0.362083,1.0


### Split the data

In [9]:
# create x
x=df.drop(['Gender','Purchased'],axis=1)

# create y
y=df['Purchased']

In [10]:
from sklearn.model_selection import train_test_split

# split the data in training and test set
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=123456)

### Model Building

In [19]:
from sklearn.svm import SVC

# create the model
model_svm = SVC(C=2,kernel ='rbf')

# train the model
model_svm.fit(x_train,y_train)

0,1,2
,C,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


### Find the best hyper parameters for my model

In [None]:
# from sklearn.model_selection import GridSearchCV

# # create hyper parameters
# parameters = {
#     'kernel':['linear','poly','rbf','sigmoid'],
#     'C':[0.5,1]
# }

# # create the grid Search
# grid_search_svm=GridSearchCV(model_svm,parameters)

# grid_search_svm.fit(x_train,y_train)

In [None]:
# check the best parameters
#grid_search_svm.best_params

In [20]:
y_pred = model_svm.predict(x_test)

In [21]:
# evaluate the data
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall=recall_score(y_test,y_pred)
f1=f1_score(y_test,y_pred)

print(confusion_matrix(y_test,y_pred))

print(f"Accuracy = {accuracy}")
print(f"Precision = {precision}")
print(f"Recall = {recall}")
print(f"F1 = {f1}")

[[76  3]
 [20 21]]
Accuracy = 0.8083333333333333
Precision = 0.875
Recall = 0.5121951219512195
F1 = 0.6461538461538462


In [17]:
from sklearn.neighbors import KNeighborsClassifier

# create the model
model_knn = KNeighborsClassifier(n_neighbors=7)

# train the model
model_knn.fit(x_train,y_train)

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


#### find the best values for hyperparameters

In [None]:
# from sklearn.model_selection import GridSearchCV

# #create the list of hyper-parameters along with the possible values
# parameters = {
#    'n_neighbors': range(3, 11),
#    'weights': ['uniform', 'distance']
# }

# #create the grid search
# grid_search_knn = GridSearchCV(model_knn, parameters)

# #fit the values
# grid_search_knn.fit(x_train,y_train)

In [None]:
# find the best combination of hyper-parameters
# grid_search_knn.best_params_

### KNN model evaluation

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model_knn.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"accuracy = {accuracy:.2f}, precision = {precision:.2f}, recall = {recall:.2f}, f1 = {f1:.2f}")

accuracy = 0.86, precision = 0.85, recall = 0.71, f1 = 0.77


In [22]:
# visualize the output