# Data Analytics II
1. Implement logistic regression using Python/R to perform classification on Social_Network_Ads.csv dataset.
2. Compute Confusion matrix to find TP, FP, TN, FN, Accuracy, Error rate, Precision, Recall on the given dataset.

In [44]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
# Importing the dataset
social_ads = pd.read_csv("Social_Network_Ads.csv")

In [46]:
# Displaying Head
social_ads.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [47]:
# Checking Datatypes
social_ads.dtypes

User ID             int64
Gender             object
Age                 int64
EstimatedSalary     int64
Purchased           int64
dtype: object

In [48]:
# Checking for NULL values
social_ads.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [49]:
# Create Data into Dependent and Independent Sets
# We consider only the 'Age' and 'Estimated Salary' as columns
X = social_ads.loc[:, "Age":"EstimatedSalary"]
y = social_ads["Purchased"]

In [50]:
X.head()

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000


In [51]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

In [52]:
# Split the data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [73]:
X_train.head()

Unnamed: 0,Age,EstimatedSalary
32,21,16000
25,47,20000
154,40,47000
136,20,82000
223,60,102000


In [74]:
X_test.head()

Unnamed: 0,Age,EstimatedSalary
269,40,61000
101,28,59000
368,38,71000
120,36,75000
142,35,59000


In [55]:
# Normalization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [56]:
X_train_scaled

array([[-1.5967531 , -1.59694844],
       [ 0.85842816, -1.47917395],
       [ 0.19741782, -0.68419616],
       [-1.69118315,  0.3463306 ],
       [ 2.08601879,  0.93520304],
       [ 0.29184787,  0.05189438],
       [ 0.66956806, -1.12585049],
       [ 0.76399811,  0.52299233],
       [-0.27473242,  0.25799974],
       [-1.12460286,  0.28744336],
       [ 0.00855772, -0.44864719],
       [-0.46359252, -0.03643648],
       [-1.21903291,  0.25799974],
       [ 2.08601879, -1.06696324],
       [ 0.38627792, -0.47809081],
       [ 2.08601879,  0.37577422],
       [ 0.00855772, -0.27198546],
       [ 1.5194385 , -0.00699286],
       [-1.12460286,  0.31688698],
       [-1.69118315, -1.39084308],
       [ 0.95285821, -1.18473773],
       [ 0.95285821, -1.03751962],
       [ 1.04728826, -0.91974513],
       [-1.5967531 , -0.0658801 ],
       [-0.36916247,  1.22963925],
       [ 0.95285821, -0.86085789],
       [-0.27473242,  1.11186477],
       [-1.407893  , -0.12476735],
       [ 0.10298777,

In [59]:
len(X_train_scaled)

320

In [60]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=["Age", "EstimatedSalary"])
X_train_scaled.head()

Unnamed: 0,Age,EstimatedSalary
0,-1.596753,-1.596948
1,0.858428,-1.479174
2,0.197418,-0.684196
3,-1.691183,0.346331
4,2.086019,0.935203


In [61]:
X_test_scaled

array([[ 0.19741782, -0.27198546],
       [-0.93574276, -0.3308727 ],
       [ 0.00855772,  0.02245076],
       [-0.18030237,  0.14022525],
       [-0.27473242, -0.3308727 ],
       [ 0.48070797,  1.22963925],
       [-0.46359252, -1.24362497],
       [-0.55802257,  1.37685736],
       [ 0.19741782,  0.02245076],
       [-1.69118315, -0.62530892],
       [-1.31346296, -0.44864719],
       [ 0.66956806, -1.4202867 ],
       [-0.27473242,  0.52299233],
       [ 1.89715869, -0.94918876],
       [-1.03017281,  0.58187957],
       [ 1.04728826, -1.24362497],
       [-1.03017281,  0.40521784],
       [-1.12460286, -1.12585049],
       [-0.93574276, -0.97863238],
       [ 0.10298777,  0.19911249],
       [-1.12460286, -1.62639206],
       [-0.65245262,  0.16966887],
       [-1.12460286,  0.46410509],
       [-0.84131271, -1.24362497],
       [ 0.10298777,  0.14022525],
       [ 0.66956806,  1.78906807],
       [ 0.76399811, -1.12585049],
       [-0.36916247, -1.33195584],
       [-0.93574276,

In [62]:
len(X_test_scaled)

80

In [63]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns=["Age", "EstimatedSalary"])
X_test_scaled.head()

Unnamed: 0,Age,EstimatedSalary
0,0.197418,-0.271985
1,-0.935743,-0.330873
2,0.008558,0.022451
3,-0.180302,0.140225
4,-0.274732,-0.330873


In [65]:
# Import the model
from sklearn.linear_model import LogisticRegression

# Set the model
logisticReg = LogisticRegression()

# Fit the model
logisticReg.fit(X_train_scaled, y_train)

LogisticRegression()

In [66]:
# Predict values using the model
y_predict = logisticReg.predict(X_test_scaled)

In [67]:
# Compute the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_predict)
cm

array([[48,  2],
       [12, 18]])

True Positive: 48 \
False Positive: 5 \
False Negative: 13 \
True Negative: 14

In [68]:
# Accuracy = (TP/TestLength)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_predict)
print("Accuracy Score: ", accuracy)

Accuracy Score:  0.825


In [69]:
# Recall = (TP/(TP+FN))
from sklearn.metrics import recall_score

recall = recall_score(y_test, y_predict, average=None, zero_division=1)
print("Recall Score: ", recall)

Recall Score:  [0.96 0.6 ]


In [70]:
# Precision = (TP/(TP+FP))
from sklearn.metrics import precision_score

precision = precision_score(y_test, y_predict, zero_division=1)
print("Precision Score: ", precision)

Precision Score:  0.9


In [71]:
# Error Rate = 1- accuracy
error_rate = 1-accuracy
print("Error Rate: ", error_rate)

Error Rate:  0.17500000000000004


In [72]:
# F1 Score = 2 * (precision * recall) / (precision + recall)
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_predict, zero_division=1)
print("F1 Score: ", f1)

F1 Score:  0.7200000000000001
