In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [5]:
# Read the ads dataset; the path is relative to the notebook's working directory
DATA_PATH = "Social_Network_Ads.csv"
df = pd.read_csv(DATA_PATH)

In [6]:
# Peek at the first five rows
df.head(n=5)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [7]:
# Peek at the last five rows
df.tail(n=5)

Unnamed: 0,Age,EstimatedSalary,Purchased
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0
399,49,36000,1


#### Data Preprocessing

In [8]:
# Inspect the dtype pandas inferred for each column
df.dtypes

Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


In [10]:
# Dimensions of the frame as (rows, columns)
df.shape

(400, 3)

In [11]:
# Total number of cells in the frame (rows * columns)
df.size

1200

In [12]:
# Count missing values per column
df.isnull().sum()

Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [17]:
# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_mask = df.duplicated()
duplicated_rows = duplicate_mask.sum()
print("Number of duplicated rows:", duplicated_rows)

Number of duplicated rows: 33


In [16]:
# NOTE(review): drop_duplicates() returns a new DataFrame and the result is not
# assigned here, so this cell only displays the de-duplicated rows; `df` itself
# still holds every row. Assign the result if the duplicates are meant to be
# removed before modelling — TODO confirm intent (with no ID column, identical
# rows may be distinct users rather than true duplicates).
df.drop_duplicates()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


#### Logistic Regression

In [35]:
# Split the dataset into features and target variable.
# Use `df`, the frame loaded from the CSV: every column except the last is a
# feature (Age, EstimatedSalary) and the last column (Purchased) is the target.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [36]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Standardize the features: the scaler's statistics are learned from the
# training split only, then the same transform is applied to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [21]:
# Train a logistic-regression classifier on the training split
classifier = LogisticRegression(random_state=42)
classifier.fit(X=X_train, y=y_train)

In [37]:
# Predict class labels for the test set with the fitted classifier.
# X_test must have gone through the same scaling as the training data the
# classifier was fitted on, so run the scaling cell before this one.
y_pred = classifier.predict(X_test)

In [31]:
# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)
# For binary 0/1 labels the row-major flattening is [TN, FP, FN, TP]
TN, FP, FN, TP = cm.ravel()

In [33]:
# Type I error = false positive rate: share of actual negatives predicted positive
actual_negatives = FP + TN
type1_error = FP / actual_negatives

# Type II error = false negative rate: share of actual positives predicted negative
actual_positives = FN + TP
type2_error = FN / actual_positives

In [26]:
# Overall hit rate and its complement
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

# Positive-class metrics, each computed from the same (truth, prediction) pair
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

#### Performance Metrics

In [34]:
# Print the confusion matrix first, then every scalar metric as "label value".
# A leading "\n" in a label starts a new visual group in the output.
print("\nConfusion Matrix:")
print(cm)

report_rows = [
    ("\nTrue Positive (TP):", TP),
    ("True Negative (TN):", TN),
    ("False Positive (FP):", FP),
    ("False Negative (FN):", FN),
    ("\nAccuracy:", accuracy),
    ("Error Rate:", error_rate),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("\nType I error (False Positive Rate):", type1_error),
    ("Type II error (False Negative Rate):", type2_error),
]
for label, value in report_rows:
    print(label, value)


Confusion Matrix:
[[50  2]
 [ 9 19]]

True Positive (TP): 19
True Negative (TN): 50
False Positive (FP): 2
False Negative (FN): 9

Accuracy: 0.8625
Error Rate: 0.13749999999999996
Precision: 0.9047619047619048
Recall: 0.6785714285714286
F1 Score: 0.7755102040816326

Type I error (False Positive Rate): 0.038461538461538464
Type II error (False Negative Rate): 0.32142857142857145
