In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("C:\\Users\\Diya\\Downloads\\Social_Network_Ads.csv")

In [3]:
df.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
X = df.iloc[:, [1, 2, 3]].values 

In [5]:
Y = df.iloc[:, 4].values 

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [7]:
gender_mapping = {"Male": 0, "Female": 1}
X_train[:, 0] = np.vectorize(gender_mapping.get)(X_train[:, 0])
X_test[:, 0] = np.vectorize(gender_mapping.get)(X_test[:, 0])
Y_train = Y_train.astype(np.float64)
Y_test = Y_test.astype(np.float64)

In [8]:
print("Shape of X_train : ", X_train.shape)
print("Shape of Y_train : ", Y_train.shape)
print("Shape of X_test : ", X_test.shape)
print("Shape of Y_test : ", Y_test.shape)

Shape of X_train :  (280, 3)
Shape of Y_train :  (280,)
Shape of X_test :  (120, 3)
Shape of Y_test :  (120,)


In [9]:
X_train= X_train.astype(np.float64)
X_test= X_test.astype(np.float64)
Y_train= Y_train.astype(np.float64)
Y_test= Y_test.astype(np.float64)

In [10]:
def sigmoid(z):
    z_exp = np.exp(np.array(z, dtype=np.float64))
    return 1 / (1 + z_exp)

In [11]:
def gradient_descent(X, y, theta, alpha, iterations):
    m = len(y)
    for _ in range(iterations):
        h = sigmoid(np.dot(X, theta))
        gradient = np.dot(X.T, (h - y)) / m
        theta -= alpha * gradient
    return theta

In [12]:
X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_train_bias= X_train_bias.astype(np.float64)
theta = np.zeros(X_train_bias.shape[1]).astype(np.float64)

In [13]:
alpha = 0.01
iterations = 1000
theta = gradient_descent(X_train_bias, Y_train, theta, alpha, iterations)

In [14]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def evaluate_model(X_train, X_test, Y_train, Y_test):
    models = {
        "Raw Data": LogisticRegression(random_state=42),
        "Normalization": LogisticRegression(random_state=42),
        "Standardization": LogisticRegression(random_state=42),
    }

    scalers = {
        "Raw Data": None,
        "Normalization": MinMaxScaler(),
        "Standardization": StandardScaler(),
    }

    for key in models:
        scaler = scalers[key]
        X_train_scaled = X_train.astype(float) if scaler is None else scaler.fit_transform(X_train)
        X_test_scaled = X_test.astype(float) if scaler is None else scaler.transform(X_test)

        # Logistic Regression from scratch
        h_scratch = sigmoid(np.dot(np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled], theta))
        y_pred_scratch = np.round(h_scratch)
        acc = accuracy_score(Y_test, y_pred_scratch)
        # Logistic Regression using sklearn
        model = models[key]
        model.fit(X_train_scaled, Y_train)
        y_pred_sklearn = model.predict(X_test_scaled)
        acc_sklearn = accuracy_score(Y_test, y_pred_sklearn)

        print(f"{key} Accuracy (from scratch): {acc}")
        print(f"{key} Accuracy (sklearn): {acc_sklearn}")
        print( y_pred_scratch)
        print( y_pred_sklearn)
        print()

In [15]:
evaluate_model(X_train, X_test, Y_train, Y_test)

Raw Data Accuracy (from scratch): 0.39166666666666666
Raw Data Accuracy (sklearn): 0.6083333333333333
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Normalization Accuracy (from scratch): 0.39166666666666666
Normalization Accuracy (sklearn): 0.8416666666666667
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1

  z_exp = np.exp(np.array(z, dtype=np.float64))


In [16]:
#For all three cases, sklearn gives the most accurate predictions
#Standardization and Normalization are important to get accurate results