# Predicting whether a user will click on an advertisement using logistic regression.

# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import sklearn

* **Numpy:** Numpy is used to create and manipulate multi-dimensional arrays (matrices).  
* **Pandas:** Pandas is used for data manipulation and analysis.  
* **Sklearn:** Sklearn is a simple and effective tool for machine learning and data analysis. 

# Loading and Cleaning Dataset

In [None]:
# Loading Dataset from CSV
dataset = pd.read_csv("/content/drive/MyDrive/AI and ML Datasets/advertising.csv")

**pd.read_csv():** It is used to read the csv file.

In [None]:
dataset.head()

Unnamed: 0,Time_Spent,Age,Area_Income,Daily_Internet_Usage,Ad_Topic_Line,City,Male,Country,Timestamp,Clicked_on_Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


**Creating New Dataset with only Important and Numerical features**

In [None]:
dataset = dataset[["Time_Spent","Age","Area_Income","Daily_Internet_Usage","Clicked_on_Ad"]]

**Checking for null value**

In [None]:
# Checking for Null value
dataset.notnull().all()

Time_Spent              True
Age                     True
Area_Income             True
Daily_Internet_Usage    True
Clicked_on_Ad           True
dtype: bool

## Creating Features and Target Array

In [None]:
features = dataset.iloc[:,:-1].values

In [None]:
target = dataset.iloc[:,-1].values

In [None]:
features.shape, target.shape

((1000, 4), (1000,))

A shape of (1000, 4) can be interpreted as (samples, features), and a shape of (1000,) can be interpreted as a vector with one entry per sample.

# Dataset Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Splitting Datasets

In [None]:
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=145)

In [None]:
X_train.shape, X_test.shape

((800, 4), (200, 4))

In [None]:
y_train.shape, y_test.shape

((800,), (200,))

## Feature Scaling

In [None]:
scale = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to the test split, so the test data never influences the scaling.
scaled_X_train = scale.fit_transform(X_train)
scaled_X_test = scale.transform(X_test)

In [None]:
scaled_X_train

array([[ 0.80133695, -1.02305448,  0.59086554,  0.74606393],
       [-1.86600016,  0.9308201 , -1.6272745 , -0.32525132],
       [ 1.66156272,  0.24121731, -1.39983041,  0.11444898],
       ...,
       [-0.6808002 , -1.82759107, -0.21560121, -0.18262126],
       [-1.88129306,  0.35615111, -0.28861316, -0.6532096 ],
       [ 0.04752429,  0.35615111,  1.58169619,  0.95387683]])

# Scratch Implementation of Logistic Regression

In layman's terms, logistic regression can be understood as an upgrade to linear regression: the upgrade is the addition of a **Sigmoid Function**. The sigmoid function takes the output of linear regression as its input and outputs a value between 0 and 1.
 
Linear regression: $\text{h}_\theta(x) = \theta^TX$  
Logistic regression: $\text{g}(\text{h}_\theta(x)) = \frac{1}{1+e^{-\text{h}_\theta(x)}}$

**Defining Sigmoid Function :** $\text{g(z)} = \frac{1}{1 + e^{-z}}$  
The sigmoid function squashes its input into the range between 0 and 1: the input can be any real number, but the output always lies between 0 and 1.

In [None]:
def sigmoid(z):
  '''
    Numerically stable logistic function g(z) = 1 / (1 + e^(-z)).

    z : scalar or array-like of real numbers.
    Returns values between 0 and 1 with the same shape as z
    (a NumPy scalar when z is a scalar).
  '''
  z = np.asarray(z)
  # exp() is only ever evaluated on non-positive numbers, so it cannot overflow
  # (the naive form computes exp(-z), which overflows for large negative z).
  e = np.exp(-np.abs(z))
  # z >= 0: 1 / (1 + e^-z)   |   z < 0: e^z / (1 + e^z)   -- algebraically identical
  out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
  return out[()] # unwrap 0-d arrays so scalar input gives a scalar back

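**Defining Overall Cost Function:** $\text{J}(y,\hat{y}) = \frac{1}{m}\sum_{i=1}^{m}\left(-y^{(i)}\log(\hat{y}^{(i)})-(1-y^{(i)})\log(1-\hat{y}^{(i)})\right)$  
Here, $m$ is the total number of data samples, and $y^{(i)}$ and $\hat{y}^{(i)}$ are the true and predicted values for the $i$-th sample.  
We use this cost function (log loss) instead of **MAE** or **MSE** because, once the prediction passes through the sigmoid function, a squared-error cost is no longer convex in $\theta$, whereas log loss is convex, so gradient descent can reliably reach the minimum.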

In [None]:
#cost function
def Overall_Loss(y,y_hat):
  '''
    Binary cross-entropy (log loss) averaged over the samples.

    y     : true labels (0 or 1), 2-D with samples along axis 1, e.g. (1, Samples)
    y_hat : predicted probabilities, same shape as y
    Returns the mean loss as a single number.
  '''
  # log(0) is -inf and 0 * -inf is nan, so keep saturated predictions
  # a hair away from exactly 0 and 1 before taking logs.
  eps = 1e-15
  y_hat = np.clip(y_hat, eps, 1 - eps)
  per_sample = -(y * np.log(y_hat)) - ((1 - y) * np.log(1 - y_hat))
  return (1/y.shape[1]) * np.sum(per_sample)

# np.sum([[1,2,3]]) => 6

**Steps To Calculate Cost:**  
Step1: Predict output  
  * $\hat{y} = \text{g}(\theta^TX) = \frac{1}{1 + e^{-\theta^TX}}$  
  
Step2: Calculate Loss/Cost using target matrix ($y$) and predicted output matrix ($\hat{y}$) 
  * $loss = \text{J}(y,\hat{y})$

In [None]:
def CalculateCost(X, y, theta):
  '''
    Forward pass: predict probabilities and score them against the targets.

    X     : (Features, Samples)
    y     : (1, Samples)
    theta : (Output_feature, Input_features), i.e. already transposed

    Returns (y_hat, loss).
  '''
  linear_output = np.dot(theta, X) # matrix product theta^T X
  probabilities = sigmoid(linear_output)
  return probabilities, Overall_Loss(y, probabilities)

**Calculate Gradient using** :   
$\frac{d}{d\theta}J(y,\hat{y}) = \frac{1}{m}*matmul((\hat{y}-y),X^T)$  
Here, $m$ is the total no of data samples and $matmul$ represents the matrix multiplication.  

$y$ represents the true targets and $\hat{y}$ represents the predicted targets

In [None]:
def CalculatetGradient(X, y, y_hat):
  '''
    Gradient of the averaged cost with respect to theta.

    X, y and y_hat are all laid out as (features, samples).
    Returns (1 / samples) * matmul(y_hat - y, X^T).
  '''
  residual = y_hat - y # prediction error for every sample
  n_samples = X.shape[1]
  return (1 / n_samples) * np.dot(residual, X.T)

**Initializing $\theta$ or Weight matrix:**  
To define $\theta$ we need to know its shape. The shape of $\theta$ is always (input_features, output_features).  
  
In logistic regression there is always exactly 1 output feature, so we can define $\theta$ with shape (input_features, 1). The code below adds one extra row for the bias term $\theta_0$.  

$\hat{y} = \text{g}(\theta^TX)$ (the sigmoid is applied element-wise, so it does not change the shape)  
(1,samples) = (features,1)$^T$.(features,samples)  
(1,samples) = (1,features).(features,samples)  
(1,samples) = (1,samples) # after matrix multiplication
  

In [None]:
def initialize_theta(input_feature):
  '''
    Build the starting weight column: all zeros, one row per input
    feature plus one extra row for theta_0 (the bias).
  '''
  n_rows = input_feature + 1 # e.g. 4 features -> 5 rows
  return np.zeros(shape=(n_rows, 1))

In [None]:
def fit(X, y, theta, lr=0.001, loop=100, log_every=100):
  '''
    Train logistic regression with batch gradient descent.

    X         : (Samples, Features) feature matrix, without a bias column
    y         : (Samples,) vector of targets
    theta     : (1, Features + bias) starting weights # theta is already transposed
    lr        : learning rate (step size of every update)
    loop      : number of gradient-descent updates to perform
    log_every : print and record the loss every `log_every` updates
                (0 or None turns the periodic logging off)

    Returns (loss_plot, theta): the list of recorded losses and the trained
    weights transposed back to shape (Features + bias, 1).
  '''

  # transposing X to match the required shape so that we can do matrix mult. with theta transpose.
  X = X.T # shape after Transpose (features, samples)

  # adding 1's matrix on top of X matrix for theta_0 / bias
  X = np.vstack([np.ones((1,X.shape[1])),X]) # shape after vertical stack: (features + bias, samples)

  # Converting vector to matrix using reshape
  y = y.reshape(1,-1) # shape after reshaping matrix: (1, samples)

  loss_plot = []

  # Performing Gradient descent
  for i in range(loop):
    y_hat, loss = CalculateCost(X, y, theta)
    dw = CalculatetGradient(X, y, y_hat)

    # optimize theta parameter
    theta = theta - (lr * dw)

    # print losses (the loss shown for step i is measured before update i is applied)
    if log_every and i % log_every == 0:
      print(f"loop : {i}, loss: {loss:.6f}")
      loss_plot.append(loss)

  # Re-evaluate with the final weights: the value left over from the loop is
  # one update stale, and does not exist at all when loop == 0.
  _, loss = CalculateCost(X, y, theta)
  print(f"loop : {loop}, loss: {loss:.6f}")
  return loss_plot, theta.T

In [None]:
X_train.shape

(800, 4)

In [None]:
# calling initialize_theta() function
theta = initialize_theta(X_train.shape[1])
theta, theta.shape

(array([[0.],
        [0.],
        [0.],
        [0.],
        [0.]]), (5, 1))

In [None]:
# Calling fit function
# theta is passed transposed, i.e. as (1, 5); fit() returns the trained weights
# transposed back to (5, 1) together with the losses recorded every 100 loops.
loss_plot, theta = fit(scaled_X_train, y_train, theta.T, lr=0.01, loop=2000)

loop : 0, loss: 0.693147
loop : 100, loss: 0.431302
loop : 200, loss: 0.320813
loop : 300, loss: 0.263721
loop : 400, loss: 0.229562
loop : 500, loss: 0.206992
loop : 600, loss: 0.191007
loop : 700, loss: 0.179096
loop : 800, loss: 0.169873
loop : 900, loss: 0.162512
loop : 1000, loss: 0.156494
loop : 1100, loss: 0.151476
loop : 1200, loss: 0.147223
loop : 1300, loss: 0.143569
loop : 1400, loss: 0.140391
loop : 1500, loss: 0.137599
loop : 1600, loss: 0.135125
loop : 1700, loss: 0.132915
loop : 1800, loss: 0.130928
loop : 1900, loss: 0.129130
loop : 2000, loss: 0.127510


In [None]:
theta, theta.shape

(array([[ 0.2340359 ],
        [-1.43059768],
        [ 0.73174127],
        [-0.83978951],
        [-1.58125691]]), (5, 1))

In [None]:
def predict(X, theta, threshold=0.5):
  '''
    Predict a class label (0 or 1) for every sample in X.

    X         : (Samples, Features) feature matrix, without a bias column
    theta     : (Features + bias, 1) weights, in the layout returned by fit()
    threshold : probability at or above which a sample is labelled 1

    Returns an int array of shape (1, Samples).
  '''

  X = X.T # shape after transpose (Features, Samples)
  # row of 1's on top so that theta_0 acts as the bias
  X = np.vstack([np.ones((1,X.shape[1])),X])

  probabilities = sigmoid(np.dot(theta.T,X))

  # boolean mask -> 0/1 labels
  return (probabilities >= threshold).astype(int)

In [None]:
def accuracy(y,y_hat):
  '''
    Percentage (0 to 100) of predictions that match the true labels.

    y     : true labels, any shape
    y_hat : predicted labels, any shape holding the same number of elements

    Returns a float; nan when there are no labels to score.
    Raises ValueError when the two inputs hold a different number of labels.
  '''
  # Flatten first: subtracting an (n, 1) column from a (1, n) row would silently
  # broadcast to an (n, n) matrix and report a meaningless score.
  y = np.ravel(y)
  y_hat = np.ravel(y_hat)
  if y.size != y_hat.size:
    raise ValueError(f"y has {y.size} labels but y_hat has {y_hat.size}")
  if y.size == 0:
    return float("nan") # nothing to score

  # Counting exact matches (rather than averaging |y_hat - y|) also stays
  # correct when the labels are not just 0 and 1.
  return 100.0 * np.count_nonzero(y == y_hat) / y.size

In [None]:
pred = predict(scaled_X_test,theta)
pred.shape

(1, 200)

In [None]:
accuracy(y_test.reshape(1,-1),pred)

96.5

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
confusion_matrix(y_test,pred.reshape(-1))

array([[96,  0],
       [ 7, 97]])

# Implementation with Sklearn

In [None]:
scaled_X_train.shape, y_train.shape

((800, 4), (800,))

In [None]:
scaled_X_test.shape, y_test.shape

((200, 4), (200,))

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(scaled_X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
accuracy(y_test,model.predict(scaled_X_test))

97.0

In [None]:
confusion_matrix(y_test,model.predict(scaled_X_test))

array([[95,  1],
       [ 5, 99]])