--------------------------------------------------------------------------------------------------------------------------------------------

# Logistic Regression:

---------------------------------------------------------------------------------------------------------------------------------------------

## The Dataset:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Load the training and test splits from CSV files given by relative path.
training_data_2 = pd.read_csv('data2_train.csv')
testing_data_2 = pd.read_csv('data2_test.csv')
# Show the training frame as the cell output.
training_data_2

Unnamed: 0,Feature_1,Feature_2,Target
0,8.160646,88.799326,0
1,31.149536,102.335826,0
2,13.103383,92.902908,0
3,15.950445,77.412565,0
4,35.856965,94.441550,0
...,...,...,...
795,35.142860,19.722994,1
796,38.306612,36.989602,1
797,28.649138,30.474118,1
798,31.949594,23.900828,1


In [3]:
# Count the rows per class label to see how balanced the two classes are.
training_data_2['Target'].value_counts()

Target
0    419
1    381
Name: count, dtype: int64

- So there are 2 features, and each sample must be classified into one of two classes: 0 or 1.

--------------------------------------------------------------------------------------------------------------------------------------------

## Making and Training the model:

1. Logistic Regression:
* It is an algorithm used to classify things into 2 target values.
* It uses the sigmoid curve to classify data
* Sigmoid function: $\sigma(z) = \dfrac{e^{z}}{1 + e^{z}} = \dfrac{1}{1 + e^{-z}}$, where $z = w_1 x_1 + w_2 x_2 + \dots + w_n x_n + b$, or simply $z = W \cdot X + b$. Here $W$ and $X$ are vectors holding the weights (slopes) and the independent variable corresponding to each weight, and $b$ is the bias (intercept).
* It returns values in the range (0, 1). If the value is above 0.5, the sample is classified as 1; otherwise it is classified as 0.

2. Optimization of the weights and bias using Gradient Descent:
* It uses a cost function and tries to minimize it.
* At each iteration we compute the difference between the sigmoid output $Y'$ at the current step and the real value $Y$.
* $dw = \frac{1}{m} X^{T}(Y' - Y)$, $db = \frac{1}{m} \sum (Y' - Y)$ --> These are the gradient values, where $m$ is the number of training samples
* $w = w - \text{learningRate} \cdot dw$, $b = b - \text{learningRate} \cdot db$ --> This is how the weights and the bias get updated after each iteration

In [4]:
class Logistic_Regression():
    """Binary logistic-regression classifier trained with batch gradient descent.

    The model computes ``sigmoid(X @ w + b)`` and labels a sample 1 when that
    value is strictly greater than 0.5, otherwise 0.

    Attributes set by ``fit``:
        m, n: number of rows and columns of the training matrix.
        w:    weight vector of length ``n``.
        b:    bias (intercept) term.
        X, y: the objects that were passed to ``fit``, stored unchanged.
    """

    def __init__(self, learning_rate=0.01, no_of_iterations=10000):
        """Store the hyper-parameters.

        Args:
            learning_rate: step size applied to each gradient update.
            no_of_iterations: number of gradient-descent updates run by ``fit``.
        """
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    @staticmethod
    def _sigmoid(z):
        """Return ``1 / (1 + exp(-z))`` element-wise without overflowing.

        ``exp(-|z|)`` always lies in (0, 1], so neither branch can overflow.
        For ``z >= 0`` the expression is exactly the textbook formula; for
        ``z < 0`` the algebraically equal form ``e^z / (1 + e^z)`` is used.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from the training data.

        Args:
            X: 2-D array-like (rows are samples, columns are features).
            y: array-like of 0/1 labels, one per row of ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D, has no rows, or its row count
                differs from the number of labels.
        """
        # Work on plain float arrays: arithmetic between pandas objects aligns
        # on index labels, which breaks when X and y carry different indices,
        # and a column-vector y would broadcast to an (m, m) matrix.
        features = np.asarray(X, dtype=float)
        targets = np.asarray(y, dtype=float).ravel()

        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        if features.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if features.shape[0] != targets.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.m, self.n = features.shape
        self.w = np.zeros(self.n)  # weights start at zero
        self.b = 0                 # bias starts at zero
        self.X = X
        self.y = y

        for _ in range(self.no_of_iterations):
            # Difference between the current predictions and the true labels.
            residual = self._sigmoid(features @ self.w + self.b) - targets

            # Gradients with respect to the weights and the bias.
            dw = (features.T @ residual) / self.m
            db = residual.sum() / self.m

            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db

    def predict(self, X):
        """Return a NumPy array of 0/1 labels for the rows of ``X``.

        A row is labelled 1 only when its sigmoid output is strictly above 0.5.
        """
        features = np.asarray(X, dtype=float)
        probabilities = self._sigmoid(features @ self.w + self.b)
        return np.where(probabilities > 0.5, 1, 0)

    def accuracy(self, y_pred, y):
        """Return the fraction of positions where ``y_pred`` equals ``y``.

        Both arguments are flattened to 1-D arrays first, so plain lists are
        compared element by element rather than as whole objects.

        Raises:
            ValueError: if the two inputs hold a different number of labels.
        """
        predicted = np.asarray(y_pred).ravel()
        actual = np.asarray(y).ravel()
        if predicted.shape != actual.shape:
            raise ValueError("y_pred and y must contain the same number of labels")
        return np.mean(predicted == actual)

Here I have created a class `Logistic_Regression` which contains all the functions explained above.

--------------------------------------------------------------------------------------------------------------------------------------------

## Splitting the datasets:

In [5]:
# Training features: every column of the training frame except the label.
X_train = training_data_2.drop('Target', axis=1)
X_train

Unnamed: 0,Feature_1,Feature_2
0,8.160646,88.799326
1,31.149536,102.335826
2,13.103383,92.902908
3,15.950445,77.412565
4,35.856965,94.441550
...,...,...
795,35.142860,19.722994
796,38.306612,36.989602
797,28.649138,30.474118
798,31.949594,23.900828


In [6]:
# Training labels: the 'Target' column on its own.
y_train = training_data_2['Target']
y_train

0      0
1      0
2      0
3      0
4      0
      ..
795    1
796    1
797    1
798    1
799    1
Name: Target, Length: 800, dtype: int64

In [7]:
# Test features: every column of the test frame except the label.
x_test = testing_data_2.drop('Target', axis=1)
x_test

Unnamed: 0,Feature_1,Feature_2
0,48.489576,81.609641
1,26.069706,89.783100
2,31.967447,88.005024
3,44.957613,91.219129
4,27.681870,87.381969
...,...,...
195,41.109517,44.902902
196,52.749410,36.919400
197,26.368150,43.111313
198,29.879813,24.719104


In [8]:
# Test labels: the 'Target' column of the test frame.
y_test = testing_data_2['Target']
y_test

0      0
1      0
2      0
3      0
4      0
      ..
195    1
196    1
197    1
198    1
199    1
Name: Target, Length: 200, dtype: int64

--------------------------------------------------------------------------------------------------------------------------------------------

## Making and Fitting the Model:

In [9]:
# Build the from-scratch classifier with a step size of 0.005 instead of its 0.01 default;
# no_of_iterations is left at its default of 10000.
model = Logistic_Regression(learning_rate=0.005)

In [10]:
# Run gradient descent on the training split; this sets model.w and model.b.
model.fit(X_train,y_train)

In [11]:
# Predict 0/1 labels for the same rows the model was trained on.
y_train_pred = model.predict(X_train)
# NOTE(review): this displays the whole prediction array; slicing it
# (e.g. y_train_pred[:10]) would keep the cell output short.
y_train_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
# Fraction of training rows whose predicted label matches the true label.
accuracy_train = model.accuracy(y_train_pred,y_train)
accuracy_train

0.93375

In [13]:
# Predict 0/1 labels for the held-out test rows.
y_test_pred = model.predict(x_test)

In [14]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_test = model.accuracy(y_test_pred,y_test)
accuracy_test

0.93

- I got a decent accuracy of 93% on the test set here (about 93.4% on the training set).

--------------------------------------------------------------------------------------------------------------------------------------------

## Using scikit-learn:

In [15]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [16]:
# scikit-learn's logistic regression, constructed with no arguments (library defaults).
model_sk = LogisticRegression()

In [17]:
# Fit the scikit-learn model on the same training split used for the from-scratch model.
model_sk.fit(X_train,y_train)

In [18]:
# score() reports the mean accuracy on the given data - here the training split.
model_sk.score(X_train,y_train)

0.98625

In [19]:
# Mean accuracy of the scikit-learn model on the held-out test split.
model_sk.score(x_test,y_test)

0.99

- So, on the test set I got an accuracy of 99% using scikit-learn's `LogisticRegression`, compared to an accuracy of 93% using the model built by me.