### Building linear and logistic regression from scratch 

In [1]:
import pandas as pd
import numpy as np 
import scipy.stats as ss
from numpy import linalg as npl
import matplotlib.pyplot as pp
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score,accuracy_score,classification_report, confusion_matrix

from IPython.display import display

In [2]:
churn_df=pd.read_csv('churn_data.csv')#,ignore_index=True)
display(churn_df.head())
internet_df=pd.read_csv('internet_data.csv')
display(internet_df.head())
customer_df=pd.read_csv('customer_data.csv')
display(customer_df.head())

#### Merging the datasets
df1=churn_df.merge(internet_df,how='inner',on='customerID')
final_df=df1.merge(customer_df,how='inner',on='customerID')
print('Merged dataset\n')
display(final_df.head())
display(final_df.info())

bike_df=pd.read_csv('day.csv')
display(bike_df.head())

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Unnamed: 0,customerID,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,7590-VHVEG,No phone service,DSL,No,Yes,No,No,No,No
1,5575-GNVDE,No,DSL,Yes,No,Yes,No,No,No
2,3668-QPYBK,No,DSL,Yes,Yes,No,No,No,No
3,7795-CFOCW,No phone service,DSL,Yes,No,Yes,Yes,No,No
4,9237-HQITU,No,Fiber optic,No,No,No,No,No,No


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents
0,7590-VHVEG,Female,0,Yes,No
1,5575-GNVDE,Male,0,No,No
2,3668-QPYBK,Male,0,No,No
3,7795-CFOCW,Male,0,No,No
4,9237-HQITU,Female,0,No,No


Merged dataset



Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,MultipleLines,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,gender,SeniorCitizen,Partner,Dependents
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,No phone service,...,No,Yes,No,No,No,No,Female,0,Yes,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No,No,...,Yes,No,Yes,No,No,No,Male,0,No,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,No,...,Yes,Yes,No,No,No,No,Male,0,No,No
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,No phone service,...,Yes,No,Yes,Yes,No,No,Male,0,No,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,No,...,No,No,No,No,No,No,Female,0,No,No


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   tenure            7043 non-null   int64  
 2   PhoneService      7043 non-null   object 
 3   Contract          7043 non-null   object 
 4   PaperlessBilling  7043 non-null   object 
 5   PaymentMethod     7043 non-null   object 
 6   MonthlyCharges    7043 non-null   float64
 7   TotalCharges      7043 non-null   object 
 8   Churn             7043 non-null   object 
 9   MultipleLines     7043 non-null   object 
 10  InternetService   7043 non-null   object 
 11  OnlineSecurity    7043 non-null   object 
 12  OnlineBackup      7043 non-null   object 
 13  DeviceProtection  7043 non-null   object 
 14  TechSupport       7043 non-null   object 
 15  StreamingTV       7043 non-null   object 
 16  StreamingMovies   7043 non-null   object 


None

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,01-01-2018,1,0,1,0,6,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
1,2,02-01-2018,1,0,1,0,0,0,2,14.902598,17.68695,69.6087,16.652113,131,670,801
2,3,03-01-2018,1,0,1,0,1,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
3,4,04-01-2018,1,0,1,0,2,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
4,5,05-01-2018,1,0,1,0,3,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [3]:
### Synthesized linear regression data

X,Y=make_regression(n_samples=1000,n_features=10,random_state=100)
X.shape,Y.shape

((1000, 10), (1000,))

#### Normalized Equation to solve linear regression

MSE =  $\frac{1}{m} \Sigma_{i=1}^{m} (y_i-\hat{y_i})^2$

$\hat{y_i}=X\beta$

$\frac{d(MSE)}{d\beta} = \frac{1}{m} (-2X^T Y\beta +2X^T X \beta)$

$\frac{d(MSE)}{d\beta} =0$ 

$\beta = (X^{-1}X) \cdot (X^{T}Y)$




In [4]:
class linear_regression_norm:
    """
    Building a multiple linear regression model from scratch by using Norm equation
    Attributes: X is the depen
    """
    def _init(self,n_iterations=1000):
        self.beta = None
        self.n_iterations=n_iterations
    def train(self,X,Y):
        """
        Linear Regression using norm equation or closed form equation
        Return: betas/ co-efficients
        """
        ### Adding intercepts to X  ###
        X_added_intercept=np.hstack((np.ones((X.shape[0],1)),X))
        
        ### Closed-form solution & norm equation ###
        x_t_x_inv=npl.inv(np.dot(X_added_intercept.T,X_added_intercept))
        x_t_y=np.dot(X_added_intercept.T,Y)
        ### Finding the optimal beta - first order derivative equated to zero ###
        self.beta = np.dot(x_t_x_inv,x_t_y)
        return self.beta
    def predict(self,X):
        ### Test dataset - Adding intercept ###
        X_test_added_int=np.hstack((np.ones((X.shape[0],1)),X))
        ### Predicted value ###
        y_pred=np.dot(X_test_added_int,self.beta)

        return y_pred.flatten()


#### Gradient Descent approach to solve linear regression

MSE = $\frac{1}{n} \Sigma_{i=1}^{n} (y-y_i)^2$

$y_i=X\beta$

MSE= $\frac{1}{n} \Sigma_{i=1}^{n} (y-X\beta)^T (y-X\beta) $

MSE= $\frac{1}{n} \Sigma_{i=1}^{n} (y^T y-2y X\beta + \beta^T X^T X \beta) $

$\frac{d(MSE)}{d\beta} = \frac{1}{n} (-2X^T Y\beta +2X^T X \beta)$

$\frac{d(MSE)}{d\beta} =0$ 

$slope = \frac{-2}{n} X^T(Y-X\beta)$


$\beta_{new}=\beta_{old} - \eta \cdot slope $

In [5]:
class linear_regression_gd:
    """
    Building a multiple linear regression model from scratch by using Norm equation
    Attributes: X is the depen
    """
    def __init__(self,n_learning_rate=0.005,n_iterations=1000):
        self.beta = None
        self.n_learning_rate=n_learning_rate
        self.n_iterations=n_iterations
    def train(self,x,y):
        """
        Linear Regression using norm equation or closed form equation
        
        Return: betas/ co-efficients
        """
        ### Adding intercepts to X  ###
        X_added_intercept=np.hstack((np.ones((x.shape[0],1)),x))
        # print(X_added_intercept.shape)
        ### initializing betas ###
        self.beta=np.ones((x.shape[1]+1,1))
        # print(f'beta: {self.beta.shape}')
        ### Gradient Descent ###
        for i in range(self.n_iterations):
            
            ### y predicted ####
            y_pred=np.dot(X_added_intercept,self.beta)
            ### error ###
            error = y_pred-y.reshape(700,1)
            # print(X_added_intercept.T.shape)
            # print('error_shape:',error.shape)
            gradient = (1*2/x.shape[0])* np.dot(X_added_intercept.T, error)
            # print('gradient:',gradient.shape)
            self.beta -= self.n_learning_rate*gradient
        return self.beta
    def predict(self,x):
        ### Test dataset - Adding intercept ###
        X_test_added_int=np.hstack((np.ones((x.shape[0],1)),x))
        ### Predicted value ###
        y_pred=np.dot(X_test_added_int,self.beta)

        return y_pred.flatten()


In [6]:
def r_squared(y, y_pred):
    y_mean = np.mean(y)
    ss_residual = np.sum((y - y_pred) ** 2)
    ss_total = np.sum((y - y_mean) ** 2)
    return 1 - (ss_residual / ss_total)


### Train &  test spilt 

In [7]:
x_train,x_test,y_train,y_test=train_test_split(X[:,5:-1],Y,train_size=0.7,random_state=0)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((700, 4), (300, 4), (700,), (300,))

In [8]:
### Linear regression using normal equation
model_norm=linear_regression_norm()
model_norm.train(x_train,y_train)
y_pred_norm=model_norm.predict(x_test)
print('R-squared of linear regression using normal equation:',r_squared(y_test,y_pred_norm))

### Linear regression using gradient descent 
model_gd=linear_regression_gd()
model_gd.train(x_train,y_train)
y_pred_gd=model_gd.predict(x_test)
print('R-squared of linear regression using gradient descend:',r_squared(y_test,y_pred_gd))

### Linear regression using sk-learn
model_sk=LinearRegression()
model_sk.fit(x_train,y_train)
y_pred_sk=model_sk.predict(x_test)
print('R-squared of linear regression using SK-learn:',r2_score(y_test,y_pred_sk))


R-squared of linear regression using normal equation: 0.3863391471944524
R-squared of linear regression using gradient descend: 0.386349007172634
R-squared of linear regression using SK-learn: 0.3863391471944525


The linear regression using normal equation and gradient descent build from scratch has same accuracy as sk learn models

Logistic regression from scratch 

Linear regression gets into a sigmoid function to make it a probalistic model

$$
\hat{y} = \sigma(z) = \frac{1}{1 + e^{-z}}
$$

where:

$$
z = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \cdots + \beta_n x_n
$$

The binary cross-entropy loss is defined as:

$$
\text{Loss} = -\frac{1}{N} \sum_{i=1}^N \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right]
$$


Find the optimal coefficients $\beta_{0},\beta_{1}, \cdots,\beta_{n}$ using gradient descend

In [9]:
class logistic_regression:
    """
    Logistic Regression: 
    Input: X is the independent variables and Y is dependent variables
    Retruns: train - fits the model & predict: inference and gives predicted output   
    """
    def __init__(self,n_iterations=1000,learning_rate=0.05):
        self.learning_rate=learning_rate
        self.n_iterations=n_iterations
        
    def sigmoid_function(self,z):
        yhat=1/(1+np.exp(-1*z))
        return yhat
        
    
    def train(self,x,y):
        ### Reshaping y ###
        y=y.reshape(700,1)
        #### Adding intercept ####
        x_added_intercept=np.hstack((np.ones((x.shape[0],1)),x))
        print(f'shape after adding intercept: {x_added_intercept.shape}')
        self.beta=np.ones((x.shape[1]+1,1))
        #### fitting the line ####
        z = np.dot(x_added_intercept,self.beta)
        print(f'shape of fitted y: {x_added_intercept.shape}')
        
        m,n =x.shape
        
        for i in range(self.n_iterations):
            z = np.dot(x_added_intercept,self.beta)
            yhat=self.sigmoid_function(z)
            # print(f'shape of fitted y: {x_added_intercept.shape}')
            loss=1/m * np.sum((y * np.log(yhat)) + (1-y)*np.log(1-yhat))
            ### Slope of binary cross entropy ###
            gradient = 1/m * np.dot(x_added_intercept.T,(yhat-y))
            ### Updating the betas ###
            self.beta -= self.learning_rate *gradient
            # print(self.beta)
        return self.beta
    def predict(self,x, threshold=0.5):
        ### Adding intercept ###
        x_test_int=np.hstack((np.ones((x.shape[0],1)),x))
        ### prediction - converting the linear output to probabilities ####
        y_prob= self.sigmoid_function(np.dot(x_test_int,self.beta))
        ### 
        y_pred_label = np.where(y_prob>threshold,1,0)
        
        return y_prob.flatten(), y_pred_label.flatten()    

In [10]:
Y_label = np.where(Y>150,1,0)
x_train,x_test,y_train,y_test=train_test_split(X,Y_label,train_size=0.7,random_state=0)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((700, 10), (300, 10), (700,), (300,))

In [11]:
### Building logistic regression model from scratch ###
log_model=logistic_regression()
log_model.train(x_train,y_train)
y_pred_proba,y_pred_label_log=log_model.predict(x_test)
print(f'Logistic Regression accuracy_score using model built by scratch:{accuracy_score(y_test,y_pred_label_log):.2f}')
model_lg=LogisticRegression()
model_lg.fit(x_train,y_train)
y_pred_label_sk=model_lg.predict(x_test)
print(f'Logistic Regression accuracy_score using model built by sklearn:{accuracy_score(y_test,y_pred_label_sk)}')


shape after adding intercept: (700, 11)
shape of fitted y: (700, 11)
Logistic Regression accuracy_score using model built by scratch:0.97
Logistic Regression accuracy_score using model built by sklearn:0.98


In [15]:
print('Confusion matrix of logistic regression build from scratch')
confusion_matrix(y_test,y_pred_label_log)

Confusion matrix of logistic regression build from scratch


array([[256,   4],
       [  5,  35]], dtype=int64)

In [16]:
print('Confusion matrix of logistic regression using sk-learn')
confusion_matrix(y_test,y_pred_label_sk)

Confusion matrix of logistic regression using sk-learn


array([[259,   1],
       [  5,  35]], dtype=int64)

The accuracy score and confusion matrices of these two models for the same dataset. Hence, the model that is build is correct!!