In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Logistic Regression

In [115]:
class Logistic_Regression():

    # initiate learning_rate and no_of_iterations
    def __init__(self, lr, no_of_iter):
        self.learning_rate = lr
        self.no_of_iterations = no_of_iter

    # train the model
    def fit(self, X, Y):
        #no of datapoints in the dataset (no of rows) - m
        #no of i/p features in the dataset (no of cols) - n
        self.m, self.n = X.shape

        # initiate w and b value
        self.w = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y

        # implement gradient descent
        for i in range(self.no_of_iterations):
            self.update_weights()

    # gradient descent algo
    def update_weights(self):

        #Y_hat formulae (sigmoid function)
        Y_hat = 1 / (1 + np.exp(-(self.X.dot(self.w))+self.b))  

        #derivatives
        dw= (1/self.m) * np.dot(self.X.T, (Y_hat - self.Y))
        db= (1/self.m) * np.sum(Y_hat - self.Y)

        #updating w and b using gradient descent
        self.w= self.w - self.learning_rate * dw
        self.b= self.b - self.learning_rate * db

    # predict function with probability threshold as 0.5
    def predict(self, X):

        Y_pred = 1 / (1 + np.exp(-(X.dot(self.w))+self.b))

        Y_pred = np.where(Y_pred > 0.5, 1, 0)
        return Y_pred


data collection

In [116]:
data = pd.read_csv("datasets/diabetes2.csv")

In [117]:
X = data.drop(columns='Outcome')
y = data['Outcome']

train test split

In [118]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=42)

feature scaling

In [119]:
""" from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) """

' from sklearn.preprocessing import StandardScaler\nsc = StandardScaler()\n\nX_train = sc.fit_transform(X_train)\nX_test = sc.transform(X_test) '

Train the manually coded Logistic Regression Model

In [120]:
model = Logistic_Regression(lr=0.01, no_of_iter=1000)

model.fit(X_train,y_train)

predict train set results

In [121]:
y_pred_train = model.predict(X_train)

train accuracy

In [122]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_train,y_pred_train)
print(cm)
print("training accuracy:",accuracy_score(y_train, y_pred_train))

[[348   1]
 [187   1]]
training accuracy: 0.6499068901303539


predict test set results

In [123]:
y_pred_test = model.predict(X_test)

test accuracy

In [124]:
from sklearn.metrics import accuracy_score,confusion_matrix
cm = confusion_matrix(y_test, y_pred_test)
print(cm)
print("test accuracy:", accuracy_score(y_test,y_pred_test))

[[150   1]
 [ 79   1]]
test accuracy: 0.6536796536796536


Interpreting coefficients of independent variables

In [125]:
model.w

array([ 0.90501551, -0.13267222, -1.10445312, -0.21994217, -0.34169352,
        0.08540445,  0.0273881 ,  0.13388974])

In [126]:
intercept = model.b
print("intercept:",model.b)

intercept: -0.36485847230164287


In [127]:
independent_variables =["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
coefficients = model.w

coeff_df = pd.DataFrame({'independent variable':independent_variables, 'coeff':coefficients})

coeff_df.loc[len(coeff_df.index)] = ['Intercept',model.b] 

coeff_df

Unnamed: 0,independent variable,coeff
0,Pregnancies,0.905016
1,Glucose,-0.132672
2,BloodPressure,-1.104453
3,SkinThickness,-0.219942
4,Insulin,-0.341694
5,BMI,0.085404
6,DiabetesPedigreeFunction,0.027388
7,Age,0.13389
8,Intercept,-0.364858


**Q.**
In logistic regression, How do the coefficients of independent variables convey us about their contribution to dependent variable ?

**ANS**

**Sign of the Coefficients:** The sign of the coefficient (+ or -) indicates the direction of the relationship between the independent variable and the dependent variable.
- A positive coefficient suggests that an increase in the independent variable is associated with an increase in the log-odds of the dependent variable being in class 1.
- A negative coefficient suggests that an increase in the independent variable is associated with a decrease in the log-odds of the dependent variable being in class 1.

**Magnitude of the Coefficients:** The magnitude of the coefficient indicates the strength of the relationship between the independent variable and the dependent variable. Larger magnitudes suggest a stronger influence on the dependent variable.

**Odds Ratio:** Exponentiating the coefficient gives the odds ratio. For example, if the coefficient for a variable is 0.5, then the odds of the dependent variable being in class 1 increase by a factor of exp(0.5) ≈ 1.65 for a one-unit increase in the independent variable.

**Comparing Coefficients:** You can compare the coefficients of different independent variables to assess their relative importance in predicting the dependent variable. Larger coefficients typically indicate stronger contributions to the prediction.