## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Loading the CSV File

In [None]:
# Read 'diabetes.csv' (path is relative to the current working directory) into a DataFrame
df = pd.read_csv('diabetes.csv')

## EDA and Data Cleaning

In [None]:
# Data preview: show the first rows of the dataset (head() defaults to 5 rows)
df.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Column labels of the dataset
df.columns

In [None]:
# Column-wise summary of the dataset: dtype and non-null count of every column
df.info()

In [None]:
# Number of null (NaN) values in each column
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Same summary statistics, transposed so that each dataset column becomes a row
df.describe().T

In [None]:
# Correlation heatmap: pairwise correlation between the columns,
# with every coefficient written inside its cell
cor = df.corr()
sns.heatmap(data=cor, annot=True, cmap="crest")
plt.show()

In [None]:
# Per-column means for columns 1-5, used to impute the zero readings in those columns.
# A zero in these columns is treated as a missing value (it is what gets replaced),
# so zeros are masked out before averaging; including them would drag every
# imputed value towards zero.
# NOTE(review): the columns are picked by position — presumably the measurement
# columns where 0 is not a valid reading; verify against the CSV's column order.
means = df.iloc[:, 1:6].replace(0, np.nan).mean()   # mean() skips NaN by default
means

In [None]:
# Columns 1-5 should not contain zeros: swap every 0 for that column's mean
nonzeros = df.columns[1:6].tolist()
for column in nonzeros:
    df[column] = df[column].replace(to_replace=0, value=means[column])

In [None]:
# Reduced dataset: four predictor columns plus the 'Outcome' target
df_improve = df.loc[:, ['Glucose', 'BMI', 'Age', 'Insulin', 'Outcome']]

In [None]:
# Preview of the reduced dataset (df_improve) after data cleaning
df_improve.head()

In [None]:
# Pairwise correlation between the columns of the reduced dataset
df_improve.corr()


In [None]:
# Class balance: how many rows carry each distinct value of 'Outcome'
outcome_counts = df.loc[:, 'Outcome'].value_counts()
print(outcome_counts)

## Data Visualisation

In [None]:
# 2x3 grid of histograms (with KDE overlay), one per numerical feature,
# each split by the 'Outcome' class
sns.set_context('notebook', font_scale=1.2)
fig, ax = plt.subplots(2, 3, figsize=(20, 10))

plt.suptitle('Distribution of various Numerical Features based on target variable', fontsize=20)

# (column, x-axis label, palette) for each panel, listed in row-major grid order
panels = (
    ('Age', 'Age', 'magma'),
    ('BloodPressure', 'Blood Pressure', 'viridis'),
    ('Glucose', 'Glucose level', 'magma'),
    ('Insulin', 'Insulin', 'viridis'),
    ('BMI', 'Body Mass Index', 'magma'),
    ('DiabetesPedigreeFunction', 'Diabetes Likelihood Based on Family History', 'viridis'),
)

drawn_axes = []
for panel_axes, (feature, label, colours) in zip(ax.flat, panels):
    drawn = sns.histplot(x=feature, data=df, hue='Outcome', kde=True, ax=panel_axes, palette=colours)
    drawn.set(xlabel=label)
    drawn_axes.append(drawn)

# Keep the per-panel handles available under their original names
ax1, ax2, ax3, ax4, ax5, ax6 = drawn_axes

plt.show()

In [None]:
# Pairplot: pairwise relationships between the columns, coloured by 'Outcome'
sns.pairplot(df,hue='Outcome')

In [None]:
# Count the number of occurrences of each unique value in the 'Outcome' column
# (same computation as in the EDA section; the pie chart below reads this variable)
outcome_counts = df['Outcome'].value_counts()

In [None]:
# Create a pie chart of the class balance.
# value_counts() orders its result by frequency, while the labels below are
# positional. Sorting by class value first pins the wedge order to 0, 1, so the
# labels cannot silently swap if class 1 ever becomes the majority.
# NOTE(review): assumes Outcome 0 = 'Fit' and 1 = 'Diabetic' — confirm against the dataset docs.
plt.figure(figsize=(6, 6))
plt.pie(outcome_counts.sort_index(), labels=['Fit', 'Diabetic'], autopct='%1.1f%%', startangle=140)
plt.title('Outcome Count by Fit and Diabetic')
plt.show()



In [None]:
# Bar chart with the number of rows in each 'Outcome' class
sns.countplot(data=df, x='Outcome')
plt.show()

## Model Building

In [None]:
# Feature matrix (Glucose, Insulin, BMI, Age) and target vector (Outcome) as NumPy arrays
X = df_improve[['Glucose', 'Insulin', 'BMI', 'Age']].to_numpy()
y = df_improve['Outcome'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [None]:
# Shape of every split, printed in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

## Scaling

In [None]:
# Standardising the dataset (z-score per feature).
# Both statistics are taken from the training split only and computed
# column-wise (axis=0): borrowing the std from the test split leaks test
# information into preprocessing, and a single scalar mean/std over all
# columns does not put the individual features on a common scale.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
std = np.where(std == 0, 1.0, std)   # constant feature: avoid dividing by zero

# The test split is transformed with the training statistics, never its own.
X_train = (X_train - mean) / std
X_train = np.c_[np.ones(X_train.shape[0]), X_train]   # prepend a column of ones
X_test = (X_test - mean) / std
X_test = np.c_[np.ones(X_test.shape[0]), X_test]      # prepend a column of ones

## Logistic Regression

In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    no_of_iterations : int
        Number of gradient-descent steps performed by ``fit``.

    Attributes
    ----------
    w : numpy.ndarray
        Learned weights, one per input column (set by ``fit``).
    b : float
        Learned bias term (set by ``fit``).
    costs : list
        Mean cross-entropy cost of each iteration of the most recent ``fit``.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logs so a
    # saturated sigmoid cannot turn the cost into inf/NaN.
    _EPS = 1e-15
    # float64 exp() overflows a little above 709; clipping the logit well below
    # that keeps the sigmoid free of overflow warnings.
    _MAX_LOGIT = 500.0

    def __init__(self, learning_rate = 0.01, no_of_iterations = 1000):
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations
        self.costs = []

    # Activation function
    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)), element-wise."""
        z = np.clip(z, -self._MAX_LOGIT, self._MAX_LOGIT)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from the training data.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Training features.
        y : array-like of m binary labels (0 or 1)
            A column vector is flattened, so it cannot broadcast against the
            predictions into an (m, m) matrix.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        self.X = np.asarray(X, dtype=float)
        self.Y = np.asarray(y, dtype=float).ravel()
        if self.X.shape[0] != self.Y.shape[0]:
            raise ValueError(
                "X and y must have the same number of samples, got "
                f"{self.X.shape[0]} and {self.Y.shape[0]}"
            )
        self.m = self.X.shape[0]    # Number of rows in the data
        self.n = self.X.shape[1]    # Number of columns in the data
        self.w = np.zeros(self.n)   # Initializing weights
        self.b = 0                  # Initializing bias
        self.costs = []             # Start a fresh history on every fit

        for _ in range(self.no_of_iterations):
            # Prediction
            y_hat = self.sigmoid(self.X.dot(self.w) + self.b)

            # Cost uses clipped probabilities; the gradient uses the raw ones
            p = np.clip(y_hat, self._EPS, 1 - self._EPS)
            cost = (-self.Y * np.log(p) - (1 - self.Y) * np.log(1 - p)).mean()
            self.costs.append(cost)

            # Gradients of the mean cross-entropy with respect to w and b
            error = y_hat - self.Y
            dw = (1 / self.m) * np.dot(self.X.T, error)
            db = (1 / self.m) * np.sum(error)

            # Updating parameters
            self.w = self.w - self.learning_rate * dw
            self.b = self.b - self.learning_rate * db

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self.sigmoid(np.asarray(X, dtype=float).dot(self.w) + self.b)

    def predict(self, X, threshold = 0.4):
        """Return 0/1 class labels for every row of ``X``.

        A row is labelled 1 when its predicted probability is strictly greater
        than ``threshold``. The default of 0.4 is the cut-off this class has
        always used.
        """
        return np.where(self.predict_proba(X) > threshold, 1, 0)

In [None]:
# Train the LogisticRegression class defined above on the scaled training data:
# 200 gradient-descent iterations with a learning rate of 0.1
log_reg = LogisticRegression(learning_rate= 0.1, no_of_iterations= 200)
log_reg.fit(X_train, y_train)


In [None]:
# Predicted class labels (0/1) for the training and the test split
train_pred = log_reg.predict(X_train)
test_pred = log_reg.predict(X_test)

In [None]:
# Display the predicted labels for the training split
train_pred

In [None]:
# Display the predicted labels for the test split
test_pred

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def evaluate_model_performance(y_train, train_pred, y_test, test_pred):
    """Print accuracy, precision, recall and F1 for the train and test splits.

    Each metric is evaluated on (y_train, train_pred) and then on
    (y_test, test_pred). All eight scores are computed before anything is
    printed, and they are printed train-first, one per line. Returns None.
    """
    # (train heading, test heading, scoring function), in output order
    scorers = (
        ("Train Accuracy:", "Test Accuracy:", accuracy_score),
        ("Train Precision:", "Test Precision:", precision_score),
        ("Train Recall:", "Test Recall:", recall_score),
        ("Train F1 Score:", "Test F1 Score:", f1_score),
    )

    report = []
    for train_heading, test_heading, scorer in scorers:
        report.append((train_heading, scorer(y_train, train_pred)))
        report.append((test_heading, scorer(y_test, test_pred)))

    for heading, score in report:
        print(heading, score)


In [None]:
# Print accuracy, precision, recall and F1 for the training and test predictions
evaluate_model_performance(y_train, train_pred, y_test, test_pred)

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix of the test-set predictions (rows: actual class, columns: predicted class)
cfn_matrix = metrics.confusion_matrix(y_test, test_pred)
print(cfn_matrix)

In [None]:
# Recompute the confusion matrix for the test-set predictions
cfn_matrix = metrics.confusion_matrix(y_test, test_pred)

# Draw it as an annotated heatmap; fmt='g' prints the counts in general number format
cm_axes = sns.heatmap(data=pd.DataFrame(cfn_matrix), cmap="YlGnBu", annot=True, fmt='g')
cm_axes.set_title('Confusion Matrix', y=1.1)
cm_axes.set_ylabel('Actual Label')
cm_axes.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Classification report: per-class precision, recall, F1-score and support on the test split
from sklearn.metrics import classification_report

print(classification_report(y_test,test_pred))