**Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as mlt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

**Reading CSV file**

In [None]:
df = pd.read_csv('diabetes_prediction_dataset.csv')  # load the dataset; the path is relative to the working directory
df.head()  # preview the first rows

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


**Getting the shape of the dataset** (number of rows and columns in the data)

In [None]:
df.shape  # (number of rows, number of columns)

(100000, 9)

In [None]:
df.info()  # column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


**Data Preprocessing**

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, scaled to percent)
df.isna().mean() * 100

gender                 0.0
age                    0.0
hypertension           0.0
heart_disease          0.0
smoking_history        0.0
bmi                    0.0
HbA1c_level            0.0
blood_glucose_level    0.0
diabetes               0.0
dtype: float64

In [None]:
df.describe()  # summary statistics of the numeric columns

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


**Label Encoding**


*  gender
*  smoking history



In [None]:
# Fit one encoder per column so that each mapping (integer code -> original
# category) is still available after encoding. Re-fitting a single encoder
# would keep only the categories of the column fitted last.
gender_encoder = LabelEncoder()
df['gender'] = gender_encoder.fit_transform(df['gender'])

smoking_encoder = LabelEncoder()
df['smoking_history'] = smoking_encoder.fit_transform(df['smoking_history'])

# Keep the original name bound to the encoder that was fitted last
# (smoking_history), so code that inspects `le` keeps working.
le = smoking_encoder

In [None]:
# `le` was last fitted on smoking_history, so these are the smoking categories;
# a category's position in this array is its integer code in the encoded column.
le.classes_

array(['No Info', 'current', 'ever', 'former', 'never', 'not current'],
      dtype=object)

In [None]:
df.head()  # gender and smoking_history now hold integer codes

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [None]:
df.duplicated().sum()  # number of rows that exactly repeat an earlier row

3854

In [None]:
df = df.drop_duplicates()  # remove the repeated rows counted above; the original index labels are kept

In [None]:
df['diabetes'].value_counts()  # class balance of the target after de-duplication

diabetes
0    87664
1     8482
Name: count, dtype: int64

In [None]:
# Independent variables: every predictor column, in the dataset's column order
X = df.loc[:, ['gender', 'age', 'hypertension', 'heart_disease',
               'smoking_history', 'bmi', 'HbA1c_level',
               'blood_glucose_level']]
# Dependent variable: the binary diabetes label
y = df.loc[:, 'diabetes']

**Balancing data**

*  **Sampling**




In [None]:
from imblearn.over_sampling import RandomOverSampler

# Initialize RandomOverSampler with a fixed seed so the duplicated minority
# rows (and therefore the resampled X, y) are identical on every run
ros = RandomOverSampler(random_state=42)

# Upsample the minority class until both classes have the same number of rows
# NOTE(review): the train/test split further down slices `df`, not this
# resampled X, y, so the balanced data never reaches the model. Confirm
# whether the split should be built from the resampled data instead (and, if
# so, resample only the training partition to avoid duplicated rows leaking
# into the test set).
X, y = ros.fit_resample(X, y)

In [None]:
y.value_counts()  # class counts of the resampled target

diabetes
0    87664
1    87664
Name: count, dtype: int64

In [None]:
# Shuffle all rows before the positional train/test split below. The fixed
# seed makes the shuffle, and every metric derived from the split,
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42)
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
41238,1,27.0,1,0,4,31.16,5.0,160,0
18729,0,71.0,0,0,1,40.03,7.0,300,1
185,1,41.0,0,0,3,23.56,6.5,140,0
28841,0,23.0,0,0,1,28.82,6.6,85,0
20746,0,40.0,0,0,4,27.46,6.2,126,0
...,...,...,...,...,...,...,...,...,...
24317,1,37.0,0,0,4,27.32,3.5,80,0
84432,0,69.0,0,0,3,24.38,4.0,140,0
19771,1,16.0,0,0,0,27.32,6.5,130,0
97842,0,46.0,0,0,4,29.26,6.1,159,0


**Splitting data into train and test**

*   **train = 70%**
*   **test = 30%**



In [None]:
# Fraction of the rows that goes to the training set
ratio = 0.70

# Number of training rows, truncated to a whole row count
total_rows = len(df)
train_size = int(total_rows * ratio)

# Positional split: the first `train_size` rows train, the remainder tests
train = df.iloc[:train_size]
test = df.iloc[train_size:]

In [None]:
# Print the training partition (pandas abbreviates long frames when printing)
print("Train dataframe")
print(train)

# Print the held-out test partition
print("Test dataframe")
print(test)

Train dataframe
       gender   age  hypertension  heart_disease  smoking_history    bmi  \
41238       1  27.0             1              0                4  31.16   
18729       0  71.0             0              0                1  40.03   
185         1  41.0             0              0                3  23.56   
28841       0  23.0             0              0                1  28.82   
20746       0  40.0             0              0                4  27.46   
...       ...   ...           ...            ...              ...    ...   
35837       1  10.0             0              0                4  22.96   
41839       1  68.0             0              0                1  26.08   
55175       0  28.0             0              0                1  27.32   
5893        0  44.0             0              0                4  27.32   
25129       0  24.0             0              0                4  27.30   

       HbA1c_level  blood_glucose_level  diabetes  
41238          5.0 

In [None]:
# The last column holds the target; every column before it is a feature
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

**Logistic Regression**

In [None]:
def sigmoid(z):
    """Numerically stable sigmoid activation function.

    Evaluates 1 / (1 + exp(-z)) element-wise without overflowing for inputs
    of large magnitude (the naive form overflows in exp(-z) when z is a large
    negative number).

    Parameters
    ----------
    z : scalar or array-like of numbers
        Pre-activation value(s).

    Returns
    -------
    numpy.ndarray
        Float values in [0, 1] with the same shape as ``z`` (0-d for a scalar).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in (0, 1], so it can never overflow
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same value,
    # written so that only the non-overflowing exponential is needed
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

def compute_cost(X, y, theta):
    """Compute the mean logistic loss (binary cross-entropy).

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, one row per sample (including any bias column).
    y : array-like, shape (m,)
        Binary labels (0 or 1).
    theta : array-like, shape (n,)
        Model parameters.

    Returns
    -------
    float
        Average of -[y*log(h) + (1-y)*log(1-h)] over the m samples, where
        h = sigmoid(X @ theta).
    """
    m = len(y)
    z = np.dot(X, theta)
    # With h = sigmoid(z), -[y*log(h) + (1-y)*log(1-h)] simplifies to
    # log(1 + e^z) - y*z. logaddexp(0, z) evaluates log(e^0 + e^z) without
    # overflow and never takes log(0), so the cost stays finite even when the
    # sigmoid would saturate to exactly 0 or 1.
    per_sample_loss = np.logaddexp(0, z) - y * z
    return np.sum(per_sample_loss) / m

def compute_gradient(X, y, theta):
    """Return the gradient of the logistic cost with respect to theta.

    X is the design matrix (one row per sample), y the labels and theta the
    current parameters; the result has one entry per column of X.
    """
    n_samples = len(y)
    predicted = sigmoid(np.dot(X, theta))
    # Difference between predicted probability and label, per sample
    residual = predicted - y
    scale = 1 / n_samples
    return scale * np.dot(X.T, residual)

def update_parameters(theta, gradient, learning_rate):
    """Take one gradient-descent step and return the updated parameters.

    Parameters
    ----------
    theta : numpy.ndarray
        Current parameters (weights and bias).
    gradient : numpy.ndarray
        Gradient of the cost with respect to ``theta``.
    learning_rate : float
        Step size.

    Returns
    -------
    numpy.ndarray
        A new array ``theta - learning_rate * gradient``; the ``theta`` that
        was passed in is left unmodified.
    """
    # Out-of-place update so the caller's array is never silently changed
    return theta - learning_rate * gradient


# Backward-compatible alias for the original, misspelled definition name;
# the training loop calls `update_parameters`.
update_pa3rameters = update_parameters

def predict(X, theta):
    """Return the hard class prediction (0.0 or 1.0) for every row of X."""
    probabilities = sigmoid(np.dot(X, theta))
    # np.round rounds halves to even, so a probability of exactly 0.5 maps to 0
    return np.round(probabilities)


# Initialize parameters
theta = np.zeros(X_train.shape[1] + 1)  # one weight per feature plus the bias

# Set hyperparameters
learning_rate = 0.1
iterations = 100

# Add the bias column once: it does not change between iterations, so there
# is no need to rebuild the matrix inside the loop
X_train_with_bias = np.column_stack((np.ones(X_train.shape[0]), X_train))

# NOTE(review): the features are used unscaled, and the recorded output of
# this cell shows repeated warnings pointing at the exp and log lines.
# Standardizing the features is worth trying; the same transform would then
# have to be applied to X_test before predicting.

# Perform batch gradient descent, keeping the cost seen at every iteration
cost_history = []
for _ in range(iterations):
    # Cost at the current parameters (before this iteration's update)
    cost = compute_cost(X_train_with_bias, y_train, theta)
    cost_history.append(cost)
    # Compute gradient
    gradient = compute_gradient(X_train_with_bias, y_train, theta)
    # Update parameters
    theta = update_parameters(theta, gradient, learning_rate)

  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  return 1 / (1 + np.exp(-z))
  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y

In [None]:
# Prepend the bias column of ones so the test matrix lines up with theta
X_test_with_bias = np.column_stack((np.ones(len(X_test)), X_test))
y_pred_test = predict(X_test_with_bias, theta)

# Fraction of test rows whose predicted class equals the true label
accuracy = (y_pred_test == y_test).mean()
print("Testing Accuracy:", accuracy)

Testing Accuracy: 0.9096519206767438


Computing the accuracy on the training data (compare with the testing accuracy above)

In [None]:
# Prepend the bias column of ones so the training matrix lines up with theta
X_train_with_bias = np.column_stack((np.ones(len(X_train)), X_train))
predictions = predict(X_train_with_bias, theta)

# Fraction of training rows whose predicted class equals the true label
accuracy = (predictions == y_train).mean()
print("Training Accuracy:", accuracy)


Training Accuracy: 0.91263261121512


**Correlation coefficient**

In [None]:
import numpy as np

def pearson_correlation_coefficient(X, y):
    """Compute the Pearson correlation coefficient between two variables.

    Parameters
    ----------
    X : array-like of numbers
        Observations of the first variable.
    y : array-like of numbers
        Observations of the second variable, aligned element-wise with ``X``.

    Returns
    -------
    float
        sum(dx * dy) / sqrt(sum(dx**2) * sum(dy**2)), where dx and dy are the
        deviations from the respective means. ``nan`` is returned when the
        coefficient is undefined (empty input, or either variable constant).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # No observations: the coefficient is undefined
    if X.size == 0 or y.size == 0:
        return np.nan

    # Deviation of every observation from its variable's mean
    deviation_X = X - np.mean(X)
    deviation_y = y - np.mean(y)

    # Numerator: sum of co-deviations (the covariance up to a constant factor)
    num = np.sum(deviation_X * deviation_y)
    # Denominator: square root of the product of the summed squared deviations
    # (the product of the standard deviations up to the same constant factor)
    den = np.sqrt(np.sum(deviation_X**2) * np.sum(deviation_y**2))

    # A constant variable has zero spread, which would make this 0 / 0
    if den == 0:
        return np.nan

    return num / den

In [None]:
# Pearson correlation between the label-encoded gender column and the target
# NOTE(review): gender holds integer category codes, so a linear correlation
# on them is only a rough indicator - confirm this is intended.
X = df['gender'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for gender v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.0376130036574501


In [None]:
# Pearson correlation between age and the diabetes label
X = df['age'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for age v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.2649269242360692


In [None]:
# Pearson correlation between the hypertension flag and the diabetes label
X = df['hypertension'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for hypertension v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.19570970179625768


In [None]:
# Pearson correlation between the heart_disease flag and the diabetes label
X = df['heart_disease'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for heart_disease v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.17071089766632166


In [None]:
# Pearson correlation between the label-encoded smoking_history column and the target
# NOTE(review): smoking_history holds integer category codes, so a linear
# correlation on them is only a rough indicator - confirm this is intended.
X = df['smoking_history'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for smoking_history v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.08847085424856206


In [None]:
# Pearson correlation between bmi and the diabetes label
X = df['bmi'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for bmi v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.21493169995887673


In [None]:
# Pearson correlation between HbA1c_level and the diabetes label
X = df['HbA1c_level'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for HbA1c_level v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.406408375829487


In [None]:
# Pearson correlation between blood_glucose_level and the diabetes label
X = df['blood_glucose_level'].values
y = df['diabetes'].values

pearson_coefficient = pearson_correlation_coefficient(X, y)
print("Pearson correlation coefficient for blood_glucose_level v/s diabetes:", pearson_coefficient)

Pearson correlation coefficient for age v/s diabetes: 0.42433583557906734


**Evaluation Metrics**

In [None]:
# Evaluation metrics for the test-set predictions
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(cm)

# Per-class precision, recall and F1-score
cr = classification_report(y_test, y_pred_test)
print("\nClassification Report:")
print(cr)

# Overall accuracy, reported as a percentage
acc = accuracy_score(y_test, y_pred_test)
print("\nAccuracy:", acc * 100)

Confusion Matrix:
[[26238     1]
 [ 2605     0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     26239
           1       0.00      0.00      0.00      2605

    accuracy                           0.91     28844
   macro avg       0.45      0.50      0.48     28844
weighted avg       0.83      0.91      0.87     28844


Accuracy: 90.96519206767438


**Identifying the outliers by using box plot**