In [3]:
import pandas as pd
import numpy as np
import random

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Number of rows
num_rows = 75000

# Generate raw data
data = {
    "Patient_ID": ["ID" + str(i).zfill(6) for i in range(1, num_rows + 1)],
    "Age": np.random.randint(18, 80, num_rows),
    "Gender": np.random.choice(["Male", "Female", "  Male  ", "  Female  "], num_rows),
    "BMI": np.random.uniform(18.5, 40, num_rows).round(2),
    "Glucose_Level": np.random.randint(70, 200, num_rows),
    "Blood_Pressure": np.random.randint(60, 180, num_rows),
    "Insulin": np.random.randint(15, 300, num_rows),
    "Diabetes_Pedigree_Function": np.random.uniform(0.1, 2.5, num_rows).round(2),
    "Pregnancies": np.random.randint(0, 15, num_rows),
    "Outcome": np.random.choice([0, 1], num_rows, p=[0.65, 0.35]),  # 0 = No Diabetes, 1 = Diabetes
}

# Convert to DataFrame
raw_diabetes_data = pd.DataFrame(data)

# Introduce missing values
for col in ["BMI", "Glucose_Level", "Blood_Pressure", "Insulin"]:
    raw_diabetes_data.loc[
        np.random.choice(raw_diabetes_data.index, size=5000, replace=False), col
    ] = np.nan

# Introduce duplicates
raw_diabetes_data = pd.concat([raw_diabetes_data, raw_diabetes_data.sample(2000)])

# Add leading and trailing spaces to categorical columns
raw_diabetes_data["Gender"] = raw_diabetes_data["Gender"].astype(str)

# Save the raw dataset
raw_diabetes_data.to_csv("raw_diabetes_data.csv", index=False)
print("Raw diabetes dataset created with 75,000+ rows.")


Raw diabetes dataset created with 75,000+ rows.


# Step 1: Load and Inspect the Dataset

In [5]:
import pandas as pd

# Load the raw dataset
df = pd.read_csv("raw_diabetes_data.csv")

# Display the first 10 rows
print("First 10 rows of the dataset:")
df.head(10)

First 10 rows of the dataset:


Unnamed: 0,Patient_ID,Age,Gender,BMI,Glucose_Level,Blood_Pressure,Insulin,Diabetes_Pedigree_Function,Pregnancies,Outcome
0,ID000001,56,Male,,119.0,166.0,85.0,2.01,12,0
1,ID000002,69,Male,28.7,100.0,124.0,260.0,2.24,10,0
2,ID000003,46,Female,27.47,170.0,141.0,238.0,0.94,0,0
3,ID000004,32,Female,27.26,130.0,,182.0,0.32,0,0
4,ID000005,60,Female,32.11,78.0,167.0,63.0,2.48,11,1
5,ID000006,25,Male,37.8,106.0,172.0,80.0,2.25,2,1
6,ID000007,78,Male,32.48,106.0,170.0,85.0,1.84,11,0
7,ID000008,38,Female,24.18,126.0,143.0,212.0,2.32,9,0
8,ID000009,56,Female,,143.0,157.0,201.0,0.82,0,0
9,ID000010,75,Female,39.53,145.0,81.0,90.0,2.49,9,1


In [3]:
# Check the shape of the dataset
print("Number of rows and columns:")
print(df.shape)

Number of rows and columns:
(77000, 10)


In [17]:
# Check for missing values in each column
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
Patient_ID                    0
Age                           0
Gender                        0
BMI                           0
Glucose_Level                 0
Blood_Pressure                0
Insulin                       0
Diabetes_Pedigree_Function    0
Pregnancies                   0
Outcome                       0
Age_Range                     0
dtype: int64


In [16]:
# Get basic information about the dataset
print("Dataset information:")
print(df.info())

Dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 75000 entries, 0 to 74999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   Patient_ID                  75000 non-null  object  
 1   Age                         75000 non-null  int64   
 2   Gender                      75000 non-null  object  
 3   BMI                         75000 non-null  float64 
 4   Glucose_Level               75000 non-null  float64 
 5   Blood_Pressure              75000 non-null  float64 
 6   Insulin                     75000 non-null  float64 
 7   Diabetes_Pedigree_Function  75000 non-null  float64 
 8   Pregnancies                 75000 non-null  int64   
 9   Outcome                     75000 non-null  int64   
 10  Age_Range                   75000 non-null  category
dtypes: category(1), float64(5), int64(3), object(2)
memory usage: 6.4+ MB
None


In [15]:
# Count the number of duplicates
num_duplicates = df.duplicated().sum()
print("Number of duplicate rows:")
print(num_duplicates)

Number of duplicate rows:
0


In [14]:
# Remove duplicates
df = df.drop_duplicates()

# Verify duplicates are removed
print("Shape after removing duplicates:")
print(df.shape)

Shape after removing duplicates:
(75000, 11)


# Handle Missing Values

In [13]:
# Fill missing BMI values by Gender median
median_bmi_male = df[df['Gender'].str.strip() == "Male"]["BMI"].median()
median_bmi_female = df[df['Gender'].str.strip() == "Female"]["BMI"].median()

df.loc[df['Gender'].str.strip() == "Male", 'BMI'] = df[df[
    'Gender'].str.strip() == "Male"]['BMI'].fillna(median_bmi_male)

df.loc[df['Gender'].str.strip() == "Female", 'BMI'] = df[df[
    'Gender'].str.strip() == "Female"]['BMI'].fillna(median_bmi_female)


In [12]:
# Fill missing Glucose_Level values by Outcome mean
mean_glucose_no = df[df['Outcome'] == 0]['Glucose_Level'].mean()
mean_glucose_yes = df[df['Outcome'] == 1]['Glucose_Level'].mean()

df.loc[df['Outcome'] == 0, 'Glucose_Level'] = df[df[
    'Outcome'] == 0]['Glucose_Level'].fillna(mean_glucose_no)

df.loc[df['Outcome'] == 1, 'Glucose_Level'] = df[df[
    'Outcome'] == 1]['Glucose_Level'].fillna(mean_glucose_yes)


In [11]:
# Fill missing Blood_Pressure by Age ranges
df['Age_Range'] = pd.cut(df['Age'], bins=[0, 30, 50, 80], labels=[
    'Young', 'Middle-aged', 'Older'])

median_bp_young = df[df['Age_Range'] == "Young"]['Blood_Pressure'].median()
median_bp_middle = df[df['Age_Range'] == "Middle-aged"]['Blood_Pressure'].median()
median_bp_older = df[df['Age_Range'] == "Older"]['Blood_Pressure'].median()

df.loc[df['Age_Range'] == "Young", 'Blood_Pressure'] = df[df[
    'Age_Range'] == "Young"]['Blood_Pressure'].fillna(median_bp_young)

df.loc[df['Age_Range'] == "Middle-aged", 'Blood_Pressure'] = df[df[
    'Age_Range'] == "Middle-aged"]['Blood_Pressure'].fillna(median_bp_middle)

df.loc[df['Age_Range'] == "Older", 'Blood_Pressure'] = df[df[
    'Age_Range'] == "Older"]['Blood_Pressure'].fillna(median_bp_older)


In [10]:
# Fill missing Insulin values by Outcome median
median_insulin_no = df[df['Outcome'] == 0]['Insulin'].median()
median_insulin_yes = df[df['Outcome'] == 1]['Insulin'].median()

df.loc[df['Outcome'] == 0, 'Insulin'] = df[df['Outcome'] == 0][
    'Insulin'].fillna(median_insulin_no)

df.loc[df['Outcome'] == 1, 'Insulin'] = df[df['Outcome'] == 1][
    'Insulin'].fillna(median_insulin_yes)

# Clean Categorical Columns

In [12]:
# Strip leading and trailing spaces from Gender
df['Gender'] = df['Gender'].str.strip()

In [13]:
# Verify unique values in Gender
print("Unique values in Gender after cleaning:")
print(df['Gender'].unique())

Unique values in Gender after cleaning:
['Male' 'Female']


# Exploratory Data Analysis (EDA)

In [14]:
# Descriptive statistics of numerical columns
print("Descriptive statistics:")
print(df.describe())

Descriptive statistics:
                Age           BMI  Glucose_Level  Blood_Pressure  \
count  75000.000000  75000.000000   75000.000000    75000.000000   
mean      48.542307     29.233257     134.596171      119.260213   
std       17.892669      5.990626      36.312304       33.471576   
min       18.000000     18.500000      70.000000       60.000000   
25%       33.000000     24.240000     104.000000       91.000000   
50%       48.000000     29.220000     134.563308      119.000000   
75%       64.000000     34.180000     165.000000      147.000000   
max       79.000000     40.000000     199.000000      179.000000   

            Insulin  Diabetes_Pedigree_Function   Pregnancies       Outcome  
count  75000.000000                75000.000000  75000.000000  75000.000000  
mean     157.832933                    1.295869      7.005067      0.347853  
std       79.486816                    0.692298      4.329146      0.476292  
min       15.000000                    0.100000    

In [15]:
# Outcome count
print("Count of each Outcome (0 = No Diabetes, 1 = Diabetes):")
print(df['Outcome'].value_counts())

Count of each Outcome (0 = No Diabetes, 1 = Diabetes):
Outcome
0    48911
1    26089
Name: count, dtype: int64


In [16]:
# Select numerical columns only
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlation between numerical features
correlation_matrix = numerical_df.corr()

# Print the correlation matrix
print("Correlation matrix:")
print(correlation_matrix)

Correlation matrix:
                                 Age       BMI  Glucose_Level  Blood_Pressure  \
Age                         1.000000 -0.003428      -0.002356        0.000973   
BMI                        -0.003428  1.000000       0.001763       -0.002038   
Glucose_Level              -0.002356  0.001763       1.000000        0.002147   
Blood_Pressure              0.000973 -0.002038       0.002147        1.000000   
Insulin                     0.002998  0.001635      -0.003110       -0.002344   
Diabetes_Pedigree_Function  0.002062 -0.001042       0.003445        0.004572   
Pregnancies                 0.000178 -0.004871      -0.001806        0.000645   
Outcome                    -0.002759 -0.006668       0.001239        0.003403   

                             Insulin  Diabetes_Pedigree_Function  Pregnancies  \
Age                         0.002998                    0.002062     0.000178   
BMI                         0.001635                   -0.001042    -0.004871   
Glucose

# Train-Test Split for Modeling

In [19]:
from sklearn.model_selection import train_test_split

# Select features and target
X = df[['Age', 'BMI', 'Glucose_Level', 'Blood_Pressure', 'Insulin', 
        'Diabetes_Pedigree_Function', 'Pregnancies']]
y = df['Outcome']

In [20]:
# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                            test_size=0.2, random_state=42)

In [21]:
print("Shapes of training and testing sets:")
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shapes of training and testing sets:
(60000, 7) (15000, 7) (60000,) (15000,)


# Modeling (Random Forest)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [21]:
# Train Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [22]:
# Predict on test data
y_pred_rf = rf_model.predict(X_test)
print(y_pred_rf)

[0 0 0 ... 0 0 0]


In [23]:
# Evaluate Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:")
print(accuracy_rf)

Random Forest Accuracy:
0.6862666666666667


In [24]:
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.96      0.80      9757
           1       0.72      0.17      0.27      5243

    accuracy                           0.69     15000
   macro avg       0.70      0.57      0.54     15000
weighted avg       0.70      0.69      0.62     15000



# Modeling (SVM)

In [18]:
from sklearn.svm import SVC

# Train SVM classifier
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Predict on test data
y_pred_svm = svm_model.predict(X_test)

In [None]:
# Evaluate SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:")
print(accuracy_svm)

In [None]:
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

In [None]:
# Compare Models

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Random Forest Metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

In [None]:
print("Random Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)

In [None]:
# SVM Metrics
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print("SVM Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_svm)

In [None]:
# Comparison Summary
print("\nComparison Summary:")
print("Random Forest: Accuracy =", "accuracy_rf, ",
      Precision =, "precision_rf,
      ", Recall =", recall_rf, ", F1 =", f1_rf)

print("SVM: Accuracy =", accuracy_svm, ", Precision =", precision_svm, ",
      Recall =", recall_svm, ", F1 =", f1_svm)

In [None]:
# Feature Importance with Random Forest

In [None]:
import matplotlib.pyplot as plt

# Get feature importances from the Random Forest model
feature_importances = rf_model.feature_importances_

# List of feature names
feature_names = X.columns

In [None]:
# Combine features and their importance scores
features_with_importance = pd.DataFrame({'Feature': feature_names, 
                                         'Importance': feature_importances})


In [None]:
# Sort by importance
features_with_importance = features_with_importance.sort_values(by=
                                            'Importance', ascending=False)


In [None]:
print("Feature Importance from Random Forest:")
print(features_with_importance)

In [None]:
# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(features_with_importance['Feature'], features_with_importance['Importance'],
         color='skyblue')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Feature Importance (Random Forest)")
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Perform GridSearchCV for Random Forest
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                              param_grid=rf_param_grid,
                              cv=3, scoring='accuracy', verbose=2)

In [None]:
rf_grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest Accuracy:", rf_grid_search.best_score_)

In [None]:
# SVM Hyperparameter Tuning:

In [None]:
# Define the parameter grid for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 0.01, 0.001]
}

In [None]:
# Perform GridSearchCV for SVM
svm_grid_search = GridSearchCV(estimator=SVC(random_state=42),
                               param_grid=svm_param_grid,
                               cv=3, scoring='accuracy', verbose=2)


In [None]:
svm_grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best SVM Parameters:", svm_grid_search.best_params_)
print("Best SVM Accuracy:", svm_grid_search.best_score_)

In [None]:
# Distribution Analysis of Features

In [None]:
# Manually calculate statistics for 'Age'
mean_age = X['Age'].sum() / X['Age'].count()
median_age = X['Age'].sort_values().iloc[X['Age'].count() // 2]
mode_age = X['Age'].value_counts().index[0]

print("Age Statistics:")
print("Mean Age:", mean_age)
print("Median Age:", median_age)
print("Mode Age:", mode_age)

In [None]:
# Similarly, calculate for Glucose
mean_glucose = X['Glucose'].sum() / X['Glucose'].count()
median_glucose = X['Glucose'].sort_values().iloc[X['Glucose'].count() // 2]
mode_glucose = X['Glucose'].value_counts().index[0]

print("\nGlucose Statistics:")
print("Mean Glucose:", mean_glucose)
print("Median Glucose:", median_glucose)
print("Mode Glucose:", mode_glucose)

In [None]:
# Relationships Between Variables
# Manually check relationships between variables (e.g., BMI and Glucose).

In [None]:
# BMI vs Glucose: Calculate average BMI for high glucose (>140)
high_glucose = X[X['Glucose'] > 140]
avg_bmi_high_glucose = high_glucose['BMI'].sum() / high_glucose['BMI'].count()

print("Average BMI for High Glucose (>140):", avg_bmi_high_glucose)

# BMI vs Glucose: Calculate average BMI for low glucose (<100)
low_glucose = X[X['Glucose'] < 100]
avg_bmi_low_glucose = low_glucose['BMI'].sum() / low_glucose['BMI'].count()

print("Average BMI for Low Glucose (<100):", avg_bmi_low_glucose)

In [None]:
# Class Imbalance Analysis

In [None]:
# Count for each class in Outcome
class_0_count = y[y == 0].count()
class_1_count = y[y == 1].count()

In [None]:
print("Class Distribution:")
print("Class 0 (No Diabetes):", class_0_count)
print("Class 1 (Diabetes):", class_1_count)

In [None]:
# Calculate percentages
total_count = class_0_count + class_1_count
class_0_percentage = (class_0_count / total_count) * 100
class_1_percentage = (class_1_count / total_count) * 100

print("\nClass Percentage Distribution:")
print("Class 0 (No Diabetes):", class_0_percentage, "%")
print("Class 1 (Diabetes):", class_1_percentage, "%")

In [None]:
# Group-Based Analysis

In [None]:
# Filter rows by Gender
male_bmi = X[X['Gender'] == 'Male']['BMI']
female_bmi = X[X['Gender'] == 'Female']['BMI']

# Calculate averages
avg_male_bmi = male_bmi.sum() / male_bmi.count()
avg_female_bmi = female_bmi.sum() / female_bmi.count()

print("Average BMI by Gender:")
print("Male:", avg_male_bmi)
print("Female:", avg_female_bmi)

In [None]:
# Install the Required Library
pip install firthlogist

In [None]:
from firthlogist import FirthLogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [None]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardize the numerical features for better model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=['float64', 'int64']))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=['float64', 'int64']))

In [None]:
# Fit Firth Logistic Regression
firth_model = FirthLogisticRegression()
firth_model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the test set
y_pred = firth_model.predict(X_test_scaled)

In [None]:
# Evaluate the model
print("Classification Report (Firth Logistic Regression):")
print(classification_report(y_test, y_pred))

# Analysis Report: Diabetes Dataset

# Introduction

# Data Preprocessing

# Exploratory Data Analysis (EDA)

# Predictive Modeling

# Firth Logistic Regression

In [None]:
# you should write like as above for random forest and svm

# Key Findings