****DIABETES PREDICTION MODEL****

In [100]:
# import manipulation lybraries
import numpy as np
import pandas as pd

# import visualization lybraries
import matplotlib.pyplot as plt
import seaborn as sns

# import warning
import warnings
warnings.filterwarnings('ignore')

# import logging 
import logging 
logging.basicConfig(level = logging.INFO,
                    format="%(asctime)s - %(levelname)s - %(message)s",
                    filename = 'diabetes.log',
                    filemode = 'w',
                    force = True)
from collections import OrderedDict

# import machine learning lybraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder,RobustScaler 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE


                    

In [74]:
# data ingestion


df = pd.read_csv(r'C:\Diabetes_Prediction_Model\data\raw\diabetes.csv')
df
             

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [78]:
# Segregate the numerical and categorical columns
numerical_cols = df.select_dtypes(exclude=['object']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

from collections import OrderedDict

numerical_stats = []

for i in numerical_cols:
    num_stats = OrderedDict({
        "Feature":i,
        "Maximum":df[i].max(),
        "Minimum":df[i].min(),
        "Mean":df[i].mean(),
        "Q1":df[i].quantile(0.25),
        "Q2":df[i].quantile(0.50),
        "Q3":df[i].quantile(0.75),
        "IQR":df[i].quantile(0.75) - df[i].quantile(0.25),\
        "Standard Deviation":df[i].std(),
        "Variance":df[i].var(),
        "Skewness":df[i].skew(),
        "Kurtosis":df[i].kurtosis(),
    })
    numerical_stats.append(num_stats)
    numerical_stats_report = pd.DataFrame(numerical_stats)

pd.set_option('display.max_columns', None)
print(numerical_stats_report)

                    Feature  Maximum  Minimum        Mean        Q1        Q2  \
0               Pregnancies    17.00    0.000    3.845052   1.00000    3.0000   
1                   Glucose   199.00    0.000  120.894531  99.00000  117.0000   
2             BloodPressure   122.00    0.000   69.105469  62.00000   72.0000   
3             SkinThickness    99.00    0.000   20.536458   0.00000   23.0000   
4                   Insulin   846.00    0.000   79.799479   0.00000   30.5000   
5                       BMI    67.10    0.000   31.992578  27.30000   32.0000   
6  DiabetesPedigreeFunction     2.42    0.078    0.471876   0.24375    0.3725   
7                       Age    81.00   21.000   33.240885  24.00000   29.0000   
8                   Outcome     1.00    0.000    0.348958   0.00000    0.0000   

          Q3       IQR  Standard Deviation      Variance  Skewness  Kurtosis  
0    6.00000    5.0000            3.369578     11.354056  0.901674  0.159220  
1  140.25000   41.2500         

In [79]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [85]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [103]:
from sklearn.preprocessing import StandardScaler

X = df.drop(columns=['Outcome'])
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(
                                                    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train,y_train)


In [106]:
models = {
    "Logistic regression": LogisticRegression(),
    "Random forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Add Boost" : AdaBoostClassifier(),
    "Bagging classifier": BaggingClassifier()

}

for model_name , model in models.items():
    model.fit(X_train,y_train) # seen data
    y_pred = model.predict(X_test)
    print(f"model:{model_name}")
    print(f"classification_report(y_pred, y_test)")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 50)

model:Logistic regression
classification_report(y_pred, y_test)
Confusion Matrix:
[[76 24]
 [16 38]]
--------------------------------------------------
model:Random forest
classification_report(y_pred, y_test)
Confusion Matrix:
[[76 24]
 [14 40]]
--------------------------------------------------
model:Gradient Boosting
classification_report(y_pred, y_test)
Confusion Matrix:
[[75 25]
 [16 38]]
--------------------------------------------------
model:Add Boost
classification_report(y_pred, y_test)
Confusion Matrix:
[[78 22]
 [11 43]]
--------------------------------------------------
model:Bagging classifier
classification_report(y_pred, y_test)
Confusion Matrix:
[[80 20]
 [23 31]]
--------------------------------------------------
