In [82]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [83]:
# Relative path of the preprocessed dataset that the rest of the notebook works on.
preprocessed_csv = '../data/preprocessed.csv'
df = pd.read_csv(preprocessed_csv)

In [84]:
# Display the loaded frame as the cell output (pandas elides the middle rows when rendering).
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,-0.526397,-1.151398,-3.752683,-1.322774,-0.701206,-4.135256,-0.490735,-1.035940,0
1,1.588046,-0.276643,0.680345,0.233505,-0.701206,-0.489169,2.415030,1.487101,1
2,-0.828460,0.566871,-1.265862,-0.090720,0.013448,-0.424522,0.549161,-0.948939,0
3,-1.130523,1.254179,-1.049617,-1.322774,-0.701206,-1.303720,-0.639291,2.792122,0
4,0.681856,0.410665,0.572222,1.076490,2.484601,1.838121,-0.686829,1.139095,1
...,...,...,...,...,...,...,...,...,...
763,1.588046,1.379144,1.004713,-1.322774,-0.701206,-0.204722,-0.496677,1.400099,1
764,-0.828460,-1.370087,-0.725249,0.622575,-0.219029,0.170230,2.322925,-0.774936,0
765,1.285983,-0.807744,0.139732,-1.322774,-0.701206,0.622758,0.047040,2.096111,0
766,-0.526397,0.785560,0.031609,1.141335,2.398498,-0.515028,-0.392688,-0.339929,1


In [85]:
# Quick sanity checks on the loaded frame: dimensions, missing values per column,
# and the class balance of the target.
frame_shape = df.shape
print(frame_shape)

missing_per_column = df.isna().sum()
print(missing_per_column)

# Left as the last expression so the class counts are rendered as the cell output.
df["Outcome"].value_counts()

(768, 9)
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


Outcome
0    500
1    268
Name: count, dtype: int64

In [86]:
# Separate the target column from the feature columns.
target_column = "Outcome"
y = df[target_column]
X = df.drop(columns=[target_column])

In [87]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio the same
# in both splits, and the fixed seed makes the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [88]:
# Show the train and test feature frames together; a bare tuple of frames is rendered
# as plain text rather than as rich tables.
X_train , X_test

(     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
 341    -0.828460 -0.557814       0.572222      -0.609479  0.004838 -1.626955   
 711    -0.526397 -1.182639      -0.238697       0.492885 -0.132927  0.622758   
 358    -0.828460 -0.026713       0.572222       1.789784  1.020852  0.894276   
 35      0.681856  0.067011       0.139732       1.595249  1.279161  0.209018   
 686     1.588046 -0.932709      -0.076513      -1.322774 -0.701206 -1.006344   
 ..           ...       ...            ...            ...       ...       ...   
 506     0.077730  0.660595       0.896590      -1.322774 -0.701206  1.553674   
 104    -0.224334 -1.338845       0.031609      -1.322774 -0.701206  0.066795   
 549     0.379793 -0.307884       0.139732       0.492885 -0.701206 -1.045132   
 660     0.077730 -0.307884       0.139732       1.724939  1.081124  0.661547   
 100     1.890109 -0.182919       1.545326      -1.322774 -0.701206 -1.032203   
 
      DiabetesPedigreeFunc

# Evaluating different algorithms

In [95]:
from sklearn.tree import DecisionTreeClassifier

# Baseline classifier: a depth-limited decision tree. "balanced" class weights
# compensate for the uneven class counts in the target, and the seed keeps the
# fitted tree reproducible.
tree_params = {
    "class_weight": "balanced",
    "max_depth": 5,
    "random_state": 42,
}
model = DecisionTreeClassifier(**tree_params)

# Fit on the training split; left as the last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [96]:
# Predict class labels for the held-out test features with the fitted decision tree.
y_pred = model.predict(X_test)

In [97]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the decision tree's test-set predictions: overall accuracy plus
# per-class precision / recall / F1.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy: 0.7337662337662337

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.66      0.76       100
           1       0.58      0.87      0.70        54

    accuracy                           0.73       154
   macro avg       0.74      0.77      0.73       154
weighted avg       0.79      0.73      0.74       154



In [98]:
# Confusion matrix for the decision tree on the test split.
# Rows are the true classes and columns the predicted classes (scikit-learn convention),
# both in label order 0, 1.
confusion_matrix(y_test, y_pred)

array([[66, 34],
       [ 7, 47]])

In [99]:
# Bundle scaling and the classifier into a single estimator so both steps are
# fitted, applied and saved together.
# NOTE(review): the features come from preprocessed.csv and already look standardised
# in the displayed frame — confirm whether inputs at prediction time are raw or
# pre-scaled, since this scaler is fitted on the values exactly as loaded here.
scaler_step = ("scaler", StandardScaler())
tree_step = (
    "model",
    DecisionTreeClassifier(class_weight="balanced", max_depth=5, random_state=42),
)
pipeline = Pipeline(steps=[scaler_step, tree_step])

In [100]:
# Display the pipeline to inspect its steps and their parameters
# (in notebook order it has not been fitted yet at this point).
pipeline

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [101]:
# Fit the whole pipeline on the training split: the scaler is fitted and applied first,
# then the decision tree is trained on the scaled features.
pipeline.fit(X_train,y_train)
# NOTE(review): the message says "created", but this cell fits the pipeline;
# it was constructed in an earlier cell.
print("Pipeline created successfully")

Pipeline created successfully


In [102]:
# Evaluate the fitted pipeline on the held-out test split.
# NOTE: this rebinds `y_pred`, replacing the predictions made earlier by the standalone decision tree.
y_pred = pipeline.predict(X_test)

In [103]:
# Summarise the pipeline's test-set predictions: per-class metrics first,
# then the raw confusion counts.
pipeline_report = classification_report(y_test, y_pred)
pipeline_confusion = confusion_matrix(y_test, y_pred)

print("Classification Report:\n", pipeline_report)
print("Confusion Matrix:\n", pipeline_confusion)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.66      0.76       100
           1       0.58      0.87      0.70        54

    accuracy                           0.73       154
   macro avg       0.74      0.77      0.73       154
weighted avg       0.79      0.73      0.74       154

Confusion Matrix:
 [[66 34]
 [ 7 47]]


In [104]:
import joblib

In [105]:
from pathlib import Path

# Persist the fitted pipeline (scaler + decision tree) so it can be reloaded later
# with joblib.load.
ARTIFACT_PATH = "../artifacts/diabetes_pipeline.pkl"

# joblib.dump does not create missing parent directories, so make sure the target
# folder exists; otherwise this cell fails on a fresh checkout without ../artifacts.
Path(ARTIFACT_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(pipeline, ARTIFACT_PATH)
print(f"Pipeline saved as '{ARTIFACT_PATH}'")

Pipeline saved as '../artifacts/diabetes_pipeline.pkl'
