In [1]:
from sklearn.metrics import classification_report
import pandas as pd
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
diabetes_df = pd.read_csv('data/diabetes.csv')
diabetes_df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [3]:
print(pd.isna(diabetes_df).sum())

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64


In [4]:
for column in diabetes_df.columns:
    if column  == 'Gender':
        diabetes_df[column] = diabetes_df[column].replace(['Male', 'Female'], [0, 1])
    elif column == 'class':
        diabetes_df[column] = diabetes_df[column].replace(['Negative', 'Positive'], [0,1])
    else: 
        diabetes_df[column] = diabetes_df[column].replace(['No', 'Yes'], [0,1])

In [5]:
diabetes_df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [6]:
X = diabetes_df.drop(["class"], axis=1)
y = diabetes_df["class"]
X.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    random_state=42,
    n_estimators=30,
    criterion="gini",
    max_depth=5,
    warm_start=False,
    oob_score=True,
    class_weight=None,

)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print(y_pred[0])

[0.79573893 0.20426107]


In [9]:
y_pred_df = pd.DataFrame(data = y_pred, columns=['0', '1'])

y_pred_df.head()

Unnamed: 0,0,1
0,0.795739,0.204261
1,0.013333,0.986667
2,0.064103,0.935897
3,0.029274,0.970726
4,0.128385,0.871615


In [10]:
print(classification_report(model.predict(X_val), y_val))

              precision    recall  f1-score   support

           0       1.00      0.86      0.93        44
           1       0.91      1.00      0.95        60

    accuracy                           0.94       104
   macro avg       0.95      0.93      0.94       104
weighted avg       0.95      0.94      0.94       104



In [11]:
pickle.dump(model, open('models/diabetes_model', 'wb'))