In [1]:
import pandas as pd
import numpy as np

## Loding the dataset

In [2]:
df = pd.read_csv("diabetes.csv")
print("done")

done


### EDA

In [3]:
df.size

6912

In [4]:
df.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
598,1,173,74,0,0,36.8,0.088,38,1
109,0,95,85,25,36,37.4,0.247,24,1
688,1,140,74,26,180,24.1,0.828,23,0
72,13,126,90,0,0,43.4,0.583,42,1
244,2,146,76,35,194,38.2,0.329,29,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
df.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


### Feature engineering

In [10]:
def ageGroup(age):
    if age <= 30:
        return 0           # age group 0 young
    elif age <= 60:
        return 1           # age group 1 elders
    return 2               # age group 2 old

df['AgeGroup'] = df['Age'].map(ageGroup)

In [11]:
def bmiCategory(bmi):
    if bmi <= 18.5:
        return 0         # underweight
    elif bmi <= 24.9:
        return 1         # normal
    elif bmi <= 29.9:
        return 2         # overweight
    return 3             # obbessed

df['BMIGroup'] = df['BMI'].map(bmiCategory)

In [12]:
# glucose to insuline ration -> GTIratio
df['GTIratio'] = df['Glucose'] / (df['Insulin'] +1)

In [13]:
df['isInsulinFlag'] =((df['Insulin'] > 100) & (df['Glucose'] > 120)).astype(int)

### Model creation

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'AgeGroup',
       'BMIGroup', 'GTIratio', 'isInsulinFlag'],
      dtype='object')

In [16]:
X = df.drop(columns=['Outcome', 'Age', 'BMI', 'SkinThickness'])
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [17]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,DiabetesPedigreeFunction,AgeGroup,BMIGroup,GTIratio,isInsulinFlag
602,1,124,74,0,0.1,0,2,124.0,0
429,1,95,82,180,0.233,1,3,0.524862,0
623,0,94,70,115,0.347,0,3,0.810345,0
209,7,184,84,0,0.355,1,3,184.0,0
589,0,73,0,0,0.342,0,1,73.0,0


In [18]:
x_test.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,DiabetesPedigreeFunction,AgeGroup,BMIGroup,GTIratio,isInsulinFlag
158,2,88,74,53,0.229,0,2,1.62963,0
251,2,129,84,0,0.284,0,2,129.0,0
631,0,102,78,90,0.238,0,3,1.120879,0
757,0,123,72,0,0.258,1,3,123.0,0
689,1,144,82,180,0.335,1,3,0.79558,1


In [49]:
y_train.head()

602    0
429    1
623    0
209    1
589    0
Name: Outcome, dtype: int64

In [50]:
y_test.head()

158    0
251    0
631    0
757    1
689    1
Name: Outcome, dtype: int64

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

In [52]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
from sklearn.metrics import accuracy_score, f1_score, classification_report
report = classification_report(y_test, y_pred)

In [32]:
print(report)

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       109
           1       0.63      0.49      0.55        45

    accuracy                           0.77       154
   macro avg       0.72      0.68      0.70       154
weighted avg       0.75      0.77      0.76       154



In [60]:
def try_all(models):
    for m in models:
        model = m
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
    
        print("\n-----------")
        print(f"reeport using ---> {model}")
        report = classification_report(y_test, y_pred)
        print(report)

models = [LogisticRegression(), SVC(), KNeighborsClassifier(), DecisionTreeClassifier(), ExtraTreesClassifier(), BalancedRandomForestClassifier()
]
try_all(models)    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



-----------
reeport using ---> LogisticRegression()
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       109
           1       0.58      0.62      0.60        45

    accuracy                           0.76       154
   macro avg       0.71      0.72      0.72       154
weighted avg       0.76      0.76      0.76       154


-----------
reeport using ---> SVC()
              precision    recall  f1-score   support

           0       0.80      0.91      0.85       109
           1       0.67      0.44      0.53        45

    accuracy                           0.77       154
   macro avg       0.73      0.68      0.69       154
weighted avg       0.76      0.77      0.76       154


-----------
reeport using ---> KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.76      0.86      0.81       109
           1       0.52      0.36      0.42        45

    accuracy                        

In [61]:
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64