## Diabetes Prediction

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
df = pd.read_csv("E:\\ML\\diabetes_prediction_dataset.csv")

In [5]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [7]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [8]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

### Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
encoder = LabelEncoder()

In [12]:
import warnings
warnings.filterwarnings('ignore')
df['gender'] = encoder.fit_transform(df[['gender']])

In [13]:
df['smoking_history'].value_counts()

smoking_history
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: count, dtype: int64

In [25]:
df_dummy = pd.get_dummies(df, columns=['smoking_history'], drop_first=True)

In [27]:
df_dummy.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,0,80.0,0,1,25.19,6.6,140,0,False,False,False,True,False
1,0,54.0,0,0,27.32,6.6,80,0,False,False,False,False,False
2,1,28.0,0,0,27.32,5.7,158,0,False,False,False,True,False
3,0,36.0,0,0,23.45,5.0,155,0,True,False,False,False,False
4,1,76.0,1,1,20.14,4.8,155,0,True,False,False,False,False


In [31]:
df_dummy = df_dummy.astype({col: int for col in ['smoking_history_current', 'smoking_history_ever', 'smoking_history_former', 'smoking_history_never', 'smoking_history_not current'] if df_dummy[col].dtype == 'bool'})

In [33]:
df_dummy.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,0,80.0,0,1,25.19,6.6,140,0,0,0,0,1,0
1,0,54.0,0,0,27.32,6.6,80,0,0,0,0,0,0
2,1,28.0,0,0,27.32,5.7,158,0,0,0,0,1,0
3,0,36.0,0,0,23.45,5.0,155,0,1,0,0,0,0
4,1,76.0,1,1,20.14,4.8,155,0,1,0,0,0,0


In [37]:
## independent features
X = df_dummy[['gender', 'age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'smoking_history_current', 'smoking_history_ever', 'smoking_history_former', 'smoking_history_never', 'smoking_history_not current']] 

In [39]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,0,80.0,0,1,25.19,6.6,140,0,0,0,1,0
1,0,54.0,0,0,27.32,6.6,80,0,0,0,0,0
2,1,28.0,0,0,27.32,5.7,158,0,0,0,1,0
3,0,36.0,0,0,23.45,5.0,155,1,0,0,0,0
4,1,76.0,1,1,20.14,4.8,155,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,80.0,0,0,27.32,6.2,90,0,0,0,0,0
99996,0,2.0,0,0,17.37,6.5,100,0,0,0,0,0
99997,1,66.0,0,0,27.83,5.7,155,0,0,1,0,0
99998,0,24.0,0,0,35.42,4.0,100,0,0,0,1,0


In [43]:
## dependent features
y = df_dummy['diabetes']

In [46]:
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

In [48]:
y.value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [52]:
from sklearn.linear_model import LogisticRegression

In [54]:
regression = LogisticRegression(class_weight='balanced')

In [56]:
regression.fit(X_train, y_train)

In [58]:
y_pred = regression.predict(X_test)

In [60]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

In [62]:
f1score = f1_score(y_test, y_pred)
accscore = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
precisionscore = precision_score(y_test, y_pred)
recallscore = recall_score(y_test, y_pred)

In [64]:
print(f1score)
print(accscore)
print(cm)
print(precisionscore)
print(recallscore)

0.5684817230395874
0.8862
[[20281  2569]
 [  276  1874]]
0.42178708080126043
0.8716279069767442


In [66]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Get predicted probabilities for class 1 (diabetes)
y_probs = regression.predict_proba(X_test)[:, 1]

# Define a range of thresholds to test
thresholds = np.arange(0.1, 0.9, 0.05)

# Store best values
best_threshold = 0.5
best_f1 = 0

print("Threshold\tF1 Score\tPrecision\tRecall")
for thresh in thresholds:
    y_pred_thresh = (y_probs >= thresh).astype(int)
    f1 = f1_score(y_test, y_pred_thresh)
    precision = precision_score(y_test, y_pred_thresh)
    recall = recall_score(y_test, y_pred_thresh)
    
    print(f"{thresh:.2f}\t\t{f1:.4f}\t\t{precision:.4f}\t\t{recall:.4f}")
    
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thresh

print(f"\n✅ Best threshold for F1 score: {best_threshold:.2f} (F1 = {best_f1:.4f})")


Threshold	F1 Score	Precision	Recall
0.10		0.3313		0.1989		0.9902
0.15		0.3706		0.2284		0.9828
0.20		0.4024		0.2539		0.9693
0.25		0.4360		0.2821		0.9586
0.30		0.4661		0.3095		0.9433
0.35		0.4939		0.3363		0.9298
0.40		0.5197		0.3634		0.9121
0.45		0.5427		0.3903		0.8902
0.50		0.5685		0.4218		0.8716
0.55		0.5949		0.4563		0.8544
0.60		0.6159		0.4900		0.8288
0.65		0.6389		0.5292		0.8060
0.70		0.6606		0.5732		0.7795
0.75		0.6815		0.6198		0.7567
0.80		0.7012		0.6744		0.7302
0.85		0.7146		0.7391		0.6916

✅ Best threshold for F1 score: 0.85 (F1 = 0.7146)
