In [66]:
import os

import numpy as np
import pandas as pd

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [68]:
# Location of the diabetes CSV. Set the DIABETES_CSV environment variable to
# point at the file on another machine; when it is unset, the original local
# path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "DIABETES_CSV",
    "C:\\Users\\SRIVANI\\Downloads\\archive (6)\\diabetes_prediction_dataset.csv",
)
data = pd.read_csv(DATA_PATH)
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [69]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [70]:
# List the distinct labels that occur in the smoking_history column.
data["smoking_history"].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [71]:
# Count missing values in each column.
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [72]:
# The 'diabetes' column is the target; every remaining column is a predictor.
y = data['diabetes']
x = data.drop(columns='diabetes')

In [73]:
# Feature groups used when building the preprocessing step.
# String-valued columns.
categorical_cols = [
    'gender',
    'smoking_history',
]
# Continuous measurements.
numeric_cols = [
    'age',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]
# Integer flag columns (0/1 in the data preview).
binary_cols = [
    'hypertension',
    'heart_disease',
]

In [74]:

# Column-wise preprocessing:
#   - categorical columns are one-hot encoded (categories unseen during fit are ignored),
#   - numeric columns are standardised,
#   - any column not listed is passed through unchanged (remainder='passthrough').
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_cols),
        ('num', numeric_scaler, numeric_cols),
    ],
    remainder='passthrough',
)

In [75]:
# Chain preprocessing and the classifier so both are fitted and applied together.
# The classifier is k-nearest neighbours with k=5 and Euclidean distance.
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', knn_classifier),
    ]
)


In [76]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified. The target looks imbalanced, so
# consider stratify=y -- this would change the reported metrics, confirm first.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [77]:
# Fit the preprocessing steps and the KNN classifier on the training split.
knn_pipeline.fit(x_train,y_train)

In [78]:
# Predict labels for the held-out rows; the bare name on the last line displays the array.
y_pred=knn_pipeline.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [79]:
# Confusion matrix on the held-out rows: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,y_pred)

array([[18171,   121],
       [  666,  1042]], dtype=int64)

In [80]:

# Overall accuracy on the held-out rows, followed by per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
# ".2f" means two decimal places; a bare "2f" only sets a minimum field width
# and therefore prints the default six decimals.
print(f"\nModel accuracy:{accuracy:.2f}")
print(classification_report(y_test, y_pred))



Model accuracy:0.960650
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.90      0.61      0.73      1708

    accuracy                           0.96     20000
   macro avg       0.93      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



In [81]:


# Score one hand-written record with the fitted pipeline.
# The keys match the feature column names used for training.
new_sample = dict(
    gender='Male',
    age=89.5,
    hypertension=0,
    heart_disease=1,
    smoking_history='No Info',
    bmi=20.5,
    HbA1c_level=6.6,
    blood_glucose_level=150,
)
# Wrap each value in a one-element list to build a single-row frame.
new_sample_df = pd.DataFrame(
    {column: [value] for column, value in new_sample.items()}
)
prediction = knn_pipeline.predict(new_sample_df)
print("Predicted Diabetes Outcome:", prediction[0])


Predicted Diabetes Outcome: 0
