In [1]:
!pip install streamlit

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import streamlit as st
import numpy as np
import pandas as pd
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd

url = "diabetes_dataset.csv"
data = pd.read_csv(url)

print(data.head())



   year  gender   age location  race:AfricanAmerican  race:Asian  \
0  2020  Female  32.0  Alabama                     0           0   
1  2015  Female  29.0  Alabama                     0           1   
2  2015    Male  18.0  Alabama                     0           0   
3  2015    Male  41.0  Alabama                     0           0   
4  2016  Female  52.0  Alabama                     1           0   

   race:Caucasian  race:Hispanic  race:Other  hypertension  heart_disease  \
0               0              0           1             0              0   
1               0              0           0             0              0   
2               0              0           1             0              0   
3               1              0           0             0              0   
4               0              0           0             0              0   

  smoking_history    bmi  hbA1c_level  blood_glucose_level  diabetes  
0           never  27.32          5.0                  10

In [5]:
data['gender'] = data['gender'].astype('category')
data['smoking_history'] = data['smoking_history'].astype('category')
data['location'] = data['location'].astype('category')

data['year'] = data['year'].astype('int')
data['age'] = data['age'].astype('float')

if data.isnull().sum().sum() > 0:
    data = data.fillna(method='ffill')

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_cols = ['gender', 'location', 'smoking_history']
numerical_cols = ['year', 'age', 'bmi', 'hbA1c_level', 'blood_glucose_level']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

X = data.drop('diabetes', axis=1)
y = data['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

In [7]:
X

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2018,Female,33.0,Wyoming,0,0,0,0,1,0,0,never,21.21,6.5,90
99996,2016,Female,80.0,Wyoming,0,1,0,0,0,0,0,No Info,36.66,5.7,100
99997,2018,Male,46.0,Wyoming,0,1,0,0,0,0,0,ever,36.12,6.2,158
99998,2018,Female,51.0,Wyoming,1,0,0,0,0,0,0,not current,29.29,6.0,155


In [8]:
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 100000, dtype: int64

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train_prepared, y_train)

y_pred = model.predict(X_test_prepared)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test_prepared)[:, 1])

print(f"Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")

Accuracy: 0.9597, ROC AUC: 0.9587


In [10]:
import joblib

joblib.dump(model, 'model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

print("Model and preprocessor saved successfully.")

Model and preprocessor saved successfully.


In [11]:
cat_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
feature_names = np.concatenate((numerical_cols, cat_feature_names), axis=None)

coefficients = model.coef_[0]

feature_importance = pd.DataFrame(list(zip(feature_names, coefficients)), columns=['Feature', 'Coefficient'])

feature_importance['abs_coefficient'] = feature_importance['Coefficient'].abs()
feature_importance_sorted = feature_importance.sort_values(by='abs_coefficient', ascending=False)

top_10_features = feature_importance_sorted.head(10)
print(top_10_features)

                        Feature  Coefficient  abs_coefficient
3                   hbA1c_level     2.518005         2.518005
5                 gender_Female    -1.655956         1.655956
4           blood_glucose_level     1.354499         1.354499
6                   gender_Male    -1.337933         1.337933
1                           age     1.146832         1.146832
63      smoking_history_No Info    -1.092483         1.092483
2                           bmi     0.624571         0.624571
68  smoking_history_not current    -0.533560         0.533560
67        smoking_history_never    -0.446371         0.446371
66       smoking_history_former    -0.340521         0.340521
