In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [3]:
data = pd.read_csv('data/data.csv')

In [4]:
data.drop(['id','alcohol_consumption_per_week','screen_time_hours_per_day','sleep_hours_per_day'],axis=1,inplace=True)

In [5]:
data['metabolic_risk'] = data['bmi']*data['cholesterol_total']
data['bad_diet_bmi'] = data['bmi']*(10-data['diet_score'])
data['lifestyle_risk'] = (data['bmi']*(data['cholesterol_total'])*(10-data['diet_score'])) - data['physical_activity_minutes_per_week']
data['non_genetic_risk'] = data['lifestyle_risk']*(((1-data['family_history_diabetes'])+(1-data['cardiovascular_history'])+(1-data['hypertension_history']))/3)

In [6]:
data.drop(['lifestyle_risk'],axis=1,inplace=True)

In [7]:
x = data.drop(['diagnosed_diabetes'],axis=1)
y = data['diagnosed_diabetes']

In [8]:
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.2,random_state=42)

In [9]:
for i in ['age','physical_activity_minutes_per_week', 'diet_score', 'bmi','waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate','cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol','triglycerides']:
    Q1 = train_x[i].quantile(0.25)
    Q3 = train_x[i].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - IQR
    upper_limit = Q3 + IQR
    train_x[i] = np.where(train_x[i]<lower_limit,lower_limit,np.where(train_x[i]>upper_limit,upper_limit,train_x[i]))
    test_x[i] = np.where(test_x[i]<lower_limit,lower_limit,np.where(test_x[i]>upper_limit,upper_limit,test_x[i]))

In [10]:
num_col = ['age','physical_activity_minutes_per_week', 'diet_score','bmi','waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate','cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol','triglycerides','metabolic_risk','bad_diet_bmi','non_genetic_risk']
ord_col = ['education_level','income_level']
ord_categories = [['No formal','Highschool','Graduate','Postgraduate'],['Low','Lower-Middle','Middle','Upper-Middle','High']]
ohe_col = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

In [11]:
numeric_transformer = Pipeline(
    steps=[
        ('impute',SimpleImputer(strategy='median')),
        ('scale',StandardScaler())
    ]
)

In [12]:
ordinal_transformer = Pipeline(
    steps=[
        ('impute',SimpleImputer(strategy='most_frequent')),
        ('encoding',OrdinalEncoder(categories=ord_categories))
    ]
)

In [13]:
ohe_transformer = Pipeline(
    steps=[
        ('impute',SimpleImputer(strategy='most_frequent')),
        ('encoding',OneHotEncoder())
    ]
)

In [14]:
Preprocessor = ColumnTransformer(
    transformers=[
        ('numeric',numeric_transformer,num_col),
        ('ordinal',ordinal_transformer,ord_col),
        ('onehotencoding',ohe_transformer,ohe_col)
    ],remainder='passthrough'
)

In [15]:
model = Pipeline(
    steps=[
        ('preprocess',Preprocessor),
        ('LR_model',LogisticRegression(max_iter=1000))
    ]
)

In [16]:
model.fit(train_x,train_y)

0,1,2
,steps,"[('preprocess', ...), ('LR_model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('numeric', ...), ('ordinal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['No formal', 'Highschool', ...], ['Low', 'Lower-Middle', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [17]:
model.score(train_x,train_y)

0.6636392857142858

In [18]:
pred = model.predict(test_x)

In [19]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score,confusion_matrix
print('accuracy ',accuracy_score(test_y,pred))
print('precision',precision_score(test_y,pred))
print('recall',recall_score(test_y,pred))
print('roc_auc_score',roc_auc_score(test_y,pred))

accuracy  0.6620285714285714
precision 0.6852494195780263
recall 0.8479129230522714
roc_auc_score 0.600674620715936


In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'LR_model__C':[0.01,0.1,1,5,10],
    'LR_model__penalty': ['l2'],
    'LR_model__class_weight':['balanced'],
    'LR_model__solver':['saga'],
    'LR_model__max_iter':[500,1000]
}

grid = GridSearchCV(model,param_grid,scoring='recall',cv=5,n_jobs=1)

grid.fit(train_x,train_y)

0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"{'LR_model__C': [0.01, 0.1, ...], 'LR_model__class_weight': ['balanced'], 'LR_model__max_iter': [500, 1000], 'LR_model__penalty': ['l2'], ...}"
,scoring,'recall'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('numeric', ...), ('ordinal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['No formal', 'Highschool', ...], ['Low', 'Lower-Middle', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,500


In [21]:
grid.best_params_

{'LR_model__C': 0.01,
 'LR_model__class_weight': 'balanced',
 'LR_model__max_iter': 500,
 'LR_model__penalty': 'l2',
 'LR_model__solver': 'saga'}

In [22]:
LR_model = grid.best_estimator_

In [23]:
y_pred = LR_model.predict(test_x)

In [24]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score,confusion_matrix
print('accuracy ',accuracy_score(test_y,y_pred))
print('precision',precision_score(test_y,y_pred))
print('recall',recall_score(test_y,y_pred))
print('roc_auc_score',roc_auc_score(test_y,y_pred))

accuracy  0.6279142857142858
precision 0.7541678073802971
recall 0.5990546062194549
roc_auc_score 0.6374398608250554


In [25]:
y_proba = LR_model.predict_proba(test_x)[:,1]

In [26]:
threshold = [0.5,0.3,0.25,0.2,0.1]
results = []

for t in threshold:
    pred_prob = (y_proba>=t).astype(int)

    recall = recall_score(test_y,pred_prob)
    precision = precision_score(test_y,pred_prob)
    tn,fp,fn,tp = confusion_matrix(test_y,pred_prob).ravel()
    results.append({
        'threshold':t,
        'recall':recall,
        'precission':precision,
        'TP':tp,
        "FN":fn,
        "TN":tn,
        "FP":fp
    })

df_result = pd.DataFrame(results)
df_result

Unnamed: 0,threshold,recall,precission,TP,FN,TN,FP
0,0.5,0.599055,0.754168,52340,35031,35568,17061
1,0.3,0.942635,0.653275,82359,5012,8917,43712
2,0.25,0.977887,0.639093,85439,1932,4380,48249
3,0.2,0.993819,0.629247,86831,540,1468,51161
4,0.1,1.0,0.624083,87371,0,1,52628


In [29]:
import joblib
joblib.dump(LR_model,'model')

['model']