In [20]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_validate, train_test_split)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator

## Preprocessing Prior to Training/Predictions
1. Lowercase all column names
2. Columns to one hot encode:
    nominal_columns = ["highbp","highchol","cholcheck","smoker","stroke","heartdiseaseorattack","physactivity","fruits","veggies","hvyalcoholconsump","anyhealthcare","nodocbccost","diffwalk","sex"]
3. Columns to standardize:
    numerical_columns = ["bmi","age","income","menthlth","physhlth","education","genhlth"]
    



In [10]:
# Load the raw BRFSS 2015 diabetes indicators and lower-case every column name
# in the same expression, so re-running the cell always starts from the CSV.
data = pd.read_csv(
    "../data/raw/diabetes_binary_health_indicators_BRFSS2015.csv"
).rename(columns=str.lower)

In [15]:
# Separate the target from the features, then hold out a stratified 10% test set.
# Cross-validation is meant to be performed on the training portion.
y = data["diabetes_binary"]
X = data.drop(columns=["diabetes_binary"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    shuffle=True,
    random_state=2025,
)

In [25]:
# Building Preprocessing Pipeline
def create_pipeline(
    model: tuple[str, BaseEstimator],
    *,
    nominal_columns: "list[str] | None" = None,
    numerical_columns: "list[str] | None" = None,
) -> Pipeline:
    """Build an unfitted preprocessing + estimator pipeline.

    Parameters
    ----------
    model
        ``(step_name, estimator)`` tuple appended as the final pipeline step.
    nominal_columns
        Columns to one-hot encode. When ``None`` (the default), the
        notebook's standard list of nominal columns is used.
    numerical_columns
        Columns to standardize. When ``None`` (the default), the notebook's
        standard list of numerical columns is used.

    Returns
    -------
    Pipeline
        A two-step pipeline: a ``ColumnTransformer`` named ``"transformer"``
        followed by ``model``.

    Notes
    -----
    The ``ColumnTransformer`` is created without a ``remainder`` argument, so
    columns that appear in neither list follow scikit-learn's default
    handling for the remainder (documented as ``'drop'``).
    """
    if nominal_columns is None:
        nominal_columns = [
            "highbp", "highchol", "cholcheck", "smoker", "stroke",
            "heartdiseaseorattack", "physactivity", "fruits", "veggies",
            "hvyalcoholconsump", "anyhealthcare", "nodocbccost", "diffwalk",
            "sex",
        ]
    if numerical_columns is None:
        numerical_columns = [
            "bmi", "age", "income", "menthlth", "physhlth", "education",
            "genhlth",
        ]

    # NOTE(review): with drop='first' an unknown category seen at transform
    # time is encoded as all zeros, i.e. the same as the dropped first
    # category (in scikit-learn versions that accept this combination).
    # Confirm that this is acceptable for these features.
    column_trans = ColumnTransformer([
        ('numerical', StandardScaler(), list(numerical_columns)),
        ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), list(nominal_columns)),
    ])

    return Pipeline([("transformer", column_trans), model])

Unnamed: 0,highbp,highchol,cholcheck,bmi,smoker,stroke,heartdiseaseorattack,physactivity,fruits,veggies,...,anyhealthcare,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income
203487,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,1.0,0.0,0.0,1.0,11.0,6.0,8.0
246380,1.0,1.0,1.0,33.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,6.0,1.0,0.0,11.0,6.0,6.0
203251,1.0,1.0,1.0,37.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,4.0,0.0,20.0,1.0,1.0,10.0,4.0,5.0
16092,1.0,1.0,1.0,28.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,4.0,13.0,20.0,1.0,0.0,9.0,2.0,2.0
50398,1.0,0.0,1.0,21.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,10.0,6.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139109,0.0,1.0,1.0,27.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,1.0,1.0,1.0,10.0,5.0,3.0
169435,0.0,0.0,1.0,25.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,5.0,4.0,8.0
138935,1.0,1.0,1.0,33.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,4.0,0.0,0.0,0.0,1.0,9.0,5.0,6.0
136675,0.0,0.0,1.0,32.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,3.0,6.0,7.0


In [24]:
def pipe_builder(model: tuple[str, BaseEstimator]) -> Pipeline:
    """Assemble an unfitted pipeline: column preprocessing followed by ``model``.

    The listed numeric columns are standardized and ``loanReason`` is one-hot
    encoded; ``model`` is a ``(step_name, estimator)`` tuple that becomes the
    final step.

    NOTE(review): none of these column names appear among the diabetes
    dataset columns listed elsewhere in this notebook, so fitting this
    pipeline on that data is expected to fail. Confirm whether this cell is
    a leftover from another project.
    """
    numeric_features = [
        'ela',
        'financeAmount',
        'netSalary',
        'loanApplied',
        'ratio_loan_finance',
        'ratio_loan_eligibility',
        'ratio_loan_salary',
        'ratio_salary_ela',
        'monthly_payable',
        'diff_salary_payable',
        'diff_loan_finance',
        'interest_amount',
        'percent_salary_payable',
        'loanTenure',
    ]
    categorical_features = ['loanReason']

    # Name each transformer spec so the ColumnTransformer reads as a list of steps.
    scaling_step = ('numerical', StandardScaler(), numeric_features)
    encoding_step = (
        'categorical',
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        categorical_features,
    )
    preprocessor = ColumnTransformer([scaling_step, encoding_step])

    return Pipeline([('transformer', preprocessor), model])

# Build the pipeline with a seeded logistic regression as its final "model" step,
# then leave it as the last expression so the notebook displays it.
log_pipe = pipe_builder(
    model=('model', LogisticRegression(random_state=2024)),
)
log_pipe