In [21]:
import pandas as pd

In [22]:
data = pd.read_csv('./diabetes_binary_health_indicators_BRFSS2015.csv')
pd.set_option('display.max_columns', None)

In [23]:
from sklearn.model_selection import train_test_split

X = data.drop('Diabetes_binary', axis=1)
y = data['Diabetes_binary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, Normalizer



continuous_cols = ['BMI', 'MentHlth', 'PhysHlth']

def apply_boxcox(series):
    shift = 0
    if series.min() <= 0:
        shift = abs(series.min()) + 1e-6
    transformed, lam = boxcox(series + shift)
    return transformed

transformation_names = [
    "Original",            
    "PowerTransformer",    
    "Log1p",               
    "Sqrt",                
    "BoxCox",              
    "QuantileTransformer", 
    "Normalization"        
]

transformed_train = {}
transformed_test = {}

for trans_name in transformation_names:
    X_train_copy = X_train.copy()
    X_test_copy = X_test.copy()
    
    if trans_name == "Original":
        pass
    elif trans_name == "PowerTransformer":
        pt = PowerTransformer(method='yeo-johnson')
        X_train_copy[continuous_cols] = pt.fit_transform(X_train_copy[continuous_cols])
        X_test_copy[continuous_cols] = pt.transform(X_test_copy[continuous_cols])
    elif trans_name == "Log1p":
        X_train_copy[continuous_cols] = np.log1p(X_train_copy[continuous_cols])
        X_test_copy[continuous_cols] = np.log1p(X_test_copy[continuous_cols])
    elif trans_name == "Sqrt":
        X_train_copy[continuous_cols] = np.sqrt(X_train_copy[continuous_cols])
        X_test_copy[continuous_cols] = np.sqrt(X_test_copy[continuous_cols])
    elif trans_name == "BoxCox":
        X_train_copy[continuous_cols] = X_train_copy[continuous_cols].apply(apply_boxcox)
        X_test_copy[continuous_cols] = X_test_copy[continuous_cols].apply(apply_boxcox)
    elif trans_name == "QuantileTransformer":
        qt = QuantileTransformer(output_distribution='normal', random_state=42)
        X_train_copy[continuous_cols] = qt.fit_transform(X_train_copy[continuous_cols])
        X_test_copy[continuous_cols] = qt.transform(X_test_copy[continuous_cols])
    elif trans_name == "Normalization":
        norm = Normalizer()
        X_train_copy[continuous_cols] = norm.fit_transform(X_train_copy[continuous_cols])
        X_test_copy[continuous_cols] = norm.transform(X_test_copy[continuous_cols])
        
    
    transformed_train[trans_name] = X_train_copy
    transformed_test[trans_name] = X_test_copy



In [25]:
transformed_train.keys(), transformed_test.keys()

(dict_keys(['Original', 'PowerTransformer', 'Log1p', 'Sqrt', 'BoxCox', 'QuantileTransformer', 'Normalization']),
 dict_keys(['Original', 'PowerTransformer', 'Log1p', 'Sqrt', 'BoxCox', 'QuantileTransformer', 'Normalization']))

Logistic Regression

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.001, 0.01, 0.1],
    'penalty': ['l1', 'l2']
}

grid_search_results = {}

print("======= Grid Search For Every Transformation using Logistic Regression =======")
for trans_name in transformation_names:
    print(f"\nTransformation: {trans_name}")
    lr = LogisticRegression(solver='liblinear', max_iter=1000)
    grid = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
   
    grid.fit(transformed_train[trans_name], y_train)
    test_score = grid.score(transformed_test[trans_name], y_test)
    
    grid_search_results[trans_name] = {
        'best_params': grid.best_params_,
        'test_score': test_score
    }
    
    print(f"Test Score: {test_score:.4f}")
    print(f"Best Parameters: {grid.best_params_}")

best_trans = None
best_test_score = -np.inf
for trans, result in grid_search_results.items():
    if result['test_score'] > best_test_score:
        best_test_score = result['test_score']
        best_trans = trans

print("\n========== Best Transformation for Logistic Regression ==========")
print(f"Transformation: {best_trans}")
print(f"Test Score: {grid_search_results[best_trans]['test_score']:.4f}")
print(f"Best Parameters: {grid_search_results[best_trans]['best_params']}")


Transformation: Original
Test Score: 0.8655
Best Parameters: {'C': 0.01, 'penalty': 'l1'}

Transformation: PowerTransformer
Test Score: 0.8661
Best Parameters: {'C': 0.1, 'penalty': 'l2'}

Transformation: Log1p
Test Score: 0.8663
Best Parameters: {'C': 0.1, 'penalty': 'l1'}

Transformation: Sqrt
Test Score: 0.8657
Best Parameters: {'C': 0.01, 'penalty': 'l1'}

Transformation: BoxCox
Test Score: 0.8608
Best Parameters: {'C': 0.1, 'penalty': 'l1'}

Transformation: QuantileTransformer
Test Score: 0.8661
Best Parameters: {'C': 0.1, 'penalty': 'l2'}

Transformation: Normalization
Test Score: 0.8615
Best Parameters: {'C': 0.01, 'penalty': 'l1'}

Transformation: Log1p
Test Score: 0.8663
Best Parameters: {'C': 0.1, 'penalty': 'l1'}


Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

param_grid_rf = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
}

grid_search_results_rf = {}

print("======= Grid Search For Every Transformation using Random Forest =======")
for trans_name in transformation_names:
    print(f"\nTransformation: {trans_name}")
    
    rf = RandomForestClassifier(random_state=42)
    
    grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
    
    
    grid_rf.fit(transformed_train[trans_name], y_train)
    test_score_rf = grid_rf.score(transformed_test[trans_name], y_test)
    
    grid_search_results_rf[trans_name] = {
        'best_params': grid_rf.best_params_,
        'test_score': test_score_rf
    }
    
    print(f"Test Score: {test_score_rf:.4f}")
    print(f"Best Parameters: {grid_rf.best_params_}")

best_trans_rf = None
best_test_score_rf = -np.inf
for trans, result in grid_search_results_rf.items():
    if result['test_score'] > best_test_score_rf:
        best_test_score_rf = result['test_score']
        best_trans_rf = trans

print("\n========== Best Transformation for Random Forest ==========")
print(f"Transformation: {best_trans_rf}")
print(f"Test Score: {grid_search_results_rf[best_trans_rf]['test_score']:.4f}")
print(f"Best Parameters: {grid_search_results_rf[best_trans_rf]['best_params']}")


Transformation: Original
Test Score: 0.8668
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}

Transformation: PowerTransformer
Test Score: 0.8668
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}

Transformation: Log1p
Test Score: 0.8668
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}

Transformation: Sqrt
Test Score: 0.8668
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}

Transformation: BoxCox
Test Score: 0.8635
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}

Transformation: QuantileTransformer
Test Score: 0.8668
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 500}

Transformation: Normalization
Test Score: 0.8641
Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}

Transformation: Original
Test Score: 0.8668
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_es

In [28]:
from xgboost import XGBClassifier

param_grid_xgb = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_search_results_xgb = {}

print("======= Grid Search For Every Transformation using XGBoost =======")
for trans_name in transformation_names:
    print(f"\nTransformation: {trans_name}")
    
    xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    
    grid_xgb = GridSearchCV(xgb_clf, param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
    
 
    grid_xgb.fit(transformed_train[trans_name], y_train)
    test_score_xgb = grid_xgb.score(transformed_test[trans_name], y_test)
    
    grid_search_results_xgb[trans_name] = {
        'best_params': grid_xgb.best_params_,
        'test_score': test_score_xgb
    }
    
    print(f"Test Score: {test_score_xgb:.4f}")
    print(f"Best Parameters: {grid_xgb.best_params_}")

best_trans_xgb = None
best_test_score_xgb = -np.inf
for trans, result in grid_search_results_xgb.items():
    if result['test_score'] > best_test_score_xgb:
        best_test_score_xgb = result['test_score']
        best_trans_xgb = trans

print("\n========== Best Transformation for XGBoost ==========")
print(f"Transformation: {best_trans_xgb}")
print(f"Test Score: {grid_search_results_xgb[best_trans_xgb]['test_score']:.4f}")
print(f"Best Parameters: {grid_search_results_xgb[best_trans_xgb]['best_params']}")


Transformation: Original


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8674
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

Transformation: PowerTransformer


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8674
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

Transformation: Log1p


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8674
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

Transformation: Sqrt


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8674
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

Transformation: BoxCox


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8532
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

Transformation: QuantileTransformer


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8674
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}

Transformation: Normalization


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Score: 0.8652
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.8}

Transformation: Original
Test Score: 0.8674
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
