In [None]:
import pandas as pd
import os
from datetime import datetime

import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, r2_score, roc_auc_score, mean_absolute_error, confusion_matrix, classification_report 

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [None]:
def metrics(txt_file, y_true, y_pred, y_prob):
    """Print a classification report and append the same report to ``txt_file``.

    Every report line is echoed to stdout and written to ``txt_file`` followed
    by a newline. Each metric is computed exactly once and the resulting value
    is used for both destinations.

    Parameters
    ----------
    txt_file : object with a ``write(str)`` method
        Destination for the textual report (the caller in this file passes an
        open text file).
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, compared against ``y_true`` by every metric except
        ROC AUC.
    y_prob : array-like
        Scores handed to ``roc_auc_score`` (the caller in this file passes
        column 1 of ``predict_proba``).

    Returns
    -------
    None
    """
    def emit(line):
        # Mirror a single report line to the console and to the results file.
        print(line)
        txt_file.write(f'{line}\n')

    # The header string already ends with a newline, so it is written verbatim
    # (print adds one more, leaving a blank line on the console only).
    header = 'Model Metrics: \n'
    print(header)
    txt_file.write(header)

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred)

    emit(f'Accuracy: {accuracy}')
    emit(f'F1 Score: {f1}')
    emit(f'Cohen Kappa Score: {kappa}')
    emit(f'Confusion Matrix:\n {cm}')
    emit(f'Classification Report:\n {cr}')

    # These three are evaluated only when their line is reached, so a failure
    # in one of them (e.g. ROC AUC raising) still leaves every earlier line
    # printed and written.
    r2 = r2_score(y_true, y_pred)
    emit(f'R2 : {r2}')
    mae = mean_absolute_error(y_true, y_pred)
    emit(f'MAE : {mae}')
    auc = roc_auc_score(y_true, y_prob)
    emit(f'ROC AUC Score: {auc}')

In [None]:
def remove_outliers(df, column, lower_threshold=None, upper_threshold=None):
    """Return the rows of ``df`` whose ``column`` value lies within the given bounds.

    Both bounds are inclusive. A bound left as ``None`` is not applied. The
    frame shape is printed before and after filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the column the bounds are compared against.
    lower_threshold : number, optional
        Rows with ``df[column] < lower_threshold`` are dropped.
    upper_threshold : number, optional
        Rows with ``df[column] > upper_threshold`` are dropped.

    Returns
    -------
    pandas.DataFrame
        The filtered frame (``df`` itself when neither bound is given).
    """
    print(f'{column} - Size before removing outliers: {df.shape}')
    # Test against None rather than truthiness: a bound of 0 is a legitimate
    # threshold and must not be skipped.
    if lower_threshold is not None:
        df = df[df[column] >= lower_threshold]
    if upper_threshold is not None:
        df = df[df[column] <= upper_threshold]
    print(f'{column} -Size after removing outliers: {df.shape}')

    return df

In [None]:
def preprocess_data(data):
    """Filter out-of-range records and derive the feature frame.

    Steps, in order:
      1. keep rows with ``ap_hi`` in [50, 160], then ``ap_lo`` in [20, 110];
      2. drop the ``id`` column;
      3. add ``BMI = weight / (height / 100) ** 2``;
      4. replace ``age`` by ``age / 365.25`` truncated to ``int16``
         (presumably days -> whole years; confirm against the dataset docs);
      5. drop ``weight`` and ``height``;
      6. keep rows with ``BMI`` in [14, 50].

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``id``, ``age``, ``height``, ``weight``, ``ap_hi`` and
        ``ap_lo``; any other columns are passed through untouched.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with ``BMI`` appended as the last column.
    """
    # (column, lower bound, upper bound) for the blood-pressure filters.
    pressure_limits = (("ap_hi", 50, 160), ("ap_lo", 20, 110))
    for pressure_column, lowest, highest in pressure_limits:
        data = remove_outliers(data, pressure_column, lowest, highest)

    features = data.drop(columns=["id"])
    body_mass_index = features["weight"] / (features["height"] / 100) ** 2
    age_converted = (features["age"] / 365.25).astype('int16')

    # assign() appends BMI at the end and overwrites age in its existing slot.
    features = features.assign(BMI=body_mass_index, age=age_converted)
    features = features.drop(columns=["weight", "height"])

    return remove_outliers(features, "BMI", 14, 50)

In [None]:
def _candidate_models():
    """Return ``{name: (estimator, param_grid)}`` for every classifier tuned by ``model``."""
    return {
        'randomForest': (
            RandomForestClassifier(random_state=42),
            {
                'n_estimators': [100, 300, 500],
                'max_depth': [None, 50, 100],
                'criterion': ['gini', 'entropy'],
            },
        ),
        'gradientBoosting': (
            GradientBoostingClassifier(random_state=42),
            {
                'learning_rate': [0.1, 0.01, 0.001],
                'n_estimators': [100, 300, 500],
                'max_depth': [3, 5, 10],
            },
        ),
        'catBoost': (
            CatBoostClassifier(random_state=42, verbose=0),
            {
                'depth': [3, 5, 7],
                'learning_rate': [0.1, 0.01, 0.001],
                'iterations': [100, 300, 500],
            },
        ),
    }


def _tune_and_evaluate(model_name, estimator, param_grid, cv, splits, run_dir, txt_file):
    """Grid-search one classifier, save the best fitted pipeline and report its test metrics."""
    X_train, X_test, y_train, y_test = splits

    # The scaler lives inside the estimator being searched, so it is re-fitted
    # on the training part of every CV fold (no leakage into validation folds)
    # and the very same preprocessing is applied again at prediction time.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', estimator)
    ])
    search = GridSearchCV(
        estimator=pipe,
        param_grid={f'clf__{key}': values for key, values in param_grid.items()},
        cv=cv,
        scoring='f1_weighted',
    )

    print(datetime.now())
    print("Training started")
    search.fit(X_train, y_train)
    print(datetime.now())

    # Strip the pipeline step prefix so the log shows the plain parameter names.
    best_params = {key.split('__', 1)[1]: value for key, value in search.best_params_.items()}

    print(f'Model: {model_name}')
    txt_file.write(f'Model: {model_name}\n')
    print(f'Best params: {best_params}')
    txt_file.write(f'Best params: {best_params}\n')
    print(f'Best validation score: {search.best_score_}')
    txt_file.write(f'Best validation score: {search.best_score_}\n')

    # GridSearchCV (refit=True by default) has already re-fitted the winning
    # pipeline on the full training split; fitting it again is unnecessary.
    best_model = search.best_estimator_

    model_save_path = os.path.join(run_dir, f'{model_name}.joblib')
    joblib.dump(best_model, model_save_path)
    print(f'Best model saved at: {model_save_path}')
    txt_file.write(f'Best model saved at: {model_save_path}\n')

    y_test_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]
    metrics(txt_file, y_test, y_test_pred, y_prob)


def model(data_path='cardio_train.csv', result_path='./results'):
    """Tune, save and evaluate every candidate classifier on the cardio dataset.

    A timestamped folder is created under ``result_path``. For each candidate
    a grid search (5-fold shuffled ``KFold``, ``f1_weighted`` scoring) is run
    on an 80 % training split. The best fitted pipeline (``StandardScaler``
    followed by the classifier) is dumped to ``<model_name>.joblib`` in that
    folder, and its metrics on the held-out 20 % split are printed and
    appended to ``results.txt``.

    Parameters
    ----------
    data_path : str, optional
        Semicolon-separated CSV to load. It must provide the columns used by
        ``preprocess_data`` plus the ``cardio`` target column.
    result_path : str, optional
        Directory under which the timestamped run folder is created.

    Returns
    -------
    None
    """
    folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_dir = os.path.join(result_path, folder_name)
    os.makedirs(run_dir, exist_ok=True)
    print('Results will be saved in: ', folder_name)

    # The context manager guarantees the report is flushed and closed even if
    # loading, training or scoring raises part-way through.
    with open(os.path.join(run_dir, 'results.txt'), 'a') as txt_file:
        txt_file.write('Simple Model\n')

        data = pd.read_csv(data_path, sep=';')
        transformed_data = preprocess_data(data)

        X = transformed_data.drop('cardio', axis=1)
        y = transformed_data['cardio']

        # Order of the returned pieces: X_train, X_test, y_train, y_test.
        splits = train_test_split(X, y, test_size=0.2, random_state=42)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        candidates = _candidate_models()
        print("Models Configured")

        for model_name, (estimator, param_grid) in candidates.items():
            _tune_and_evaluate(model_name, estimator, param_grid, kf, splits, run_dir, txt_file)

In [None]:
# Bracket the full training run with wall-clock timestamps so its duration
# can be read straight off the cell output.
run_started_at = datetime.now()
print(run_started_at)
model()
run_finished_at = datetime.now()
print(run_finished_at)