In [None]:
import pandas as pd
import os
from datetime import datetime

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from collections import Counter

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, r2_score, roc_auc_score, mean_absolute_error, confusion_matrix, classification_report 

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [None]:
def metrics(txt_file, y_true, y_pred, pos_label=None):
    """Compute classification metrics, print them and write them to ``txt_file``.

    Reports accuracy, F1, Cohen's kappa, the confusion matrix and the
    scikit-learn classification report.

    Parameters
    ----------
    txt_file : writable text file object
        Every reported line is also written here, each followed by a newline.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    pos_label : optional
        Label treated as the positive class by the F1 score. When ``None``
        it is inferred: ``'Yes'`` if that label occurs in ``y_true`` or
        ``y_pred`` (string-encoded targets), otherwise ``1`` (0/1-encoded
        targets).

    Returns
    -------
    None
    """
    if pos_label is None:
        # A hard-coded 'Yes' makes f1_score raise once the target has been
        # mapped to 0/1, so choose the positive label from what is present.
        observed_labels = set(y_true) | set(y_pred)
        pos_label = 'Yes' if 'Yes' in observed_labels else 1

    def report(line):
        # Mirror each result line to stdout and to the results file.
        print(line)
        txt_file.write(line + '\n')

    # The header keeps its original form: print() appends an extra newline,
    # the file gets the literal string only.
    print('Model Metrics: \n')
    txt_file.write('Model Metrics: \n')

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)
    kappa = cohen_kappa_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred)

    report(f'Accuracy: {accuracy}')
    report(f'F1 Score: {f1}')
    report(f'Cohen Kappa Score: {kappa}')
    report(f'Confusion Matrix:\n {cm}')
    report(f'Classification Report:\n {cr}')

In [None]:
def model(data_path='/kaggle/input/cardiovascular-diseases-risk-prediction-dataset/CVD_cleaned.csv',
          result_path='./results'):
    """Tune, train and evaluate several classifiers for heart-disease prediction.

    Reads the CSV at ``data_path``, encodes the ``Heart_Disease`` target as
    0/1, holds out 20% of the rows as a test set, and for each configured
    classifier (random forest, gradient boosting, CatBoost) runs a 5-fold
    grid search scored by F1. Preprocessing, min-max scaling and SMOTE
    oversampling are part of the pipeline being searched, so they are fit on
    the training portion of each fold only. The refit best pipeline is then
    evaluated on the held-out test set via ``metrics``.

    Parameters
    ----------
    data_path : str
        Location of the input CSV. Defaults to the Kaggle dataset path.
    result_path : str
        Directory under which a timestamped run folder is created.

    Side effects
    ------------
    Creates ``<result_path>/<YYYY-mm-dd_HH-MM-SS>/results.txt`` and appends
    the best parameters, best validation score and test metrics of every
    model to it; the same information is printed to stdout.

    Returns
    -------
    None
    """
    folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_dir = os.path.join(result_path, folder_name)
    os.makedirs(run_dir, exist_ok=True)
    print('Results will be saved in: ', folder_name)

    data = pd.read_csv(data_path)

    X = data.drop('Heart_Disease', axis=1)
    # scoring='f1' uses 1 as the positive class, hence the 0/1 encoding.
    y = data['Heart_Disease'].map({"Yes": 1, "No": 0})

    # handle_unknown='ignore': the encoder is now fit on training folds only,
    # so a category missing from a fold must not abort transform().
    categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
    numerical_pipeline = make_pipeline(StandardScaler())
    age_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']])
    )
    genhealth_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']])
    )
    checkup_pipeline = make_pipeline(
        OrdinalEncoder(categories=[['Within the past year', 'Within the past 2 years', 'Within the past 5 years', '5 or more years ago', 'Never']])
    )

    numerical_cols = ['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption', 'Green_Vegetables_Consumption', 'FriedPotato_Consumption']
    categorical_cols = ['Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex', 'Smoking_History']

    preprocessor = ColumnTransformer([
            ('num', numerical_pipeline, numerical_cols),
            ('cat', categorical_pipeline, categorical_cols),
            ('age', age_pipeline, ['Age_Category']),
            ('genhealth', genhealth_pipeline, ['General_Health']),
            ('checkup', checkup_pipeline, ['Checkup'])
        ], remainder='passthrough')

    # Split the raw frame BEFORE fitting any transformer so that no statistic
    # of the test rows (means, ranges, categories) leaks into training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    models = {
        'randomForest': RandomForestClassifier(random_state=42),
        'gradientBoosting': GradientBoostingClassifier(random_state=42),
        'catBoost': CatBoostClassifier(random_state=42, verbose=0)
    }

    param_grids = {
        'randomForest': {
            'n_estimators': [100, 300, 500],
            'max_depth': [None, 50, 100],
            'criterion': ['gini', 'entropy']
        },
        'gradientBoosting': {
            'learning_rate': [0.1, 0.01, 0.001],
            'n_estimators': [100, 300, 500],
            'max_depth': [3, 5, 10]
        },
        'catBoost': {
            'depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01, 0.001],
            'iterations': [100, 300, 500]
        }
    }

    # The context manager guarantees the results file is closed even when a
    # fit or a metric computation raises.
    with open(os.path.join(run_dir, 'results.txt'), 'a') as txt_file:
        txt_file.write('Simple Model\n')

        print("Models Configured")
        for model_name, estimator in models.items():
            # imblearn's Pipeline applies the sampler during fit only, so
            # SMOTE never touches validation folds or the test set. Searching
            # over the whole pipeline (instead of placing the grid search
            # after SMOTE) keeps synthetic samples out of the validation folds.
            pipe = Pipeline([
                ('preprocess', preprocessor),
                ('scaler', MinMaxScaler()),
                ('SMOTE', SMOTE(random_state=42)),
                ('clf', estimator)
            ])
            search_space = {
                f'clf__{param}': values
                for param, values in param_grids[model_name].items()
            }
            grid = GridSearchCV(estimator=pipe, param_grid=search_space, cv=kf, scoring='f1')

            print("Training started")
            grid.fit(X_train, y_train)
            print("Training is over")

            # Strip the 'clf__' step prefix so the report shows plain
            # hyper-parameter names.
            best_params = {
                name.split('__', 1)[1]: value
                for name, value in grid.best_params_.items()
            }

            print(f'Model: {model_name}')
            txt_file.write(f'Model: {model_name}\n')
            print(f'Best params: {best_params}')
            txt_file.write(f'Best params: {best_params}\n')
            print(f'Best validation score: {grid.best_score_}')
            txt_file.write(f'Best validation score: {grid.best_score_}\n')

            # GridSearchCV refits the best pipeline on the full training set
            # (refit=True by default); predicting through it applies the same
            # preprocessing and scaling the model was tuned and trained with.
            y_test_pred = grid.predict(X_test)
            metrics(txt_file, y_test, y_test_pred)

In [None]:
# Run the experiment: tunes and evaluates every configured classifier and
# writes the report to a timestamped folder under ./results.
model()