# Workflow

In [9]:
import pandas as pd
import optuna
import ast
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef, recall_score, precision_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

data_dir = '/home/darshana/Projects/druggable_proteins/processed_dataset'
feature_engineered_data_dir = '/home/darshana/Projects/druggable_proteins/feature_engineered_dataset'

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)


def evaluate_model(name, model, X_train, y_train, X_test, y_test, results_dataframe, feature_type):
    # evaluate model
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    accuracy = scores.mean()

    # fit the model on the training set
    model.fit(X_train, y_train)

    # predict the test set results
    y_pred = model.predict(X_test)

    # compute the confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # calculate precision, recall (sensitivity), f1-score
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # calculate specificity
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn+fp)

    # calculate MCC
    mcc = matthews_corrcoef(y_test, y_pred)

    temp_df = pd.DataFrame({
        'feature_type': feature_type, 
        'model': name, 
        'with_hypertuning': False,
        'best_params': 'None',
        'accuracy': accuracy, 
        'sensitivity': recall, 
        'specificity': specificity, 
        'precision': precision, 
        'f1': f1, 
        'mcc': mcc,
        'index': f'{feature_type}_{name}_no_hypertuning'
        }, index=['index'])
    # results_dataframe is an empty dataframe to store results with the columns feature_type, model, with_hypertuning, accuracy, sensitivity, specificity, precision, f1, mcc
    return pd.concat([results_dataframe, temp_df])


def optimize_hyperparameters(name, model, objective, trials, results_dataframe, feature_type, X_train, y_train, X_test, y_test):
    def optuna_objective(trial):
        params = objective(trial)
        model_instance = model(**params)
        model_instance.fit(X_train, y_train)
        y_pred = model_instance.predict(X_test)

        # compute the confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        # calculate precision, recall (sensitivity), f1-score
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # calculate specificity
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn+fp)

        # calculate MCC
        mcc = matthews_corrcoef(y_test, y_pred)

        accuracy = (tp + tn) / (tp + tn + fp + fn)

        # Set user attributes
        trial.set_user_attr("precision", precision)
        trial.set_user_attr("recall", recall)
        trial.set_user_attr("f1", f1)
        trial.set_user_attr("specificity", specificity)
        trial.set_user_attr("mcc", mcc)

        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(optuna_objective, n_trials=trials)

    temp_df = pd.DataFrame({
        'feature_type': feature_type, 
        'model': name, 
        'with_hypertuning': True,
        'best_params': [str(study.best_trial.params)],
        'accuracy': study.best_trial.value, 
        'sensitivity': study.best_trial.user_attrs['recall'], 
        'specificity': study.best_trial.user_attrs['specificity'], 
        'precision': study.best_trial.user_attrs['precision'], 
        'f1': study.best_trial.user_attrs['f1'], 
        'mcc': study.best_trial.user_attrs['mcc'],
        'index': f'{feature_type}_{name}_with_hypertuning'
        }, index=['index'])
    results_dataframe = pd.concat([results_dataframe, temp_df])
    return results_dataframe


# Define models
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'XGBClassifier': XGBClassifier(),
    'LGBMClassifier': LGBMClassifier()
}

models_ = {
    'LogisticRegression': LogisticRegression,
    'SVC': SVC,
    'XGBClassifier': XGBClassifier,
    'LGBMClassifier': LGBMClassifier
}

# Define objectives for hyperparameters tuning
objectives = {
    'LogisticRegression': lambda trial: {
        'C': trial.suggest_float('C', 1e-2, 1e-1),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000)
    },
    'SVC': lambda trial: {
        'C': trial.suggest_float('svc_c', 1e-2, 1e2),
        'gamma': trial.suggest_float('svc_gamma', 1e-2, 1e2),
    },
    'XGBClassifier': lambda trial: {
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.3),
        'max_depth': trial.suggest_int("max_depth", 2, 6),
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000)
    },
    'LGBMClassifier': lambda trial: {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000)
    }
}

# Without Feature Selection

In [11]:
# empty dataframe to store results with the columns feature_type, model, with_hypertuning, accuracy, sensitivity, specificity, precision, f1, mcc
results = pd.DataFrame(columns=['feature_type', 'model', 'with_hypertuning', 'best_params', 'accuracy', 'sensitivity', 'specificity', 'precision', 'f1', 'mcc', 'index'])
feature_types = ['AAC', 'APAAC', 'CTD', 'DPC', 'PAAC']
for feature_type in feature_types:

    # Load the training dataset
    data = pd.read_csv(f'{data_dir}/TR_{feature_type}.csv')

    # Separate features and target
    X = data.drop(columns=['label', 'id'], axis=1)
    y = data['label']

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Evaluate models without hyperparameters tuning
    for name, model in models.items():
        print(f"Evaluating {feature_type} {name}")
        results = evaluate_model(name, model, X_train, y_train, X_test, y_test, results, feature_type)
        print(results)

    # Optimize hyperparameters
    for name, model in models_.items():
        objective = objectives.get(name)
        if objective is not None:
            print(f"Optimizing {feature_type} {name}")
            results = optimize_hyperparameters(name, model, objective, trials=100, results_dataframe=results, feature_type=feature_type, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            print(results)

results.to_csv('results_v2.csv', index=False)

# With Feature Selection

In [None]:
# empty dataframe to store results with the columns feature_type, model, with_hypertuning, accuracy, sensitivity, specificity, precision, f1, mcc
results = pd.DataFrame(columns=['feature_type', 'model', 'with_hypertuning', 'best_params', 'accuracy', 'sensitivity', 'specificity', 'precision', 'f1', 'mcc', 'index'])
feature_types = ['selected_features_all_best20', 'selected_features_all_best30', 'selected_features_all_best50', 'selected_features_all_best100']
for feature_type in feature_types:

    # Load the training dataset
    data = pd.read_csv(f'{feature_engineered_data_dir}/TR_{feature_type}.csv')

    # Separate features and target
    X = data.drop(columns=['label', 'id'], axis=1)
    y = data['label']

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Evaluate models without hyperparameters tuning
    for name, model in models.items():
        print(f"Evaluating {feature_type} {name}")
        results = evaluate_model(name, model, X_train, y_train, X_test, y_test, results, feature_type)
        print(results)

    # Optimize hyperparameters
    for name, model in models_.items():
        objective = objectives.get(name)
        if objective is not None:
            print(f"Optimizing {feature_type} {name}")
            results = optimize_hyperparameters(name, model, objective, trials=100, results_dataframe=results, feature_type=feature_type, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            print(results)

results.to_csv(f'{feature_engineered_data_dir}/results_20&30&50&100.csv', index=False)

Evaluating selected_features_all_best20 LogisticRegression
                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \

      best_params  accuracy  sensitivity  specificity  precision        f1   
index        None   0.86163     0.832653     0.850575   0.839506  0.836066  \

           mcc                                              index  
index  0.68342  selected_features_all_best20_LogisticRegressio...  
Evaluating selected_features_all_best20 SVC
                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   

      best_params  accuracy  sensitivity  specificity  precision        f1   
index        None  0.861630     0.832653     0.850575   0.839506  0.836066  \
index        None  0.884356     0.816327     0.915709   0.900901  

[32m[I 2023-05-14 11:32:32,517][0m A new study created in memory with name: no-name-3df58edf-44c6-4cf2-9751-20d6cafa617a[0m
[32m[I 2023-05-14 11:32:32,628][0m Trial 0 finished with value: 0.857707509881423 and parameters: {'C': 0.07403839557995674, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 496}. Best is trial 0 with value: 0.857707509881423.[0m
[32m[I 2023-05-14 11:32:32,657][0m Trial 1 finished with value: 0.8537549407114624 and parameters: {'C': 0.0810939833636358, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 522}. Best is trial 0 with value: 0.857707509881423.[0m
[32m[I 2023-05-14 11:32:32,684][0m Trial 2 finished with value: 0.857707509881423 and parameters: {'C': 0.08162923941197098, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 802}. Best is trial 0 with value: 0.857707509881423.[0m


                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   

      best_params  accuracy  sensitivity  specificity  precision        f1   
index        None  0.861630     0.832653     0.850575   0.839506  0.836066  \
index        None  0.884356     0.816327     0.915709   0.900901  0.856531   
index        None  0.865564     0.840816     0.908046   0.895652  0.867368   
index        None  0.870999     0.832653     0.919540   0.906667  0.868085   

            mcc                                              index  
index  0.683420  selected_features_all_best20_LogisticRegressio...  
index  0.737224    selected_features_all_best20_SVC_no_hypertuning  
index  0.751600  sele

[32m[I 2023-05-14 11:32:32,872][0m Trial 3 finished with value: 0.8596837944664032 and parameters: {'C': 0.06441953223187927, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 574}. Best is trial 3 with value: 0.8596837944664032.[0m
[32m[I 2023-05-14 11:32:32,897][0m Trial 4 finished with value: 0.857707509881423 and parameters: {'C': 0.040412568834002696, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 939}. Best is trial 3 with value: 0.8596837944664032.[0m
[32m[I 2023-05-14 11:32:32,923][0m Trial 5 finished with value: 0.857707509881423 and parameters: {'C': 0.026477898873890016, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 184}. Best is trial 3 with value: 0.8596837944664032.[0m
[32m[I 2023-05-14 11:32:33,059][0m Trial 6 finished with value: 0.8616600790513834 and parameters: {'C': 0.03342895231698814, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 773}. Best is trial 6 with value: 0.8616600790513834.[0m
[32m[I 2023-05-14 11:32:33,079][0m Trial 7 finished with

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   

                                             best_params  accuracy   
index                                               None  0.861630  \
index                                               None  0.884356   
index                                               None  0.865564   
index                                               None  0.870999   
index  {'C': 0.05772645471274076, 'penalty': 'l1', 's...  0.863636   

       sensitivity  specificity  precision        f1       mcc   
index     0.832653     0.850575   0.839506  0.836066  0.

[32m[I 2023-05-14 11:32:39,517][0m Trial 0 finished with value: 0.5177865612648221 and parameters: {'svc_c': 10.7533945146195, 'svc_gamma': 73.75896312119299}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:32:39,983][0m Trial 1 finished with value: 0.5177865612648221 and parameters: {'svc_c': 13.280062430077226, 'svc_gamma': 79.73462726837084}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:32:40,413][0m Trial 2 finished with value: 0.5177865612648221 and parameters: {'svc_c': 88.65711768594213, 'svc_gamma': 70.64852112597094}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:32:40,929][0m Trial 3 finished with value: 0.5177865612648221 and parameters: {'svc_c': 18.089833877835254, 'svc_gamma': 67.08274284473659}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:32:41,457][0m Trial 4 finished with value: 0.5177865612648221 and parameters: {'svc_c': 86.76762802466011, 'svc_gamm

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   

                                             best_params  accuracy   
index                                               None  0.861630  \
index                                               None  0.884356   
index                                               None  0.865564   
index                                               None  0.870999   
index  {'C': 0.05772645471274076, 'penalty': 'l1', 's...  0.863636   
index  {'svc_c': 95.94113392322917, 'svc_gamma'

[32m[I 2023-05-14 11:33:19,624][0m Trial 0 finished with value: 0.8656126482213439 and parameters: {'learning_rate': 0.07874189857592591, 'max_depth': 6, 'n_estimators': 154}. Best is trial 0 with value: 0.8656126482213439.[0m
[32m[I 2023-05-14 11:33:20,400][0m Trial 1 finished with value: 0.8596837944664032 and parameters: {'learning_rate': 0.1705788115114957, 'max_depth': 3, 'n_estimators': 484}. Best is trial 0 with value: 0.8656126482213439.[0m
[32m[I 2023-05-14 11:33:20,881][0m Trial 2 finished with value: 0.8557312252964426 and parameters: {'learning_rate': 0.07877682785006325, 'max_depth': 2, 'n_estimators': 451}. Best is trial 0 with value: 0.8656126482213439.[0m
[32m[I 2023-05-14 11:33:21,344][0m Trial 3 finished with value: 0.8537549407114624 and parameters: {'learning_rate': 0.14347569237428598, 'max_depth': 4, 'n_estimators': 268}. Best is trial 0 with value: 0.8656126482213439.[0m
[32m[I 2023-05-14 11:33:22,881][0m Trial 4 finished with value: 0.8715415019762

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   

                                             best_params  accuracy   
index                                               None  0.861630  \
index                                               None  0.884356   
index                                               None  0.865564   
index                                               None  0.870999   
index  {'C': 0.05772645471274076, 'penalt

[32m[I 2023-05-14 11:35:53,434][0m Trial 0 finished with value: 0.8754940711462451 and parameters: {'num_leaves': 103, 'max_depth': 11, 'learning_rate': 0.03276897858896604, 'n_estimators': 488}. Best is trial 0 with value: 0.8754940711462451.[0m
[32m[I 2023-05-14 11:35:54,516][0m Trial 1 finished with value: 0.8794466403162056 and parameters: {'num_leaves': 139, 'max_depth': 18, 'learning_rate': 0.20546953893784983, 'n_estimators': 633}. Best is trial 1 with value: 0.8794466403162056.[0m
[32m[I 2023-05-14 11:35:55,226][0m Trial 2 finished with value: 0.8715415019762845 and parameters: {'num_leaves': 45, 'max_depth': 10, 'learning_rate': 0.060968532596519846, 'n_estimators': 334}. Best is trial 1 with value: 0.8794466403162056.[0m
[32m[I 2023-05-14 11:35:56,912][0m Trial 3 finished with value: 0.8735177865612648 and parameters: {'num_leaves': 211, 'max_depth': 48, 'learning_rate': 0.046618425613169104, 'n_estimators': 312}. Best is trial 1 with value: 0.8794466403162056.[0m

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   

                                             best_params  accuracy   
index                                               None  0.861630  \
index                                               None  0.884356   
index                                               None  0.865564   
index                              

[32m[I 2023-05-14 11:38:23,091][0m A new study created in memory with name: no-name-c0c0039d-afa3-4213-8124-122d20179b0c[0m
[32m[I 2023-05-14 11:38:23,130][0m Trial 0 finished with value: 0.8893280632411067 and parameters: {'C': 0.09688479766360383, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 289}. Best is trial 0 with value: 0.8893280632411067.[0m


                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   

           

[32m[I 2023-05-14 11:38:23,387][0m Trial 1 finished with value: 0.8893280632411067 and parameters: {'C': 0.08105103415050194, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 562}. Best is trial 0 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:38:23,415][0m Trial 2 finished with value: 0.8873517786561265 and parameters: {'C': 0.046218227368261915, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 907}. Best is trial 0 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:38:23,444][0m Trial 3 finished with value: 0.8873517786561265 and parameters: {'C': 0.05199988061090262, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 492}. Best is trial 0 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:38:23,967][0m Trial 4 finished with value: 0.8853754940711462 and parameters: {'C': 0.06651836663093466, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 333}. Best is trial 0 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:38:24,171][0m Trial 5 finished wit

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:38:34,655][0m Trial 0 finished with value: 0.5177865612648221 and parameters: {'svc_c': 45.48258663520493, 'svc_gamma': 29.982864410821676}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:38:35,190][0m Trial 1 finished with value: 0.5177865612648221 and parameters: {'svc_c': 69.03665719583451, 'svc_gamma': 63.58316045628349}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:38:35,674][0m Trial 2 finished with value: 0.5138339920948617 and parameters: {'svc_c': 10.981700362263517, 'svc_gamma': 92.44976306738184}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:38:36,196][0m Trial 3 finished with value: 0.5177865612648221 and parameters: {'svc_c': 45.78783115598922, 'svc_gamma': 6.90138642611082}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:38:36,757][0m Trial 4 finished with value: 0.5177865612648221 and parameters: {'svc_c': 11.490446606635224, 'svc_gam

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:39:22,560][0m Trial 0 finished with value: 0.883399209486166 and parameters: {'learning_rate': 0.15309337268637435, 'max_depth': 3, 'n_estimators': 993}. Best is trial 0 with value: 0.883399209486166.[0m
[32m[I 2023-05-14 11:39:23,772][0m Trial 1 finished with value: 0.8893280632411067 and parameters: {'learning_rate': 0.22028169260786543, 'max_depth': 3, 'n_estimators': 496}. Best is trial 1 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:39:24,947][0m Trial 2 finished with value: 0.8853754940711462 and parameters: {'learning_rate': 0.14942829827562668, 'max_depth': 4, 'n_estimators': 392}. Best is trial 1 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:39:26,892][0m Trial 3 finished with value: 0.8932806324110671 and parameters: {'learning_rate': 0.04132576060911622, 'max_depth': 4, 'n_estimators': 502}. Best is trial 3 with value: 0.8932806324110671.[0m
[32m[I 2023-05-14 11:39:29,048][0m Trial 4 finished with value: 0.88932806324110

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:42:48,639][0m Trial 0 finished with value: 0.8893280632411067 and parameters: {'num_leaves': 151, 'max_depth': 46, 'learning_rate': 0.124478597224429, 'n_estimators': 1893}. Best is trial 0 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:42:50,784][0m Trial 1 finished with value: 0.8913043478260869 and parameters: {'num_leaves': 137, 'max_depth': 18, 'learning_rate': 0.13594225171487398, 'n_estimators': 1698}. Best is trial 1 with value: 0.8913043478260869.[0m
[32m[I 2023-05-14 11:42:51,333][0m Trial 2 finished with value: 0.8972332015810277 and parameters: {'num_leaves': 141, 'max_depth': 11, 'learning_rate': 0.25414532320145833, 'n_estimators': 187}. Best is trial 2 with value: 0.8972332015810277.[0m
[32m[I 2023-05-14 11:42:54,517][0m Trial 3 finished with value: 0.8853754940711462 and parameters: {'num_leaves': 219, 'max_depth': 45, 'learning_rate': 0.060002349042560445, 'n_estimators': 1210}. Best is trial 2 with value: 0.8972332015810277.[0

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:46:12,598][0m A new study created in memory with name: no-name-c3ef2b8a-55f4-4ddc-b565-7e3c9cc3467b[0m
[32m[I 2023-05-14 11:46:12,670][0m Trial 0 finished with value: 0.8774703557312253 and parameters: {'C': 0.08582067856630485, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 790}. Best is trial 0 with value: 0.8774703557312253.[0m


                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:46:13,055][0m Trial 1 finished with value: 0.8814229249011858 and parameters: {'C': 0.04575595001292701, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 614}. Best is trial 1 with value: 0.8814229249011858.[0m
[32m[I 2023-05-14 11:46:13,096][0m Trial 2 finished with value: 0.8794466403162056 and parameters: {'C': 0.05986810018960859, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 825}. Best is trial 1 with value: 0.8814229249011858.[0m
[32m[I 2023-05-14 11:46:13,126][0m Trial 3 finished with value: 0.8636363636363636 and parameters: {'C': 0.014235066072406884, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 353}. Best is trial 1 with value: 0.8814229249011858.[0m
[32m[I 2023-05-14 11:46:13,163][0m Trial 4 finished with value: 0.8814229249011858 and parameters: {'C': 0.01703248903658563, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 984}. Best is trial 1 with value: 0.8814229249011858.[0m
[32m[I 2023-05-14 11:46:13,202][0m Trial 5 finishe

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:46:37,959][0m Trial 0 finished with value: 0.5177865612648221 and parameters: {'svc_c': 40.793552289548295, 'svc_gamma': 12.001104843654636}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:46:38,414][0m Trial 1 finished with value: 0.5138339920948617 and parameters: {'svc_c': 58.707983744808175, 'svc_gamma': 99.07600808314264}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:46:39,087][0m Trial 2 finished with value: 0.5177865612648221 and parameters: {'svc_c': 17.864641376687583, 'svc_gamma': 25.242611630690515}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:46:39,641][0m Trial 3 finished with value: 0.5177865612648221 and parameters: {'svc_c': 98.56400962744742, 'svc_gamma': 44.6188513506334}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:46:40,244][0m Trial 4 finished with value: 0.5177865612648221 and parameters: {'svc_c': 45.454149872922336, 'svc_

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:47:35,034][0m Trial 0 finished with value: 0.8873517786561265 and parameters: {'learning_rate': 0.2747358549724469, 'max_depth': 4, 'n_estimators': 206}. Best is trial 0 with value: 0.8873517786561265.[0m
[32m[I 2023-05-14 11:47:38,764][0m Trial 1 finished with value: 0.8932806324110671 and parameters: {'learning_rate': 0.049433721318135074, 'max_depth': 3, 'n_estimators': 927}. Best is trial 1 with value: 0.8932806324110671.[0m
[32m[I 2023-05-14 11:47:40,534][0m Trial 2 finished with value: 0.8893280632411067 and parameters: {'learning_rate': 0.15622721022045377, 'max_depth': 5, 'n_estimators': 190}. Best is trial 1 with value: 0.8932806324110671.[0m
[32m[I 2023-05-14 11:47:42,826][0m Trial 3 finished with value: 0.8774703557312253 and parameters: {'learning_rate': 0.1511430892067905, 'max_depth': 2, 'n_estimators': 996}. Best is trial 1 with value: 0.8932806324110671.[0m
[32m[I 2023-05-14 11:47:45,294][0m Trial 4 finished with value: 0.8833992094861

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

[32m[I 2023-05-14 11:53:27,429][0m Trial 0 finished with value: 0.8774703557312253 and parameters: {'num_leaves': 227, 'max_depth': 43, 'learning_rate': 0.2778019303343961, 'n_estimators': 1461}. Best is trial 0 with value: 0.8774703557312253.[0m
[32m[I 2023-05-14 11:53:31,940][0m Trial 1 finished with value: 0.8893280632411067 and parameters: {'num_leaves': 223, 'max_depth': 16, 'learning_rate': 0.10290531634605304, 'n_estimators': 1215}. Best is trial 1 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:53:38,623][0m Trial 2 finished with value: 0.8853754940711462 and parameters: {'num_leaves': 106, 'max_depth': 27, 'learning_rate': 0.031657949034507527, 'n_estimators': 1008}. Best is trial 1 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:53:40,454][0m Trial 3 finished with value: 0.8814229249011858 and parameters: {'num_leaves': 7, 'max_depth': 40, 'learning_rate': 0.2804540099026719, 'n_estimators': 1662}. Best is trial 1 with value: 0.8893280632411067.[0m

                       feature_type               model with_hypertuning   
index  selected_features_all_best20  LogisticRegression            False  \
index  selected_features_all_best20                 SVC            False   
index  selected_features_all_best20       XGBClassifier            False   
index  selected_features_all_best20      LGBMClassifier            False   
index  selected_features_all_best20  LogisticRegression             True   
index  selected_features_all_best20                 SVC             True   
index  selected_features_all_best20       XGBClassifier             True   
index  selected_features_all_best20      LGBMClassifier             True   
index  selected_features_all_best30  LogisticRegression            False   
index  selected_features_all_best30                 SVC            False   
index  selected_features_all_best30       XGBClassifier            False   
index  selected_features_all_best30      LGBMClassifier            False   
index  selec

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                        feature_type               model with_hypertuning   
index   selected_features_all_best20  LogisticRegression            False  \
index   selected_features_all_best20                 SVC            False   
index   selected_features_all_best20       XGBClassifier            False   
index   selected_features_all_best20      LGBMClassifier            False   
index   selected_features_all_best20  LogisticRegression             True   
index   selected_features_all_best20                 SVC             True   
index   selected_features_all_best20       XGBClassifier             True   
index   selected_features_all_best20      LGBMClassifier             True   
index   selected_features_all_best30  LogisticRegression            False   
index   selected_features_all_best30                 SVC            False   
index   selected_features_all_best30       XGBClassifier            False   
index   selected_features_all_best30      LGBMClassifier            False   

[32m[I 2023-05-14 11:56:35,852][0m A new study created in memory with name: no-name-43b144b1-9ee0-4e09-9aa9-0b9af1db68db[0m
[32m[I 2023-05-14 11:56:35,963][0m Trial 0 finished with value: 0.8814229249011858 and parameters: {'C': 0.08534929086077442, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 755}. Best is trial 0 with value: 0.8814229249011858.[0m


                        feature_type               model with_hypertuning   
index   selected_features_all_best20  LogisticRegression            False  \
index   selected_features_all_best20                 SVC            False   
index   selected_features_all_best20       XGBClassifier            False   
index   selected_features_all_best20      LGBMClassifier            False   
index   selected_features_all_best20  LogisticRegression             True   
index   selected_features_all_best20                 SVC             True   
index   selected_features_all_best20       XGBClassifier             True   
index   selected_features_all_best20      LGBMClassifier             True   
index   selected_features_all_best30  LogisticRegression            False   
index   selected_features_all_best30                 SVC            False   
index   selected_features_all_best30       XGBClassifier            False   
index   selected_features_all_best30      LGBMClassifier            False   

[32m[I 2023-05-14 11:56:39,417][0m Trial 1 finished with value: 0.8853754940711462 and parameters: {'C': 0.06225657591442953, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 788}. Best is trial 1 with value: 0.8853754940711462.[0m
[32m[I 2023-05-14 11:56:39,514][0m Trial 2 finished with value: 0.8932806324110671 and parameters: {'C': 0.08263726939093811, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 623}. Best is trial 2 with value: 0.8932806324110671.[0m
[32m[I 2023-05-14 11:56:40,858][0m Trial 3 finished with value: 0.8952569169960475 and parameters: {'C': 0.08082468014153397, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 497}. Best is trial 3 with value: 0.8952569169960475.[0m
[32m[I 2023-05-14 11:56:40,941][0m Trial 4 finished with value: 0.8932806324110671 and parameters: {'C': 0.07663931196568836, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 320}. Best is trial 3 with value: 0.8952569169960475.[0m
[32m[I 2023-05-14 11:56:41,018][0m Trial 5 finished with

                        feature_type               model with_hypertuning   
index   selected_features_all_best20  LogisticRegression            False  \
index   selected_features_all_best20                 SVC            False   
index   selected_features_all_best20       XGBClassifier            False   
index   selected_features_all_best20      LGBMClassifier            False   
index   selected_features_all_best20  LogisticRegression             True   
index   selected_features_all_best20                 SVC             True   
index   selected_features_all_best20       XGBClassifier             True   
index   selected_features_all_best20      LGBMClassifier             True   
index   selected_features_all_best30  LogisticRegression            False   
index   selected_features_all_best30                 SVC            False   
index   selected_features_all_best30       XGBClassifier            False   
index   selected_features_all_best30      LGBMClassifier            False   

[32m[I 2023-05-14 11:58:18,108][0m Trial 0 finished with value: 0.5177865612648221 and parameters: {'svc_c': 78.12209306408987, 'svc_gamma': 26.10190874864247}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:58:18,822][0m Trial 1 finished with value: 0.5138339920948617 and parameters: {'svc_c': 56.51771659426793, 'svc_gamma': 93.67982138851708}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:58:19,444][0m Trial 2 finished with value: 0.5177865612648221 and parameters: {'svc_c': 70.16516705408472, 'svc_gamma': 19.471233381399788}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:58:20,054][0m Trial 3 finished with value: 0.5177865612648221 and parameters: {'svc_c': 6.87552501441381, 'svc_gamma': 42.74508324946356}. Best is trial 0 with value: 0.5177865612648221.[0m
[32m[I 2023-05-14 11:58:20,652][0m Trial 4 finished with value: 0.5138339920948617 and parameters: {'svc_c': 92.10256990605347, 'svc_gamma

                        feature_type               model with_hypertuning   
index   selected_features_all_best20  LogisticRegression            False  \
index   selected_features_all_best20                 SVC            False   
index   selected_features_all_best20       XGBClassifier            False   
index   selected_features_all_best20      LGBMClassifier            False   
index   selected_features_all_best20  LogisticRegression             True   
index   selected_features_all_best20                 SVC             True   
index   selected_features_all_best20       XGBClassifier             True   
index   selected_features_all_best20      LGBMClassifier             True   
index   selected_features_all_best30  LogisticRegression            False   
index   selected_features_all_best30                 SVC            False   
index   selected_features_all_best30       XGBClassifier            False   
index   selected_features_all_best30      LGBMClassifier            False   

[32m[I 2023-05-14 11:59:21,611][0m Trial 0 finished with value: 0.8893280632411067 and parameters: {'learning_rate': 0.21246312417174512, 'max_depth': 2, 'n_estimators': 616}. Best is trial 0 with value: 0.8893280632411067.[0m
[32m[I 2023-05-14 11:59:25,717][0m Trial 1 finished with value: 0.8972332015810277 and parameters: {'learning_rate': 0.05172888818693755, 'max_depth': 5, 'n_estimators': 403}. Best is trial 1 with value: 0.8972332015810277.[0m
[32m[I 2023-05-14 11:59:30,699][0m Trial 2 finished with value: 0.8932806324110671 and parameters: {'learning_rate': 0.05498622277393403, 'max_depth': 4, 'n_estimators': 496}. Best is trial 1 with value: 0.8972332015810277.[0m
[32m[I 2023-05-14 11:59:33,145][0m Trial 3 finished with value: 0.8873517786561265 and parameters: {'learning_rate': 0.2611868850156039, 'max_depth': 4, 'n_estimators': 354}. Best is trial 1 with value: 0.8972332015810277.[0m
[32m[I 2023-05-14 11:59:38,608][0m Trial 4 finished with value: 0.8932806324110

                        feature_type               model with_hypertuning   
index   selected_features_all_best20  LogisticRegression            False  \
index   selected_features_all_best20                 SVC            False   
index   selected_features_all_best20       XGBClassifier            False   
index   selected_features_all_best20      LGBMClassifier            False   
index   selected_features_all_best20  LogisticRegression             True   
index   selected_features_all_best20                 SVC             True   
index   selected_features_all_best20       XGBClassifier             True   
index   selected_features_all_best20      LGBMClassifier             True   
index   selected_features_all_best30  LogisticRegression            False   
index   selected_features_all_best30                 SVC            False   
index   selected_features_all_best30       XGBClassifier            False   
index   selected_features_all_best30      LGBMClassifier            False   

[32m[I 2023-05-14 12:06:48,046][0m Trial 0 finished with value: 0.8873517786561265 and parameters: {'num_leaves': 98, 'max_depth': 22, 'learning_rate': 0.26204601918854353, 'n_estimators': 444}. Best is trial 0 with value: 0.8873517786561265.[0m
[32m[I 2023-05-14 12:06:49,925][0m Trial 1 finished with value: 0.8913043478260869 and parameters: {'num_leaves': 37, 'max_depth': 32, 'learning_rate': 0.0747843128262703, 'n_estimators': 364}. Best is trial 1 with value: 0.8913043478260869.[0m
[32m[I 2023-05-14 12:06:53,805][0m Trial 2 finished with value: 0.8893280632411067 and parameters: {'num_leaves': 177, 'max_depth': 32, 'learning_rate': 0.14651735429553095, 'n_estimators': 1942}. Best is trial 1 with value: 0.8913043478260869.[0m
[32m[I 2023-05-14 12:06:56,951][0m Trial 3 finished with value: 0.8913043478260869 and parameters: {'num_leaves': 120, 'max_depth': 8, 'learning_rate': 0.09491243639916087, 'n_estimators': 1849}. Best is trial 1 with value: 0.8913043478260869.[0m
[

                        feature_type               model with_hypertuning   
index   selected_features_all_best20  LogisticRegression            False  \
index   selected_features_all_best20                 SVC            False   
index   selected_features_all_best20       XGBClassifier            False   
index   selected_features_all_best20      LGBMClassifier            False   
index   selected_features_all_best20  LogisticRegression             True   
index   selected_features_all_best20                 SVC             True   
index   selected_features_all_best20       XGBClassifier             True   
index   selected_features_all_best20      LGBMClassifier             True   
index   selected_features_all_best30  LogisticRegression            False   
index   selected_features_all_best30                 SVC            False   
index   selected_features_all_best30       XGBClassifier            False   
index   selected_features_all_best30      LGBMClassifier            False   

# Best Model with Full Training Dataset

In [None]:
# Load the results
results_without_selected_features = pd.read_csv('results_v2.csv')
results_with_selected_features = pd.read_csv(f'{feature_engineered_data_dir}/results_20&30&50&100.csv')

feature_types = ['AAC', 'APAAC', 'CTD', 'DPC', 'PAAC']
selected_feature_types = ['selected_features_all_best20', 'selected_features_all_best30', 'selected_features_all_best50', 'selected_features_all_best100']

# Combine the feature types
feature_types.extend(selected_feature_types)

test_results = []

# iterate through each row of results
for feature_type in feature_types:

    # Check if the feature type is selected features
    if 'selected_features' in feature_type:
        # Load the training dataset
        train_data = pd.read_csv(f'{feature_engineered_data_dir}/TR_{feature_type}.csv')
        test_data = pd.read_csv(f'{feature_engineered_data_dir}/TS_{feature_type}.csv')
        results = results_with_selected_features
    else:
        # Load the training dataset
        train_data = pd.read_csv(f'{data_dir}/TR_{feature_type}.csv')
        test_data = pd.read_csv(f'{data_dir}/TS_{feature_type}.csv')
        results = results_without_selected_features

    # Separate features and target
    X_train = train_data.drop(columns=['label', 'id'], axis=1)
    y_train = train_data['label']

    X_test = test_data.drop(columns=['label', 'id'], axis=1)
    y_test = test_data['label']

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # iterate through each model
    for name, model in models.items():
        # get the row of the model

        
        rows = results[(results['feature_type'] == feature_type) & (results['model'] == name)]

        # iterate through each row
        for index, row in rows.iterrows():

            # check whether the model has hyperparameters
            if row['with_hypertuning'] == True:
                hyperparameters = ast.literal_eval(row['best_params'])
                # check the model is SVC
                if row['model'] == 'SVC':
                    hyperparameters = {k[4:]: v for k, v in hyperparameters.items()}
                    # make key 'c' to 'C'
                    hyperparameters['C'] = hyperparameters.pop('c')
                # set best hyperparameters
                model.set_params(**hyperparameters)

            # fit model
            model.fit(X_train, y_train)

            # predict
            y_pred = model.predict(X_test)

            # evaluate using accuracy, sensitivity, specificity, precision, f1, mcc
            accuracy = accuracy_score(y_test, y_pred)
            sensitivity = recall_score(y_test, y_pred)
            specificity = recall_score(y_test, y_pred, pos_label=0)
            precision = precision_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            mcc = matthews_corrcoef(y_test, y_pred)

            # append to test_results
            test_results.append({'feature_type': feature_type, 'model': name, 'with_hypertuning': row['with_hypertuning'], 'best_params': row['best_params'], 'accuracy': accuracy, 'sensitivity': sensitivity, 'specificity': specificity, 'precision': precision, 'f1': f1, 'mcc': mcc, 'index': row['index']})
    print(f'Feature Type: {feature_type} done!')

test_results = pd.DataFrame(test_results)
test_results.to_csv('test_results.csv', index=False)

Feature Type: AAC done!
Feature Type: APAAC done!




Feature Type: CTD done!




Feature Type: DPC done!
Feature Type: PAAC done!
Feature Type: selected_features_all_best20 done!
Feature Type: selected_features_all_best30 done!




Feature Type: selected_features_all_best50 done!




Feature Type: selected_features_all_best100 done!


# Model Ensembling

In [1]:
#  Create a dictionary to store the trained models
trained_models = {}

feature_types = ['AAC', 'APAAC', 'DPC', 'PAAC']

# get the results
results = pd.read_csv('results_v2.csv')

# create an empty DataFrame to store the merged dataset
merged_train_data = pd.DataFrame()
merged_test_data = pd.DataFrame()

# iterate through the feature types
for feature_type in feature_types:
    # Load the training dataset
    train_data = pd.read_csv(f'{, columns=data_dir}/TR_{feature_type}.csv')
    test_data = pd.read_csv(f'{data_dir}/TS_{feature_type}.csv')
    
    # Separate features and target
    X_train = train_data.drop(columns=['label', 'id'], axis=1)
    y_train = train_data['label']

    X_test = test_data.drop(columns=['label', 'id'], axis=1)
    y_test = test_data['label']
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    train_data = pd.concat([train_data['id'], train_data['label'], pd.DataFrame(X_train)], axis=1)
    test_data = pd.concat([test_data['id'], test_data['label'], pd.DataFrame(X_test)], axis=1)

    # check whether the merged dataset is empty
    if merged_train_data.empty:
        merged_train_data = train_data
        merged_test_data = test_data
    else:
        # assume 'id' is the common column
        merged_train_data = pd.merge(merged_train_data, train_data, on=['id', 'label'])
        merged_test_data = pd.merge(merged_test_data, test_data, on=['id', 'label'])

    # Get the best model for each feature type
    best_model = results[results['feature_type'] == feature_type].sort_values(by='accuracy', ascending=False).iloc[0]
    model = models[best_model['model']]
    # check whether the model has hyperparameters
    if best_model['with_hypertuning'] == True:
        hyperparameters = ast.literal_eval(best_model['best_params'])
        # check the model is SVC
        if best_model['model'] == 'SVC':
            hyperparameters = {k[4:]: v for k, v in hyperparameters.items()}
            # make key 'c' to 'C'
            hyperparameters['C'] = hyperparameters.pop('c')
        # set best hyperparameters
        model.set_params(**hyperparameters)
    # fit model
    model.fit(X_train, y_train)
    # append to trained_models 
    trained_models[feature_type] = model

# Create the ensemble model
ensemble = VotingClassifier(estimators=list(trained_models.items()), voting='hard')

# Seperate features and target
X_train = merged_train_data.drop(columns=['label', 'id'], axis=1)
y_train = merged_train_data['label']

X_test = merged_test_data.drop(columns=['label', 'id'], axis=1)
y_test = merged_test_data['label']

# fit model
ensemble.fit(X_train, y_train)

# predict
y_pred = ensemble.predict(X_test)

# evaluate using accuracy, sensitivity, specificity, precision, f1, mcc
accuracy = accuracy_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)
specificity = recall_score(y_test, y_pred, pos_label=0)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print(f'Accuracy: {accuracy}')

# Model Ensembling with Selected Features