In [1]:
# Import Libraries
import os
import shlex
import subprocess
import sys

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def run_train_pipeline(input_file:str = None,
                       target_column:str = None,
                       input_type:str = "csv",
                       training_type:str = "clf",
                       engineer_new_features:bool = False,
                       output_base:str = "",
                       test_size:float = 0.2,
                       no_standard_scaling:bool = False,
                       feature_selection:bool = False,
                       feature_selection_method:str = "addition",
                       selectkbest_num_features: int = 32,
                       output_dir:str = None,
                       return_data:bool = False):
    """
    Build and run one invocation of the ``run_train_pipeline`` command-line tool.

    The keyword arguments are translated into CLI flags, the resulting command
    line is printed, and the external ``run_train_pipeline`` executable is run
    as a child process. The arguments are handed over as a list (no shell is
    involved), so paths or column names containing spaces or shell
    metacharacters reach the program unchanged.

    Args:
        input_file (str): Path to the input data file. Required.
        target_column (str): Name of the target column in the input data.
            Required.
        input_type (str, optional): Type of input file format (csv or tsv).
            Defaults to "csv".
            NOTE(review): accepted for interface compatibility but not
            forwarded to the command line; confirm whether the CLI exposes a
            matching flag.
        training_type (str, optional): Value passed to ``--training_type``;
            this notebook uses "clf" (classification) and "reg" (regression).
            Defaults to "clf".
        engineer_new_features (bool, optional): When True, adds the
            ``--engineer_new_features`` flag. Defaults to False.
        output_base (str, optional): Base name for most output files. When
            empty (or None), ``training_type`` is used instead. Defaults to "".
        test_size (float, optional): Fraction of the dataset to be used for
            testing. Defaults to 0.2.
        no_standard_scaling (bool, optional): When True, adds the
            ``--no_standard_scaling`` flag. Defaults to False.
        feature_selection (bool, optional): When True, adds the
            ``--feature_selection`` flag. Defaults to False.
        feature_selection_method (str, optional): Choice between recursive
            feature "addition" and "elimination" (classification).
            Defaults to "addition".
        selectkbest_num_features (int, optional): Number of top features to
            select (regression). Defaults to 32.
        output_dir (str, optional): Custom name of the output folder. When
            None, ``--output_dir`` is omitted instead of passing the literal
            text "None". Defaults to None.
        return_data (bool, optional): Documented as "include raw, training and
            test data in the output folders". Defaults to False.
            NOTE(review): accepted but not forwarded to the command line;
            confirm whether the CLI exposes a matching flag.

    Returns:
        None. A message is printed when the child process exits with a
        non-zero status.

    Raises:
        ValueError: If ``input_file`` or ``target_column`` is None.
        OSError: If the ``run_train_pipeline`` executable cannot be started
            (for example, it is not on PATH).
    """
    # Without these two the command would contain the literal text "None".
    if input_file is None or target_column is None:
        raise ValueError("input_file and target_column must both be provided")

    # Token order mirrors the original command string so the printed command
    # is unchanged for the calls already made in this notebook.
    command = [
        "run_train_pipeline",
        "--input_file", str(input_file),
        "--target_column", str(target_column),
        "--selectkbest_num_features", str(selectkbest_num_features),
        "--training_type", str(training_type),
        "--test_size", str(test_size),
        "--feature_selection_method", str(feature_selection_method),
    ]
    if output_dir is not None:
        command += ["--output_dir", str(output_dir)]

    # Boolean switches are only present when enabled.
    if feature_selection:
        command.append("--feature_selection")
    if engineer_new_features:
        command.append("--engineer_new_features")
    if no_standard_scaling:
        command.append("--no_standard_scaling")

    # An empty base name falls back to the training type.
    command += ["--output_base", str(output_base or training_type)]

    # Show a copy-pasteable version of what is about to run.
    print(" ".join(shlex.quote(token) for token in command))

    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        # Surface failures instead of discarding the exit status.
        print(f"run_train_pipeline exited with status {completed.returncode}")

In [5]:
# Example usage: a single regression run on the full dataset with every
# option spelled out explicitly.
run_train_pipeline(
    input_file="data/data.csv",
    input_type="csv",
    target_column="incidence",
    training_type="reg",
    output_dir="outputs",
    output_base="",
    test_size=0.2,
    no_standard_scaling=False,
    engineer_new_features=False,
    feature_selection=False,
    feature_selection_method="addition",
    selectkbest_num_features=32,
)


run_train_pipeline --input_file data/data.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir outputs --output_base reg


100%|██████████| 7/7 [05:38<00:00, 48.38s/it] 


Best model: Random Forest
Best model cross validation score: 0.9531815048874028


In [6]:
# Load the full dataset; the regression and classification CSVs prepared
# later in this notebook are all derived from this frame.
# NOTE(review): the rendered preview lists an "Unnamed: 0" column. If the CSV
# really stores a saved row index (rather than this being a display artifact),
# it is carried into every derived dataset as a feature -- confirm, and
# consider reading with index_col=0.
data = pd.read_csv("data/data.csv")
data.head()

Unnamed: 0,year,country,precipitation,AvMeanSurAirTemp,AvMaxSurAirTemp,AvMinSurAirTemp,incidence,longitude,latitude,group
0,2000,Congo,1644.79,24.49,29.03,19.99,353.41557,15.827659,-0.228021,high incidence
1,2001,Congo,1516.01,24.68,29.22,20.19,350.93625,15.827659,-0.228021,high incidence
2,2002,Congo,1717.96,24.76,29.3,20.26,321.67402,15.827659,-0.228021,high incidence
3,2003,Congo,1573.98,24.73,29.27,20.23,319.21132,15.827659,-0.228021,high incidence
4,2004,Congo,1507.59,24.83,29.37,20.33,317.81208,15.827659,-0.228021,high incidence


In [7]:
# Prepare Data
# Folders that receive the derived datasets.
classification_path = "data/classification"
regression_path = "data/regression"
os.makedirs(classification_path, exist_ok=True)
os.makedirs(regression_path, exist_ok=True)

# --- Regression variants: the "group" column is always dropped ---
reg1 = data.drop(columns=["group"])                     # keeps country and year
reg2 = data.drop(columns=["group", "country"])          # no country
reg3 = data.drop(columns=["group", "year"])             # no year
reg4 = data.drop(columns=["group", "year", "country"])  # neither

reg1.to_csv("data/regression/reg1.csv", index=False)
reg2.to_csv("data/regression/reg2.csv", index=False)
reg3.to_csv("data/regression/reg3.csv", index=False)
reg4.to_csv("data/regression/reg4.csv", index=False)

# --- Classification variants: the "incidence" column is always dropped ---
clf1 = data.drop(columns=["incidence"])                     # keeps country and year
clf2 = data.drop(columns=["incidence", "country"])          # no country
clf3 = data.drop(columns=["incidence", "year"])             # no year
clf4 = data.drop(columns=["incidence", "year", "country"])  # neither

clf1.to_csv("data/classification/clf1.csv", index=False)
clf2.to_csv("data/classification/clf2.csv", index=False)
clf3.to_csv("data/classification/clf3.csv", index=False)
clf4.to_csv("data/classification/clf4.csv", index=False)

## Experiment 1: How does the Year and Country Info Affect Model Performance?
### A. Regression

In [8]:
# Configure Experiments
# Output folder for the Experiment 1 regression runs.
experiment1_output_path = "data/regression/Experiment1"
os.makedirs(experiment1_output_path, exist_ok=True)

# Experiment label -> regression dataset variant to train on.
experiment_config = dict(
    [
        ("with year and country", "data/regression/reg1.csv"),
        ("with year and without country", "data/regression/reg2.csv"),
        ("without year and with country", "data/regression/reg3.csv"),
        ("without year and without country", "data/regression/reg4.csv"),
    ]
)

In [9]:
# Run Experiments
# One regression run per dataset variant; each run writes to a folder named
# after the CSV's base name (text before the first dot).
for experiment_name, data_path in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} data")
    outdir = os.path.join(
        experiment1_output_path,
        os.path.basename(data_path).partition(".")[0],
    )
    run_train_pipeline(
        input_file=data_path,
        input_type="csv",
        target_column="incidence",
        training_type="reg",
        output_dir=outdir,
        output_base="",
        test_size=0.2,
        no_standard_scaling=False,
        engineer_new_features=False,
        feature_selection=False,
        feature_selection_method="addition",
        selectkbest_num_features=32,
    )

  0%|          | 0/4 [00:00<?, ?it/s]


Training Models with year and country data
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/reg1 --output_base reg


100%|██████████| 7/7 [07:16<00:00, 62.35s/it] 


Best model: Random Forest
Best model cross validation score: 0.9553503102274707

Training Models with year and without country data
run_train_pipeline --input_file data/regression/reg2.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/reg2 --output_base reg


100%|██████████| 7/7 [04:23<00:00, 37.61s/it]


Best model: CatBoosting Regressor
Best model cross validation score: 0.9538056179981247

Training Models without year and with country data
run_train_pipeline --input_file data/regression/reg3.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/reg3 --output_base reg


100%|██████████| 7/7 [06:05<00:00, 52.15s/it] 


Best model: Gradient Boosting
Best model cross validation score: 0.8748943182645273

Training Models without year and without country data
run_train_pipeline --input_file data/regression/reg4.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/reg4 --output_base reg


100%|██████████| 7/7 [03:34<00:00, 30.70s/it]


Best model: CatBoosting Regressor
Best model cross validation score: 0.8714057203433427


It seems making use of both the year and country columns affects the model performance in a positive way. While removing the year column affects the models very badly, removing the country column affects it only slightly. This may be due to the fact that other country data such as longitude and latitude are still being used. If similar results are produced in classification, then the country column can be discarded. 

### B. Classification

In [10]:
# Configure Experiments
# Output folder for the Experiment 1 classification runs.
experiment1_output_path = "data/classification/Experiment1"
os.makedirs(experiment1_output_path, exist_ok=True)

# Experiment label -> classification dataset variant to train on.
experiment_config = dict(
    [
        ("with year and country", "data/classification/clf1.csv"),
        ("with year and without country", "data/classification/clf2.csv"),
        ("without year and with country", "data/classification/clf3.csv"),
        ("without year and without country", "data/classification/clf4.csv"),
    ]
)

In [11]:
# Run Experiments
# One classification run per dataset variant; each run writes to a folder
# named after the CSV's base name (text before the first dot).
for experiment_name, data_path in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} data")
    outdir = os.path.join(
        experiment1_output_path,
        os.path.basename(data_path).partition(".")[0],
    )
    run_train_pipeline(
        input_file=data_path,
        input_type="csv",
        target_column="group",
        training_type="clf",
        output_dir=outdir,
        output_base="",
        test_size=0.2,
        no_standard_scaling=False,
        engineer_new_features=False,
        feature_selection=False,
        feature_selection_method="addition",
        selectkbest_num_features=32,
    )

  0%|          | 0/4 [00:00<?, ?it/s]


Training Models with year and country data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/clf1 --output_base clf


100%|██████████| 9/9 [06:38<00:00, 44.33s/it]


Best model: MLPClassifier
Best model cross validation score: 0.9685672434312919

Training Models with year and without country data
run_train_pipeline --input_file data/classification/clf2.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/clf2 --output_base clf


100%|██████████| 9/9 [05:01<00:00, 33.55s/it]


Best model: XGBClassifier
Best model cross validation score: 0.9576764625102994

Training Models without year and with country data
run_train_pipeline --input_file data/classification/clf3.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/clf3 --output_base clf


100%|██████████| 9/9 [05:11<00:00, 34.62s/it]


Best model: MLPClassifier
Best model cross validation score: 0.9256394763343405

Training Models without year and without country data
run_train_pipeline --input_file data/classification/clf4.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/clf4 --output_base clf


100%|██████████| 9/9 [04:38<00:00, 30.98s/it]


Best model: Random Forest
Best model cross validation score: 0.922012267692026


It seems that the country and year columns affect classification the same way they affect regression. However, removing the country column seems to drop the accuracy significantly. Therefore, for comparative studies, we will keep both the year and country columns.

## Experiment 2: How Does Data Normalization Affect Model Performance?
### A. Regression

In [13]:
# Configure Experiments
# Output folder for the Experiment 2 regression runs.
experiment2_output_path = "data/regression/Experiment2"
os.makedirs(experiment2_output_path, exist_ok=True)

# Experiment label -> scaling switch and the folder that run writes to.
experiment_config = {
    "with normalization": {
        "no_standard_scaling": False,
        "output_dir": "data/regression/Experiment2/standard_scaling",
    },
    "without normalization": {
        "no_standard_scaling": True,
        "output_dir": "data/regression/Experiment2/no_standard_scaling",
    },
}

In [14]:
# Run Experiments
# Same regression dataset for every run; only the scaling switch and the
# output folder come from the experiment configuration.
data_path = "data/regression/reg1.csv"
for experiment_name, config in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} data")
    outdir = config["output_dir"]
    no_standard_scaling = config["no_standard_scaling"]
    run_train_pipeline(
        input_file=data_path,
        input_type="csv",
        target_column="incidence",
        training_type="reg",
        output_dir=outdir,
        output_base="",
        test_size=0.2,
        no_standard_scaling=no_standard_scaling,
        engineer_new_features=False,
        feature_selection=False,
        feature_selection_method="addition",
        selectkbest_num_features=32,
    )

  0%|          | 0/2 [00:00<?, ?it/s]


Training Models data with normalization
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment2/standard_scaling --output_base reg


100%|██████████| 7/7 [06:24<00:00, 55.00s/it] 


Best model: Random Forest
Best model cross validation score: 0.9553503102274707

Training Models data without normalization
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment2/no_standard_scaling --no_standard_scaling --output_base reg


100%|██████████| 7/7 [06:15<00:00, 53.66s/it] 


Best model: Random Forest
Best model cross validation score: 0.9566985202775051


### B. Classification

In [15]:
# Configure Experiments
# Output folder for the Experiment 2 classification runs.
experiment2_output_path = "data/classification/Experiment2"
os.makedirs(experiment2_output_path, exist_ok=True)

# Experiment label -> scaling switch and the folder that run writes to.
experiment_config = {
    "with normalization": {
        "no_standard_scaling": False,
        "output_dir": "data/classification/Experiment2/standard_scaling",
    },
    "without normalization": {
        "no_standard_scaling": True,
        "output_dir": "data/classification/Experiment2/no_standard_scaling",
    },
}

In [18]:
# Run Experiments
# Same classification dataset for every run; only the scaling switch and the
# output folder come from the experiment configuration.
data_path = "data/classification/clf1.csv"
for experiment_name, config in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} data")
    outdir = config["output_dir"]
    no_standard_scaling = config["no_standard_scaling"]
    run_train_pipeline(
        input_file=data_path,
        input_type="csv",
        target_column="group",
        training_type="clf",
        output_dir=outdir,
        output_base="",
        test_size=0.2,
        no_standard_scaling=no_standard_scaling,
        engineer_new_features=False,
        feature_selection=False,
        feature_selection_method="addition",
        selectkbest_num_features=32,
    )

  0%|          | 0/2 [00:00<?, ?it/s]


Training Models with normalization data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/Experiment2/standard_scaling --output_base clf


100%|██████████| 9/9 [05:59<00:00, 39.96s/it]


Best model: MLPClassifier
Best model cross validation score: 0.9685672434312919

Training Models without normalization data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/Experiment2/no_standard_scaling --no_standard_scaling --output_base clf


100%|██████████| 9/9 [06:22<00:00, 42.51s/it]


Best model: CatBoosting Classifier
Best model cross validation score: 0.9661484940034789


From the above results, normalization has only a marginal effect: the best regression score is slightly lower with normalization (0.9554 vs. 0.9567 without it), while the best classification score is slightly higher (0.9686 vs. 0.9661 without it). However, in research findings, normalization has been shown to generally yield better results. Therefore, we will apply normalization in our project.

## Experiment 3: How Does Feature Engineering Affect Model Performance?
### A. Regression

In [21]:
# Configure Experiments
# Output folder for the Experiment 3 regression runs.
experiment3_output_path = "data/regression/Experiment3"
os.makedirs(experiment3_output_path, exist_ok=True)

# Experiment label -> feature-engineering switch and the folder that run writes to.
experiment_config = {
    "with featuring engineering": {
        "engineer_new_features": True,
        "output_dir": "data/regression/Experiment3/feature-engineering",
    },
    "without feature engineering": {
        "engineer_new_features": False,
        "output_dir": "data/regression/Experiment3/no-feature-engineering",
    },
}

In [22]:
# Run Experiments
# Same regression dataset for every run; only the feature-engineering switch
# and the output folder come from the experiment configuration.
data_path = "data/regression/reg1.csv"
for experiment_name, config in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} on data")
    outdir = config["output_dir"]
    engineer_new_features = config["engineer_new_features"]
    run_train_pipeline(
        input_file=data_path,
        input_type="csv",
        target_column="incidence",
        training_type="reg",
        output_dir=outdir,
        output_base="",
        test_size=0.2,
        no_standard_scaling=False,
        engineer_new_features=engineer_new_features,
        feature_selection=False,
        feature_selection_method="addition",
        selectkbest_num_features=32,
    )

  0%|          | 0/2 [00:00<?, ?it/s]


Training Models with featuring engineering on data
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment3/feature-engineering --engineer_new_features --output_base reg


100%|██████████| 7/7 [38:52<00:00, 333.18s/it]  


Best model: CatBoosting Regressor
Best model cross validation score: 0.9535993097575188

Training Models without feature engineering on data
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment3/no-feature-engineering --output_base reg


100%|██████████| 7/7 [06:04<00:00, 52.13s/it] 


Best model: Random Forest
Best model cross validation score: 0.9553503102274707


### B. Classification

In [23]:
# Configure Experiments
# Output folder for the Experiment 3 classification runs.
experiment3_output_path = "data/classification/Experiment3"
os.makedirs(experiment3_output_path, exist_ok=True)

# Experiment label -> feature-engineering switch and the folder that run writes to.
experiment_config = {
    "with featuring engineering": {
        "engineer_new_features": True,
        "output_dir": "data/classification/Experiment3/feature-engineering",
    },
    "without feature engineering": {
        "engineer_new_features": False,
        "output_dir": "data/classification/Experiment3/no-feature-engineering",
    },
}

In [24]:
# Run Experiments
# Same classification dataset for every run; only the feature-engineering
# switch and the output folder come from the experiment configuration.
data_path = "data/classification/clf1.csv"
for experiment_name, config in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} on data")
    outdir = config["output_dir"]
    engineer_new_features = config["engineer_new_features"]
    run_train_pipeline(
        input_file=data_path,
        input_type="csv",
        target_column="group",
        training_type="clf",
        output_dir=outdir,
        output_base="",
        test_size=0.2,
        no_standard_scaling=False,
        engineer_new_features=engineer_new_features,
        feature_selection=False,
        feature_selection_method="addition",
        selectkbest_num_features=32,
    )

  0%|          | 0/2 [00:00<?, ?it/s]


Training Models with featuring engineering on data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/Experiment3/feature-engineering --engineer_new_features --output_base clf


100%|██████████| 9/9 [23:55<00:00, 159.46s/it]


Best model: XGBClassifier
Best model cross validation score: 0.9673551222191706

Training Models without feature engineering on data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/Experiment3/no-feature-engineering --output_base clf


100%|██████████| 9/9 [06:22<00:00, 42.54s/it]


Best model: MLPClassifier
Best model cross validation score: 0.9685672434312919


Apparently, feature engineering does not improve results. In fact, using the original dataset performs better than engineering new features. However, we will try to perform feature selection on the engineered features to see if the performance increases. Normally, feature selection could be applied to the original dataset as well, but our dataset already has a small number of features.

## Experiment 4: How does Feature Selection Affect Model Performance?
Since we already have results for no feature selection, we will only perform experiments for the different feature selection methods and options.
### A. Regression

In [46]:
# Configure Experiments
# Output folder for the Experiment 4 regression runs.
experiment4_output_path = "data/regression/Experiment4"
os.makedirs(experiment4_output_path, exist_ok=True)

# Experiment label -> switches, output-folder prefix, and the feature counts
# to try (one run per entry in "num_features").
experiment_config = {
    "with featuring selection": dict(
        [
            ("feature_selection", True),
            ("output_dir", "data/regression/Experiment4/feature_selection"),
            ("engineer_new_features", True),
            ("num_features", [8, 16, 32, 64]),
        ]
    ),
    "without feature selection": dict(
        [
            ("feature_selection", False),
            ("output_dir", "data/regression/Experiment4/no_feature_selection"),
            ("engineer_new_features", False),
            ("num_features", [32]),
        ]
    ),
}

In [47]:
# Run Experiments
# For each configuration, launch one regression run per requested feature
# count; the count is appended to the configured output-folder prefix.
data_path = "data/regression/reg1.csv"
for experiment_name, config in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} on data")

    engineer_new_features = config["engineer_new_features"]
    feature_selection = config["feature_selection"]
    num_features_list = config["num_features"]

    for num_features in num_features_list:
        outdir = f'{config["output_dir"]}_{num_features}'
        run_train_pipeline(
            input_file=data_path,
            input_type="csv",
            target_column="incidence",
            training_type="reg",
            output_dir=outdir,
            output_base="",
            test_size=0.2,
            no_standard_scaling=False,
            engineer_new_features=engineer_new_features,
            feature_selection=feature_selection,
            feature_selection_method="addition",
            selectkbest_num_features=num_features,
        )

  0%|          | 0/2 [00:00<?, ?it/s]


Training Models with featuring selection on data
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 8 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment4/feature_selection_8 --feature_selection --engineer_new_features --output_base reg


100%|██████████| 7/7 [04:46<00:00, 40.86s/it]


Best model: XGBRegressor
Best model cross validation score: 0.842113285417654
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 16 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment4/feature_selection_16 --feature_selection --engineer_new_features --output_base reg


100%|██████████| 7/7 [09:09<00:00, 78.57s/it] 


Best model: XGBRegressor
Best model cross validation score: 0.8745290372774303
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment4/feature_selection_32 --feature_selection --engineer_new_features --output_base reg


100%|██████████| 7/7 [15:12<00:00, 130.39s/it]


Best model: Random Forest
Best model cross validation score: 0.9321241616037724
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 64 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment4/feature_selection_64 --feature_selection --engineer_new_features --output_base reg


100%|██████████| 7/7 [22:57<00:00, 196.82s/it]


Best model: XGBRegressor
Best model cross validation score: 0.9565538811032365

Training Models without feature selection on data
run_train_pipeline --input_file data/regression/reg1.csv --target_column incidence --selectkbest_num_features 32 --training_type reg --test_size 0.2 --feature_selection_method addition --output_dir data/regression/Experiment4/no_feature_selection_32 --output_base reg


100%|██████████| 7/7 [06:00<00:00, 51.51s/it] 


Best model: Random Forest
Best model cross validation score: 0.9553503102274707


### B. Classification

In [61]:
# Configure Experiments
# Output folder for the Experiment 4 classification runs.
experiment4_output_path = "data/classification/Experiment4"
os.makedirs(experiment4_output_path, exist_ok=True)

# Experiment label -> switches, output-folder prefix, and the selection
# methods to try (one run per entry in "feature_selection_method").
experiment_config = {
    "without feature selection": dict(
        [
            ("feature_selection", False),
            ("output_dir", "data/classification/Experiment4/no_feature_selection"),
            ("engineer_new_features", False),
            ("feature_selection_method", ["addition"]),
        ]
    ),
    "with featuring selection": dict(
        [
            ("feature_selection", True),
            ("output_dir", "data/classification/Experiment4/feature_selection"),
            ("engineer_new_features", True),
            ("feature_selection_method", ["addition", "elimination"]),
        ]
    ),
}

In [62]:
# Run Experiments
# For each configuration, launch one classification run per selection method;
# the method name is appended to the configured output-folder prefix.
data_path = "data/classification/clf1.csv"
for experiment_name, config in tqdm(experiment_config.items()):
    print(f"\nTraining Models {experiment_name} on data")

    engineer_new_features = config["engineer_new_features"]
    feature_selection = config["feature_selection"]
    feature_selection_methods = config["feature_selection_method"]

    for feature_selection_method in feature_selection_methods:
        outdir = f'{config["output_dir"]}_{feature_selection_method}'
        run_train_pipeline(
            input_file=data_path,
            input_type="csv",
            target_column="group",
            training_type="clf",
            output_dir=outdir,
            output_base="",
            test_size=0.2,
            no_standard_scaling=False,
            engineer_new_features=engineer_new_features,
            feature_selection=feature_selection,
            feature_selection_method=feature_selection_method,
            selectkbest_num_features=32,
        )

  0%|          | 0/2 [00:00<?, ?it/s]


Training Models without feature selection on data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/Experiment4/no_feature_selection_addition --output_base clf


100%|██████████| 9/9 [05:57<00:00, 39.76s/it]


Best model: MLPClassifier
Best model cross validation score: 0.9685672434312919

Training Models with featuring selection on data
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method addition --output_dir data/classification/Experiment4/feature_selection_addition --feature_selection --engineer_new_features --output_base clf


100%|██████████| 9/9 [02:06<00:00, 14.08s/it]


Best model: Random Forest
Best model cross validation score: 0.926847935548842
run_train_pipeline --input_file data/classification/clf1.csv --target_column group --selectkbest_num_features 32 --training_type clf --test_size 0.2 --feature_selection_method elimination --output_dir data/classification/Experiment4/feature_selection_elimination --feature_selection --engineer_new_features --output_base clf


100%|██████████| 9/9 [02:08<00:00, 14.32s/it]


Best model: Random Forest
Best model cross validation score: 0.926847935548842


In [19]:
# Visualize Experimental Results For Publishing
# Check research papers for the types of plots