In [None]:
# Import required python packages
import numpy as np
import pandas as pd
import importlib
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
import time
import matplotlib.pyplot as plt

In [None]:
# import required helper functions
from helper_functions import methods_prediction as pred_meth
from helper_functions import methods_sampling as sampling_meth
from helper_functions import keras_NN as keras
from helper_functions import methods_prediction as pred_meth
from helper_functions import control_methods as meth_control
from helper_functions import feature_engineering as feature_eng
from helper_functions import methods_PrNrn as meth
from helper_functions import test_grids
from helper_functions import parametergrids
from helper_functions import params

In [None]:
# Import the datasets to test/predict on.
# Every file name starts with an escaped backslash ('\\'); a bare '\S' is an
# invalid escape sequence that newer Python versions warn about.
df_sqa = pd.read_csv(params.filepath_project_folder + '\\SQA_full_prepro_data.csv', index_col=0)
df_features_full = pd.read_csv(params.filepath_project_folder + '\\features_full.csv', index_col=0)
df_features_full_reduced = pd.read_csv(params.filepath_project_folder + '\\features_full_reduced.csv', index_col=0)
df_features_kaco = pd.read_csv(params.filepath_project_folder + '\\features_kaco.csv', index_col=0)
df_features_kaco_reduced = pd.read_csv(params.filepath_project_folder + '\\features_kaco_reduced.csv', index_col=0)

# Engineer additional datasets to test/predict on:
# apply feature_eng.remove_mess first, then feature_eng.remove_audit on its result.
df_features_kaco_noAudit_noMess = feature_eng.remove_audit(feature_eng.remove_mess(df_features_kaco))

In [None]:
# Define the datasets to iterate through and the display name of each one.
# Name and frame are declared together as one pair so the two lists below
# always stay aligned.
named_feature_sets = [
    ('df_features_kaco_reduced', df_features_kaco_reduced),
]

feature_dfs = [pair[1] for pair in named_feature_sets]
feature_dfs_names = [pair[0] for pair in named_feature_sets]

In [None]:
# Import the parameter grids for the meta parameters and the model parameters.
# Both grid modules are reloaded first so that edits made to them on disk are
# picked up without restarting the kernel.
for grid_module in (test_grids, parametergrids):
    importlib.reload(grid_module)

meta_paramgrid = test_grids.meta_paramgrid
model_paramgrid = test_grids.paramgrid_binary_RF

In [None]:
# Create the empty dataframes that collect the prediction results.

# One row per run: dataset, meta parameters, model parameters and scores.
result_columns = [
    'feature_set', 'model', 'cut_off_high', 'cut_off_low', 'delete',
    'threshhold_late', 'target_col', 'sampling', 'sample_frac', 'cv_num',
    'pred_certainty', 'model_params', 'y_test', 'y_pred', 'f1', 'f2',
    'precision', 'recall', 'cm', 'timestamp',
]

# Model parameters logged for the neural-network classifier.
nn_param_columns = [
    'layer_depth', 'layer_architecture', 'input_activation_function',
    'input_neurons', 'dropout', 'hidden_neurons',
    'hidden_activation_function', 'output_activation_function',
    'output_neurons', 'optimizer', 'epochs', 'batch_size',
]

# Model parameters logged for the XGBoost classifier.
xgb_param_columns = [
    'booster', 'eta', 'gamma', 'max_depth', 'lambda', 'alpha',
    'tree_method', 'num_parallel_tree',
]

df_results = pd.DataFrame(columns=result_columns)
df_results_bin_NN = pd.DataFrame(columns=nn_param_columns)
df_results_bin_XGB = pd.DataFrame(columns=xgb_param_columns)

In [None]:
# Start the prediction
# Timestamp of this run; it becomes part of the result file names.
timestr = time.strftime("%Y%m%d-%H%M%S")
# Reload the control module so edits on disk are used without a kernel restart.
importlib.reload(meth_control)

# Choose the prediction model to use
model = 'binary_RF'


# Define save parameters.
# Backslashes are escaped explicitly. In a plain string literal '\b' is a
# backspace character (it silently corrupted the '...\binary_classifier_*'
# paths) and '\N' starts a named-unicode escape (a SyntaxError in Python 3).
if model == 'binary_RF':
    model_id = '\\RFClf'
    filepath_mod = params.filepath_project_folder + '\\prediction_models\\binary_classifier_RF'
    filepath_res = params.filepath_project_folder + '\\prediction_results\\binary_classifier_RF'

elif model == 'binary_NN':
    model_id = '\\NNClf'
    filepath_mod = params.filepath_project_folder + '\\prediction_models\\binary_classifier_NN'
    filepath_res = params.filepath_project_folder + '\\prediction_results\\binary_classifier_NN'

elif model == 'binary_XGB':
    model_id = '\\XGBClf'
    filepath_mod = params.filepath_project_folder + '\\prediction_models\\binary_classifier_XGB'
    filepath_res = params.filepath_project_folder + '\\prediction_results\\binary_classifier_XGB'

else:
    # Fail early instead of continuing with undefined (or stale) save parameters.
    raise ValueError("Unknown model '" + str(model) + "'; expected 'binary_RF', 'binary_NN' or 'binary_XGB'")


# Initiate counter
c = 0                    # row index of the next entry in the result dataframes
feature_set_counter = 0  # 1-based number of the dataset currently processed

print(time.strftime("%Y%m%d-%H%M%S"))

# Keys copied from the current parameter dicts into the result tables,
# listed in the column order of the respective dataframe.
nn_param_keys = ['layer_depth', 'layer_architecture', 'input_activation_function',
                 'input_neurons', 'dropout', 'hidden_neurons',
                 'hidden_activation_function', 'output_activation_function',
                 'output_neurons', 'optimizer', 'epochs', 'batch_size']
xgb_param_keys = ['booster', 'eta', 'gamma', 'max_depth', 'lambda', 'alpha',
                  'tree_method', 'num_parallel_tree']
meta_param_keys = ['cut_off_high', 'cut_off_low', 'delete', 'threshhold_late',
                   'target_col', 'sampling', 'sample_frac', 'cv_num', 'pred_certainty']

# Iterate over the datasets to compare
for df_features in feature_dfs:

    feature_set_counter += 1
    print('- Dataset ' + str(feature_set_counter) + '/' + str(len(feature_dfs)))

    # Iterate over the meta-parametergrid, e.g. sampling algorithms
    n_meta_runs = len(meta_paramgrid)
    for i in range(n_meta_runs):
        print('- Meta-Parameter run ' + str(i+1) + '/' + str(n_meta_runs))

        # Iterate over the model parameter grid (model hyperparameter)
        n_model_runs = len(model_paramgrid)
        for j in range(n_model_runs):
            print('- Model-Parameter run ' + str(j+1) + '/' + str(n_model_runs))

            # Parameter sets of the current run
            meta_params = meta_paramgrid[i]
            model_params = model_paramgrid[j]

            # Run the classification task matching the selected model and,
            # where a parameter table exists for it, log the model parameters
            if model == 'binary_NN':
                (trained_models, f1_scores, f2_scores, precision_scores, recall_scores,
                 cms, y_preds, y_tests, model_results) = meth_control.perform_binary_classification(
                    df_features, df_sqa, meta_params, model_params)

                df_results_bin_NN.loc[c] = [model_params[key] for key in nn_param_keys]

            elif model == 'binary_RF':
                (trained_models, f1_scores, f2_scores, precision_scores, recall_scores,
                 cms, y_preds, y_tests, model_results) = meth_control.perform_binary_classification_RF(
                    df_features, df_sqa, meta_params, model_params)

            elif model == 'binary_XGB':
                (trained_models, f1_scores, f2_scores, precision_scores, recall_scores,
                 cms, y_preds, y_tests, model_results) = meth_control.perform_binary_classification_XGBoost(
                    df_features, df_sqa, meta_params, model_params)

                df_results_bin_XGB.loc[c] = [model_params[key] for key in xgb_param_keys]

            # Save results in prepared DataFrame: run identification first,
            # then the meta parameters, then the model parameters and outcomes
            run_identification = [feature_dfs_names[feature_set_counter-1], model]
            run_meta_values = [meta_params[key] for key in meta_param_keys]
            run_outcomes = [model_params, y_tests, y_preds, f1_scores, f2_scores,
                            precision_scores, recall_scores, cms,
                            time.strftime("%Y%m%d-%H%M%S")]
            df_results.loc[c] = run_identification + run_meta_values + run_outcomes
            c += 1

print(time.strftime("%Y%m%d-%H%M%S"))

In [None]:
# Save raw results
filename = model_id + timestr + '_results.csv'
df_results.to_csv(filepath_res + filename)

# Save model parameter results (only the NN and XGB runs have a parameter table)
if model == 'binary_NN':
    filename = model_id + timestr + '_results_binary_NN.csv'
    df_results_bin_NN.to_csv(filepath_res + filename)

# The identifier has to be 'binary_XGB' (the value the model selection uses);
# the former check against 'binary_XGBoost' never matched, so the XGB
# parameter table was never written.
elif model == 'binary_XGB':
    filename = model_id + timestr + '_results_binary_XGB.csv'
    df_results_bin_XGB.to_csv(filepath_res + filename)

In [None]:
# Prepare results for plotting.
# For every metric and every class index (0 and 1) a column '<metric>_<class>'
# is added that holds, per result row, the mean of that class's score over
# the first `cv_num` entries stored for the row.
for cat in [0, 1]:
    for col in ['f1', 'f2', 'recall', 'precision']:
        new_col = col + '_' + str(cat)
        # Placeholder value; every row is overwritten below
        df_results[new_col] = 99.99
        for i in range(len(df_results)):
            n_runs = df_results['cv_num'][i]
            run_scores = df_results[col][i]
            df_results.at[i, new_col] = sum(run_scores[j][cat] for j in range(n_runs)) / n_runs

# Store the extended results next to the raw results
filename = model_id + timestr + '_results_edited.csv'
df_results.to_csv(filepath_res + filename)

In [None]:
# Plot results: class-wise F2 and F1 scores as lines, one point per result row
f = plt.figure()
ax = f.gca()

ax.set_title('Model Performance', color='black')

score_columns = ['f2_0', 'f2_1', 'f1_0', 'f1_1']
df_results.plot(x='model', y=score_columns, kind='line', ax=ax, figsize=(16,10))

# Place the legend outside the axes, to the right of the plot area
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()