In [None]:
# Import required python packages
import pandas as pd
import importlib
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
import time
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from keras import models
from keras import layers

In [None]:
# import required helper functions
from helper_functions import methods_prediction as pred_meth
from helper_functions import methods_PrNrn as meth
from helper_functions import control_methods as meth_control
from helper_functions import SQA_preprocessing as SQA_prepro
from helper_functions import evaluation_metrics as eval_metr
from helper_functions import feature_engineering as feature_eng
from helper_functions import test_grids
from helper_functions import parametergrids

In [None]:
# Load the preprocessed SQA data and the engineered feature sets to test/predict on.
# Each file is read from params.filepath_project_folder with its first column as the index.
# NOTE(review): `params` is not imported in any of the cells shown above - confirm where it is defined.
# The file names start with an escaped backslash, so the folder is expected to be given
# without a trailing separator. Every backslash is written as '\\' because '\S' is not a
# valid escape sequence and triggers a warning on current Python versions.
df_sqa = pd.read_csv(params.filepath_project_folder + '\\SQA_full_prepro_data.csv', index_col=0)
df_features_full = pd.read_csv(params.filepath_project_folder + '\\features_full.csv', index_col=0)
df_features_full_reduced = pd.read_csv(params.filepath_project_folder + '\\features_full_reduced.csv', index_col=0)
df_features_kaco = pd.read_csv(params.filepath_project_folder + '\\features_kaco.csv', index_col=0)
df_features_kaco_reduced = pd.read_csv(params.filepath_project_folder + '\\features_kaco_reduced.csv', index_col=0)

In [None]:
# Define save parameters

# Prefix of every output file name. The backslash is escaped explicitly because '\R'
# is not a valid escape sequence; the resulting string is unchanged.
model_id = '\\Regr'
# Timestamp that is embedded in the output file names
timestr = time.strftime("%Y%m%d-%H%M%S")

# TODO: set both folders before running the notebook.
# The original cell left the right-hand sides empty, which is a SyntaxError.
# '.' (the current working directory) is only a placeholder.
filepath_mod = '.'  # path to prediction model
filepath_res = '.'  # path to prediction results

In [None]:
# Feature sets to iterate over, keyed by the name that identifies each set in the results
_named_feature_sets = {
    'Features_all_reduced': df_features_full_reduced,
}

# Parallel lists: position k of the names belongs to position k of the dataframes
feature_dfs_names = list(_named_feature_sets.keys())
feature_dfs = list(_named_feature_sets.values())

In [None]:
# Reload the grid modules, then pick the parameter grid for the meta parameters
# and the parameter grid for the model parameters
for grid_module in (test_grids, parametergrids):
    importlib.reload(grid_module)

meta_paramgrid = parametergrids.meta_paramgrid
model_paramgrid = parametergrids.paramgrid_regression_NN

In [None]:
# Empty result containers

# df_results: one row per run, written by the prediction loop
results_columns = [
    'feature_set', 'model', 'cut_off_high', 'cut_off_low', 'delete',
    'threshhold_late', 'target_col', 'sampling', 'sample_frac', 'cv_num',
    'model_params', 'y_test', 'y_pred',
    'mse', 'rmse', 'mae', 'mape', 'rae', 'r_squared', 'adj_r_squared',
    'median_abs_error', 'timestamp',
]
df_results = pd.DataFrame(columns=results_columns)

# df_results_reg_NN: columns named after network settings; not written to in the cells shown here
nn_columns = [
    'layer_depth', 'layer_architecture',
    'input_activation_function', 'input_neurons', 'dropout',
    'hidden_neurons', 'hidden_activation_function',
    'output_activation_function', 'output_neurons',
    'optimizer', 'epochs', 'batch_size',
]
df_results_reg_NN = pd.DataFrame(columns=nn_columns)

In [None]:
# Start the prediction: run the regression for every combination of
# feature set x meta-parameter set x model-parameter set and store one row per run in df_results
timestr = time.strftime("%Y%m%d-%H%M%S")
importlib.reload(meth_control)

# Every feature set needs exactly one name; fail before the (long) run starts
# instead of mislabelling results or hitting an IndexError half-way through
if len(feature_dfs) != len(feature_dfs_names):
    raise ValueError('feature_dfs and feature_dfs_names must have the same length')

# Grid sizes do not change during the run
n_meta_runs = len(meta_paramgrid)
n_model_runs = len(model_paramgrid)

# Row label of the next result in df_results
c = 0

# Iterate over the datasets to compare
for feature_set_name, df_features in zip(feature_dfs_names, feature_dfs):

    # Iterate over the meta-parametergrid, e.g. sampling algorithms
    for i in range(n_meta_runs):
        meta_params = meta_paramgrid[i]
        print('- Meta-Parameter run ' + str(i+1) + '/' + str(n_meta_runs))

        # Iterate over the model parameter grid (model hyperparameter)
        for j in range(n_model_runs):
            model_params = model_paramgrid[j]
            print('- Model-Parameter run ' + str(j+1) + '/' + str(n_model_runs))

            # Call method to perform the regression task.
            # The first return value is bound to `fitted_models`, not `models`, so the
            # `from keras import models` import of the first cell is not overwritten.
            (fitted_models, mse, rmse, mae, mape, rae, r_squared, adj_r_squared,
             median_abs_error, y_preds, y_tests, model_results) = meth_control.perform_regression(
                df_features, df_sqa, meta_params, model_params)

            # Store results (order must match the columns of df_results)
            df_results.loc[c] = [feature_set_name, 'regression_NN', meta_params['cut_off_high'],
                                 meta_params['cut_off_low'], meta_params['delete'],
                                 meta_params['threshhold_late'], meta_params['target_col'],
                                 meta_params['sampling'], meta_params['sample_frac'],
                                 meta_params['cv_num'],
                                 model_params, y_tests, y_preds,
                                 mse, rmse, mae, mape, rae, r_squared, adj_r_squared,
                                 median_abs_error, time.strftime("%Y%m%d-%H%M%S")]
            c += 1

In [None]:
# Decision on the best regression parameters
# Hidden activation function: relu outperforms sigmoid
# Output activation function: linear outperforms sigmoid and relu

In [None]:
# Write the unmodified results table to '<model_id><timestr>_results.csv' under filepath_res

filename = ''.join((model_id, timestr, '_results.csv'))
raw_results_path = filepath_res + filename
df_results.to_csv(raw_results_path)

In [None]:
# Replace every metric cell of df_results, which holds a sequence of values
# (presumably one per cross-validation fold - TODO confirm), by the mean of that
# sequence, then save the edited table.
metric_cols = ['mse', 'rmse', 'mae', 'mape', 'rae', 'r_squared', 'adj_r_squared', 'median_abs_error']

for col in metric_cols:
    col_pos = df_results.columns.get_loc(col)
    for row in range(len(df_results)):
        fold_values = df_results.iat[row, col_pos]
        # A scalar means the cell was already averaged (this notebook cell was re-run);
        # leave it untouched instead of failing on sum()/len()
        if np.ndim(fold_values) == 0:
            continue
        # Assign through .iat: the chained form `df_results[col][row] = ...` writes to a
        # temporary object under pandas copy-on-write and the value is silently lost
        df_results.iat[row, col_pos] = sum(fold_values) / len(fold_values)

filename = model_id + timestr + '_results_edit.csv'
df_results.to_csv(filepath_res + filename)

In [None]:
# Prepare for plotting: for every feature set build a table with the predicted value,
# the true value and their difference, and save it as CSV.
# After the loop df_line_plot holds the table of the last feature set.

# Position inside the stored y_test / y_pred lists that is plotted
# (presumably the cross-validation fold - TODO confirm against meth_control.perform_regression)
fold_idx = 0

for feature_set_name in feature_dfs_names:

    # Select the run by its feature-set name. Using the position of the name as row label
    # is only correct when both parameter grids contain exactly one combination; otherwise
    # the first rows of df_results all belong to the first feature set.
    runs = df_results[df_results['feature_set'] == feature_set_name]
    if runs.empty:
        raise ValueError('No results stored for feature set ' + feature_set_name)
    # If several runs exist for the feature set, the first one is used
    run = runs.iloc[0]

    # Element 0 of every stored sample is taken
    # (assumes one output value per sample - TODO confirm)
    fold_pred = run['y_pred'][fold_idx]
    y_pred = [fold_pred[i][0] for i in range(len(fold_pred))]

    fold_test = run['y_test'][fold_idx]
    y_test = [fold_test[i][0] for i in range(len(fold_test))]

    df_line_plot = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test})
    df_line_plot['delta'] = df_line_plot['y_test'] - df_line_plot['y_pred']

    filename = model_id + timestr + '_' + feature_set_name + '_plotting.csv'
    df_line_plot.to_csv(filepath_res + filename)

In [None]:
# I-Chart: line plot of the predicted value, the true value and the delta
f = plt.figure()
ax = f.gca()
ax.set_title('Model Performance', color='black')

# Draw all df_line_plot columns onto the prepared axes; figsize resizes the figure that owns them
df_line_plot.plot(kind='line', ax=ax, figsize=(16,10))

# Legend is placed outside the plot area, to the right of the axes
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

In [None]:
# Box plots of the distribution of every df_line_plot column
# (prediction, true value and delta), drawn into a narrow 2 x 10 inch figure
df_line_plot.boxplot(figsize=(2,10))

In [None]:
# Create a plotly graph to visualize regression results:
# one box per df_line_plot series (true value, prediction, delta)
import plotly
import plotly.graph_objects as go
import numpy as np

# Trace labels and the df_line_plot column each label belongs to.
# Every trace needs its own name: the original gave all three the copy-pasted
# name 'F1: On-Time Class', which makes the boxes indistinguishable in the legend
# and (as no x is passed) places them in the same category.
x = ['Test', 'Pred', 'Delta']
box_columns = ['y_test', 'y_pred', 'delta']

fig = go.Figure()
for label, column in zip(x, box_columns):
    fig.add_trace(go.Box(y=df_line_plot[column], name=label))

# NOTE(review): the title text looks copied from a model-comparison plot - confirm the wording
fig.update_layout(
    title='Score for different models',
    font=dict(
        family="Courier New, monospace",
        size=20
    ),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        title='SQA'
    ))


# Writes the figure to an HTML file and opens it in the browser
plotly.offline.plot(fig, auto_open=True)