In [None]:
# Import required python packages
import pandas as pd
import importlib
import matplotlib.pyplot as plt
import csv

In [None]:
# import required helper functions
from helper_functions import params
from helper_functions import feature_engineering as feature_eng

### Data Loading

In [None]:
# Load all relevant preprocessed datasets (first CSV column is used as the index).
# Backslashes in the Windows-style path fragments are written as '\\' so that none of them
# can be interpreted as an escape sequence; the resulting path strings are unchanged.
df_sqa = pd.read_csv(params.filepath_project_folder + '\\SQA_full_prepro_data.csv', index_col=0)
df_PrNrn_raw = pd.read_csv(params.filepath_project_folder + '\\PrNrn_prepro_data.csv', index_col=0)
df_A600 = pd.read_csv(params.filepath_project_folder + '\\extra_features\\A600_Date.csv', index_col=0)
df_isoweekday = pd.read_csv(params.filepath_project_folder + '\\extra_features\\isoweekday_feature.csv', index_col=0)
df_kaco_laco = pd.read_csv(params.filepath_project_folder + '\\extra_features\\kaco_laco_PrNrn.csv', index_col=0)
df_ZP8_planned = pd.read_csv(params.filepath_project_folder + '\\extra_features\\ZP8_planned_dates.csv', index_col=0)
df_sequence_micro = pd.read_csv(params.filepath_project_folder + '\\extra_features\\PrNrn_sequence_micro.csv', index_col=0)
df_sequence_macro = pd.read_csv(params.filepath_project_folder + '\\extra_features\\PrNrn_sequence_macro.csv', index_col=0)
df_sequence_kaco_micro = pd.read_csv(params.filepath_project_folder + '\\extra_features\\kaco_sequence_micro.csv', index_col=0)
df_sequence_kaco_macro = pd.read_csv(params.filepath_project_folder + '\\extra_features\\kaco_sequence_macro.csv', index_col=0)

### Join Features

In [None]:
# KacoLaco features + planned ZP8 dates + ISO weekday, extended by the
# micro (neighborhood) sequence features
df_features_sequence_micro_kaco = (
    df_kaco_laco
    .join(df_ZP8_planned)
    .join(df_isoweekday)
    .join(df_sequence_kaco_micro)
)

# Same base columns, extended by the macro (batch) sequence features instead
df_features_sequence_macro_kaco = (
    df_kaco_laco
    .join(df_ZP8_planned)
    .join(df_isoweekday)
    .join(df_sequence_kaco_macro)
)

# Persist each variant as its own csv file in the project folder
df_features_sequence_micro_kaco.to_csv(
    params.filepath_project_folder + '\\features_sequence_micro_kaco.csv'
)
df_features_sequence_macro_kaco.to_csv(
    params.filepath_project_folder + '\\features_sequence_macro_kaco.csv'
)

In [None]:
# Base feature sets: PrNrn / KacoLaco columns combined with the planned ZP8 dates
# and the ISO weekday feature
df_features_full = (
    df_PrNrn_raw
    .join(df_ZP8_planned)
    .join(df_isoweekday)
)
df_features_kaco = (
    df_kaco_laco
    .join(df_ZP8_planned)
    .join(df_isoweekday)
)

# Sequence variants: the base sets above extended by the micro and macro sequence
# features (same join chain as building them from scratch)
df_features_sequence_full = (
    df_features_full
    .join(df_sequence_micro)
    .join(df_sequence_macro)
)
df_features_sequence_kaco = (
    df_features_kaco
    .join(df_sequence_kaco_micro)
    .join(df_sequence_kaco_macro)
)

# Write every feature set to its own csv file in the project folder
df_features_full.to_csv(
    params.filepath_project_folder + '\\features_full.csv'
)
df_features_kaco.to_csv(
    params.filepath_project_folder + '\\features_kaco.csv'
)
df_features_sequence_full.to_csv(
    params.filepath_project_folder + '\\features_sequence_full.csv'
)
df_features_sequence_kaco.to_csv(
    params.filepath_project_folder + '\\features_sequence_kaco.csv'
)

### Manual Feature Selection

In [None]:
# Select PrFams to be removed from the dataset to reduce complexity.
# SAU, SIZ, BTA are semantically covered by TPL (= Typprüfland).
# The list of PrFams to drop is taken from params.manual_remove for every call below.

# Pick up edits made to the helper module without restarting the kernel
importlib.reload(feature_eng)

# Remove manually selected features from the PrNrn dataframe and store in a new csv file
df_PrNrn_sampled = feature_eng.remove_PrFams(df_PrNrn_raw, params.manual_remove)
df_PrNrn_sampled.to_csv(params.filepath_project_folder + '\\PrNrn_sampled.csv')

# Rebuild the full feature set from the reduced PrNrn dataframe and store in a new csv file
df_features_sampled = df_PrNrn_sampled.join(df_ZP8_planned).join(df_isoweekday)
df_features_sampled.to_csv(params.filepath_project_folder + '\\features_sampled.csv')

# Remove manually selected features from the full + sequence dataframe and store in a new csv file
# (previously referenced an undefined name `remove`, which fails on a fresh kernel)
df_features_sequence_full_sampled = feature_eng.remove_PrFams(df_features_sequence_full, params.manual_remove)
df_features_sequence_full_sampled.to_csv(params.filepath_project_folder + '\\features_sequence_full_sampled.csv')

# Remove manually selected features from the kacolaco + sequence dataframe and store in a new csv file
df_features_sequence_kaco_sampled = feature_eng.remove_PrFams(df_features_sequence_kaco, params.manual_remove)
df_features_sequence_kaco_sampled.to_csv(params.filepath_project_folder + '\\features_sequence_kaco_sampled.csv')

### Automated Feature Reduction

In [None]:
# Perform automated grouping of correlated features.
# The second argument is passed with an escaped backslash: the former '\full' contained the
# '\f' form-feed escape and therefore did not start with a backslash at all.
# NOTE(review): the second argument looks like a path/name suffix used inside
# feature_grouping, and the meaning of the third (False) is not visible here — confirm
# against helper_functions.feature_engineering.
df_features_full_reduced = feature_eng.feature_grouping(df_features_full, '\\full', False)
df_features_kaco_reduced = feature_eng.feature_grouping(df_features_kaco, '\\kaco', False)
df_features_sampled_reduced = feature_eng.feature_grouping(df_features_sampled, '\\sampled', False)

# Store to new csv files in the project folder (the undefined name `filepath` was replaced
# by params.filepath_project_folder, as used by every other cell)
df_features_full_reduced.to_csv(params.filepath_project_folder + '\\features_full_reduced.csv')
df_features_kaco_reduced.to_csv(params.filepath_project_folder + '\\features_kaco_reduced.csv')
df_features_sampled_reduced.to_csv(params.filepath_project_folder + '\\features_sampled_reduced.csv')