### Libraries

In [None]:
# Third-party libraries for data handling and plotting.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# `display` comes from the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display
# Project helpers used throughout this notebook.
from src.utils.preprocessing import standardize, classic_preprocessing
from src.utils.get_data import import_data, expert_models
from src.utils.model_helpers import AUC_all_models

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Load Data

In [None]:
# Location of the data directory, relative to this notebook.
DATA_PATH = '../../data'


def _load_with_all_features(segmentation):
    """Import one segmentation variant with both drop flags set to False."""
    return import_data(DATA_PATH, segmentation_type=segmentation,
                       drop_expert=False, drop_user_features=False)


coarse_data, coarse_labels = _load_with_all_features('coarse')
fine_data, fine_labels = _load_with_all_features('fine')
no_data, no_labels = _load_with_all_features('no')

# Name the index of the unsegmented data 'subject' for later processing.
no_data.index = no_data.index.rename('subject')

In [None]:
# Mean of the Label column of the coarse labels (the share of positive labels
# if Label is coded 0/1 -- TODO confirm the coding).
coarse_labels.Label.mean()

### Preprocessing

In [None]:
# Bind the feature-matrix names used from here on. These are additional names
# for the same objects, not copies.
X_coarse, X_fine, X_no = coarse_data, fine_data, no_data

#### Normalisation

In [None]:
# The column spans below are for data loaded with drop_user_features=False.
# First pass: standardise the span idx_start=0 .. idx_end=-5 of each matrix.
X_coarse, X_fine, X_no = (
    standardize(features, idx_start=0, idx_end=-5)
    for features in (X_coarse, X_fine, X_no)
)

# Second pass: standardise the age, addressed by the span -4 .. -3.
X_coarse, X_fine, X_no = (
    standardize(features, idx_start=-4, idx_end=-3)
    for features in (X_coarse, X_fine, X_no)
)

#### Dummy code and correlated feature dropping

In [None]:
# Run the project's classic preprocessing on each matrix with norm=False
# (the matrices were already passed through standardize in the cell above).
X_coarse, X_fine, X_no = (
    classic_preprocessing(features, norm=False)
    for features in (X_coarse, X_fine, X_no)
)

#### Get optimal features model

In [None]:
# X_coarse_opt = get_optimal_features_model(X_fine, fine_labels, model=GradientBoostingClassifier(random_state=0), start_idx = 1)

### Model training for each expert with additional features

In [None]:
# Run expert_models once per segmentation variant, pairing each feature matrix
# with its own label table.
coarse_results_experts, fine_results_experts, no_results_experts = (
    expert_models(features, labels)
    for features, labels in (
        (X_coarse, coarse_labels),
        (X_fine, fine_labels),
        (X_no, no_labels),
    )
)

In [None]:
# Record in a 'Data' entry which segmentation each result table belongs to,
# then show the three tables one after another.
for _results, _tag in (
    (coarse_results_experts, 'coarse'),
    (fine_results_experts, 'fine'),
    (no_results_experts, 'no_segmentation'),
):
    _results['Data'] = _tag

display(coarse_results_experts, fine_results_experts, no_results_experts)

### Model training on all data

In [None]:
# Evaluate all models on each segmentation variant with k=6; the same
# oversampling flag is passed to every call.
oversampling = True
coarse_results, fine_results, no_results = (
    AUC_all_models(features, labels.Label, k=6, oversampling=oversampling)
    for features, labels in (
        (X_coarse, coarse_labels),
        (X_fine, fine_labels),
        (X_no, no_labels),
    )
)

In [None]:
# Give the 'AUC (mean)' column of each result table a segmentation-specific
# name so the three tables can be shown side by side.
coarse_results, fine_results, no_results = (
    table.rename(columns={'AUC (mean)': auc_name})
    for table, auc_name in (
        (coarse_results, "Coarse_AUC"),
        (fine_results, "Fine_AUC"),
        (no_results, "No_Seg_AUC"),
    )
)

# Full coarse table plus the AUC columns of the other two variants.
pd.concat(
    [coarse_results, fine_results["Fine_AUC"], no_results["No_Seg_AUC"]],
    axis=1,
    sort=False,
)
# Observation: RandomForest and GradientBoosting look most promising, so
# their hyperparameters should be tuned next.

In [None]:
# NOTE(review): this cell repeats the renaming and comparison of the three
# result tables. Once 'AUC (mean)' has already been renamed, rename() finds no
# such column and returns the table with unchanged column names.
_auc_column = 'AUC (mean)'
coarse_results = coarse_results.rename(columns={_auc_column: "Coarse_AUC"})
fine_results = fine_results.rename(columns={_auc_column: "Fine_AUC"})
no_results = no_results.rename(columns={_auc_column: "No_Seg_AUC"})

_side_by_side = [
    coarse_results,
    fine_results["Fine_AUC"],
    no_results["No_Seg_AUC"],
]
pd.concat(_side_by_side, axis=1, sort=False)
# Observation: RandomForest and GradientBoosting look most promising, so
# their hyperparameters should be tuned next.

### Models without expert features

In [None]:
# Location of the data directory, relative to this notebook.
DATA_PATH = '../../data'


def _load_without_user_features(segmentation):
    """Import one segmentation variant with drop_user_features=True."""
    return import_data(DATA_PATH, segmentation_type=segmentation,
                       drop_user_features=True)


coarse_data_n, coarse_labels_n = _load_without_user_features('coarse')
fine_data_n, fine_labels_n = _load_without_user_features('fine')
no_data_n, no_labels_n = _load_without_user_features('no')

# Name the index of the unsegmented data 'subject' for later processing.
no_data_n.index = no_data_n.index.rename('subject')

In [None]:
# Bind the feature-matrix names for the reduced data. These are additional
# names for the same objects, not copies.
X_coarse_n, X_fine_n, X_no_n = coarse_data_n, fine_data_n, no_data_n

#### Preprocessing

In [None]:
# The data in this section was loaded with drop_user_features=True, so the
# preprocessing is called with the span start=0, stop=-1.
X_coarse_n, X_fine_n, X_no_n = (
    classic_preprocessing(features, start=0, stop=-1)
    for features in (X_coarse_n, X_fine_n, X_no_n)
)

### Train model for each expert without additional features

In [None]:
# Run expert_models once per segmentation variant on the reduced data,
# pairing each feature matrix with its own label table.
coarse_results_experts_n, fine_results_experts_n, no_results_experts_n = (
    expert_models(features, labels)
    for features, labels in (
        (X_coarse_n, coarse_labels_n),
        (X_fine_n, fine_labels_n),
        (X_no_n, no_labels_n),
    )
)

In [None]:
# Record in a 'Data' entry which segmentation each result table belongs to,
# then show the three tables one after another.
for _results_n, _tag_n in (
    (coarse_results_experts_n, 'coarse'),
    (fine_results_experts_n, 'fine'),
    (no_results_experts_n, 'no_segmentation'),
):
    _results_n['Data'] = _tag_n

display(coarse_results_experts_n, fine_results_experts_n, no_results_experts_n)

In [None]:
# For comparison, show again the per-expert results computed earlier from the
# data that was loaded with drop_expert=False and drop_user_features=False.
display(coarse_results_experts, fine_results_experts, no_results_experts)

### Train model for all data without additional features

In [None]:
# Evaluate all models on each reduced segmentation variant with k=6; the same
# oversampling flag is passed to every call.
oversampling = True
coarse_results_n, fine_results_n, no_results_n = (
    AUC_all_models(features, labels.Label, k=6, oversampling=oversampling)
    for features, labels in (
        (X_coarse_n, coarse_labels_n),
        (X_fine_n, fine_labels_n),
        (X_no_n, no_labels_n),
    )
)

In [None]:
# Give the 'AUC (mean)' column of each result table a segmentation-specific
# name so the three tables can be shown side by side.
coarse_results_n, fine_results_n, no_results_n = (
    table.rename(columns={'AUC (mean)': auc_name})
    for table, auc_name in (
        (coarse_results_n, "Coarse_AUC"),
        (fine_results_n, "Fine_AUC"),
        (no_results_n, "No_Seg_AUC"),
    )
)

# Full coarse table plus the AUC columns of the other two variants.
pd.concat(
    [coarse_results_n, fine_results_n["Fine_AUC"], no_results_n["No_Seg_AUC"]],
    axis=1,
    sort=False,
)

## Compare the results from hyperparameter tuning

In [None]:
# Load the six pickled result tables from the working directory: one per
# segmentation variant (no / coarse / fine), each in a "no_metadata" and a
# "metadata" flavour.
# NOTE(review): unpickling executes arbitrary code; only load files that were
# produced by this project.
(no_seg_no_md, no_seg_md,
 coarse_no_md, coarse_md,
 fine_no_md, fine_md) = map(pd.read_pickle, (
    "results_no_seg_no_metadata.pkl",
    "results_no_seg_metadata.pkl",
    "results_coarse_no_metadata.pkl",
    "results_coarse_metadata.pkl",
    "results_fine_no_metadata.pkl",
    "results_fine_metadata.pkl",
))

In [None]:
# Reshape the three tables used in the plot to long format, keeping 'models'
# as the identifier and 'auc_expert_weighted' as the only value column.
# The remaining tables (no_seg_md, coarse_no_md, fine_no_md) are not reshaped.
no_seg_no_md, coarse_md, fine_md = (
    pd.melt(table, id_vars=['models'], value_vars=['auc_expert_weighted'])
    for table in (no_seg_no_md, coarse_md, fine_md)
)

In [None]:
# Replace the melted column name 'auc_expert_weighted' in 'variable' with the
# segmentation each table stands for; it becomes the hue in the plots below.
for _table, _segmentation in (
    (no_seg_no_md, 'no'),
    (coarse_md, 'coarse'),
    (fine_md, 'fine'),
):
    _table['variable'] = _table['variable'].map(
        {'auc_expert_weighted': _segmentation})

In [None]:
# Inspect the relabelled long-format table for the fine segmentation.
fine_md

In [None]:
# Stack the three long-format tables (fine, coarse, unsegmented) into a single
# frame for plotting. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. As with append's
# default, the original row labels are kept.
df = pd.concat([fine_md, coarse_md, no_seg_no_md])

In [None]:
# Inspect the combined table built from fine_md, coarse_md and no_seg_no_md.
df

In [None]:
# Display names for the model keys stored in the result tables.
# NOTE(review): Series.map yields NaN for any key missing from this mapping,
# and 'gradient_boosting' is shown as 'eXtreme GB' -- confirm both are intended.
_MODEL_DISPLAY_NAMES = {
    'logistic': 'Logistic',
    'lda': 'LDA',
    'knn': 'KNN',
    'svc': 'SVC',
    'naive_bayes': 'Naive Bayes',
    'decision_tree': 'Decision Tree',
    'random_forest': 'Random Forest',
    'gradient_boosting': 'eXtreme GB',
}
df['models'] = df['models'].map(_MODEL_DISPLAY_NAMES)

In [None]:
# Directory that receives the exported figure.
VIS_PATH = "../../vis"

# Horizontal grouped bar chart: one group per model, one bar per segmentation.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=df, y='models', x='value', hue='variable',
            palette='colorblind', ax=ax)
ax.set_xlabel('AUC score')
ax.set_ylabel('')
ax.set_xlim(0.5, 0.75)
ax.legend(loc='lower right')

# NOTE(review): with an explicit format, matplotlib uses the file name
# verbatim, so the PDF is written without a '.pdf' suffix -- confirm that is
# what downstream consumers expect.
fig.savefig(f'{VIS_PATH}/results_classical', format='pdf', bbox_inches='tight')

In [None]:
# Draw the three segmentation variants as semi-transparent bar plots, one
# call after the other; only the palette differs between the calls.
_overlay_style = dict(x='models', y='value', hue='variable', alpha=0.5)
sns.barplot(data=fine_md, palette='Set2', **_overlay_style)
sns.barplot(data=coarse_md, **_overlay_style)
sns.barplot(data=no_seg_no_md, palette='husl', **_overlay_style)

In [None]:
# I want the expert model score for all three datasets