In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
import copy
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score, mean_absolute_error, mean_squared_error


import math
import functions as func

# Row length of the placeholder prediction buffers created at the end of this cell.
LENGTH = 1000

# Target labels of the grade scenarios; others[k] holds the three remaining
# grade columns that belong to labels[k].
labels = ["Prog1_scaled", "Prog1_beste", "MfI1_beste", "beste"]
others = [
    ["Prog1_beste", "MfI1_beste", "beste"],
    ["MfI1_beste", "beste", "Prog1_scaled"],
    ["Prog1_beste", "beste", "Prog1_scaled"],
    ["Prog1_beste", "MfI1_beste", "Prog1_scaled"],
]

# Same layout for the labels loaded through func.load_data_bestanden:
# others_class[k] holds the three remaining columns for labels_class[k].
labels_class = ["Prog1_beste", "MfI1_beste", "beste", "beide"]
others_class = [
    ["MfI1_beste", "beste", "beide"],
    ["Prog1_beste", "beste", "beide"],
    ["Prog1_beste", "MfI1_beste", "beide"],
    ["Prog1_beste", "MfI1_beste", "beste"],
]

# Columns with highest correlations in regression scenarios.
# All four lists start with the same seven columns; the column order of every
# list is kept exactly as originally written.
_reg_common_prefix = [
    "Abinote",
    "SKMat_1", "SKMat_2", "SKMat_3", "SKMat_4", "mean_SKMat",
    "SKInf_1",
]
columns_reg = {
    "Prog1_scaled": [
        *_reg_common_prefix,
        "Ktyp_exp", "BFI_K_3", "mean_BFI_K_G", "BM_Inf_17",
    ],
    "Prog1_beste": [
        *_reg_common_prefix,
        "mean_SKInf", "Ktyp_exp", "BFI_K_3", "mean_BFI_K_G", "BM_Inf_17",
    ],
    "MfI1_beste": [
        *_reg_common_prefix,
        "SKInf_4", "mean_SKInf", "Kurs_Inf", "Ktyp_exp", "BFI_K_7",
        "BM_Inf_14", "LMI_3", "LMI_6",
    ],
    "beste": [
        *_reg_common_prefix,
        "mean_SKInf", "Ktyp_exp", "BFI_K_3", "mean_BFI_K_G", "BM_Inf_17",
        "LMI_3",
    ],
}

# Eight independent zero-filled rows each; later cells overwrite the rows
# with model predictions, indexed by scenario number.
predictions_train = [[0] * LENGTH for _ in range(8)]
predictions_test = [[0] * LENGTH for _ in range(8)]

In [7]:
# Regression: fit one decision-tree regressor per grade label, cross-validate it
# on the training split and record RMSE / MAE on the training and test splits.
dt_reg = DecisionTreeRegressor(random_state=42)

# One metric slot per regression scenario; every slot is overwritten in the loop.
rmse_train = [[0] for _ in labels]
rmse_test = [[0] for _ in labels]
mae_train = [[0] for _ in labels]
mae_test = [[0] for _ in labels]

# labels and others are parallel lists, so iterate over them together;
# scenario is the index into the prediction buffers and the metric lists.
for scenario, (label, other_labels) in enumerate(zip(labels, others)):
    # Load data
    data, _, _, train_X, test_X, train_y, test_y, _, _, _, _ = func.load_data_noten(
        label, other_labels[0], other_labels[1], other_labels[2]
    )
    print(len(data.index))

    # Restrict the features to the columns selected for this label.
    train_X_temp = copy.deepcopy(train_X[columns_reg[label]])
    test_X_temp = copy.deepcopy(test_X[columns_reg[label]])

    # Cross-validation on the training data (func.cross_val prints its results)
    print("RMSE ", label)
    func.cross_val(dt_reg, train_X_temp, train_y, scoring="neg_mean_squared_error")
    print("MAE ", label)
    func.cross_val(dt_reg, train_X_temp, train_y, scoring="neg_mean_absolute_error")

    # Fit and store predictions
    dt_reg.fit(train_X_temp, train_y)
    predictions_train[scenario] = dt_reg.predict(train_X_temp)
    predictions_test[scenario] = dt_reg.predict(test_X_temp)

    # Calculate performance.
    # math.sqrt is used explicitly: a bare sqrt is not imported anywhere in this
    # notebook and raises NameError after a kernel restart.
    rmse_train[scenario] = math.sqrt(mean_squared_error(train_y, predictions_train[scenario]))
    rmse_test[scenario] = math.sqrt(mean_squared_error(test_y, predictions_test[scenario]))

    mae_train[scenario] = mean_absolute_error(train_y, predictions_train[scenario])
    mae_test[scenario] = mean_absolute_error(test_y, predictions_test[scenario])


print("RMSE Trainingsdaten:", rmse_train)
print("RMSE Testdaten:",rmse_test)
print("MAE Trainingsdaten:",mae_train)
print("MAE Testdaten:",mae_test)




Prog1_scaled
274
RMSE  Prog1_scaled
Scores: [164.81014086 134.53544372 172.81189406 151.25528121 130.99276044]
Mean: 150.88110405569847
Standard deviation: 16.357533120764423
MAE  Prog1_scaled
Scores: [-136.57130793  -91.74395142 -122.7211946  -121.53853841  -99.3165363 ]
Mean: -114.37830573282159
Standard deviation: 16.447327846930243
Prog1_beste
274
RMSE  Prog1_beste
Scores: [182.58822946 134.16407865 203.39875278 153.20694639 152.69508661]
Mean: 165.210618776711
Standard deviation: 24.596460240687808
MAE  Prog1_beste
Scores: [-143.58974359  -94.21052632 -157.63157895 -119.21052632 -109.47368421]
Mean: -124.82321187584346
Standard deviation: 22.946975480057734
MfI1_beste
124
RMSE  MfI1_beste
Scores: [ 92.19544457 167.27926632 144.50727643 127.60232342 162.24527547]
Mean: 138.76591724178766
Standard deviation: 27.16869262780518
MAE  MfI1_beste
Scores: [ -57.77777778 -130.         -102.35294118  -67.05882353 -125.29411765]
Mean: -96.4967320261438
Standard deviation: 29.50186924209753
b

In [9]:
# Classification: fit one decision-tree classifier per scenario and record
# accuracy (all scenarios) plus F1 / ROC-AUC (second loop, except label "beide").
dt_class = DecisionTreeClassifier(random_state=42)

# Seven scenarios in total: the three grade labels handled by the first loop
# (Prog1_scaled is skipped) followed by the four labels in labels_class.
probs_class_train = [[0] * LENGTH for _ in range(7)]
probs_class_test = [[0] * LENGTH for _ in range(7)]

# Metric slots keep their [0] placeholder wherever a metric is not computed.
acc_train = [[0] for _ in range(7)]
acc_test = [[0] for _ in range(7)]
f1_train = [[0] for _ in range(7)]
f1_test = [[0] for _ in range(7)]
auc_train = [[0] for _ in range(7)]
auc_test = [[0] for _ in range(7)]

# Running index into the prediction buffers and metric lists, shared by both loops.
scenario = 0

# Columns with highest correlation in classification scenarios
columns_class = {"Prog1_beste": ["Abinote","SKMat_1","SKMat_3","mean_SKMat","SKInf_1"],
               "MfI1_beste": ["Abinote","SKMat_1","SKMat_2","SKMat_3","SKMat_4","mean_SKMat","SKInf_1","mean_SKInf", 
                              "BFI_K_11", "BM_Inf_18"],
                "beste": ["Abinote","SKMat_1","SKMat_3","mean_SKMat","SKInf_1"],
               "beide": ["Abinote","SKMat_1","SKMat_3","SKMat_4","mean_SKMat","SKInf_1","SKInf_4","mean_SKInf","Kurs_Inf",
                         "BFI_K_7","BM_Inf_17","LMI_3","LMI_28","Std_Inf"]}

# First loop: grade labels, using the regression column selection (columns_reg).
# labels and others are parallel lists, so zip keeps them aligned even when a
# label is skipped.
for label, other_labels in zip(labels, others):
    print("================")
    # Continuous labels after normalization and scaling are not suitable for accuracy
    if label == "Prog1_scaled":
        continue
    # Load data.
    # All three remaining label columns are passed, matching the call in the
    # regression cell; the original call here omitted the third one.
    # NOTE(review): confirm against func.load_data_noten's signature/defaults.
    data, _, _, train_X, test_X, train_y, test_y, _, _, _, _ = func.load_data_noten(
        label, other_labels[0], other_labels[1], other_labels[2]
    )

    print("Accuracy ",label)
    train_X_temp = copy.deepcopy(train_X[columns_reg[label]])
    test_X_temp = copy.deepcopy(test_X[columns_reg[label]])

    # Cross-validation on the training data (func.cross_val prints its results)
    func.cross_val(dt_class, train_X_temp, train_y, scoring="accuracy")

    # Fit and store predictions
    dt_class.fit(train_X_temp, train_y)
    predictions_train[scenario] = dt_class.predict(train_X_temp)
    predictions_test[scenario] = dt_class.predict(test_X_temp)

    # Calculate performance
    acc_train[scenario] = accuracy_score(train_y, predictions_train[scenario])
    acc_test[scenario] = accuracy_score(test_y, predictions_test[scenario])
    scenario += 1

# Second loop: labels loaded via func.load_data_bestanden, using columns_class.
for label, other_labels in zip(labels_class, others_class):
    print("=================")
    # Load data
    data, _, _, train_X, test_X, train_y, test_y, _, _, _, _ = func.load_data_bestanden(
        label, other_labels[0], other_labels[1], other_labels[2]
    )
    print("Accuracy ",label)
    train_X_temp = copy.deepcopy(train_X[columns_class[label]])
    test_X_temp = copy.deepcopy(test_X[columns_class[label]])

    # Cross-validation on the training data
    func.cross_val(dt_class, train_X_temp, train_y, scoring="accuracy")

    # Fit and store predictions
    dt_class.fit(train_X_temp, train_y)
    predictions_train[scenario] = dt_class.predict(train_X_temp)
    predictions_test[scenario] = dt_class.predict(test_X_temp)

    # Calculate performance
    acc_train[scenario] = accuracy_score(train_y, predictions_train[scenario])
    acc_test[scenario] = accuracy_score(test_y, predictions_test[scenario])

    # If label == "beide" only accuracy can be calculated
    if label != 'beide':
        # Cross-validation on the training data
        print("F1 ",label)
        func.cross_val(dt_class, train_X_temp, train_y, scoring="f1")

        print("AUC ",label)
        func.cross_val(dt_class, train_X_temp, train_y, scoring="roc_auc")

        # Calculate performance
        f1_train[scenario] = f1_score(train_y, predictions_train[scenario])
        f1_test[scenario] = f1_score(test_y, predictions_test[scenario])
        # Column 1 of predict_proba is the probability of the second entry in
        # dt_class.classes_; it is used as the score for ROC-AUC.
        probs_class_train[scenario] = dt_class.predict_proba(train_X_temp)[:, 1]
        probs_class_test[scenario] = dt_class.predict_proba(test_X_temp)[:, 1]
        auc_train[scenario] = roc_auc_score(train_y, probs_class_train[scenario])
        auc_test[scenario] = roc_auc_score(test_y, probs_class_test[scenario])

    scenario += 1

print("Accuracy Trainingsdaten:", acc_train)
print("Accuracy Testdaten:",acc_test)
print("F1 Trainingsdaten:", f1_train)
print("F1 Testdaten:",f1_test)
print("AUC Trainingsdaten:", auc_train)
print("AUC Testdaten:",auc_test)

Prog1_beste
Accuracy  Prog1_beste
Scores: [0.30952381 0.26190476 0.1025641  0.44444444 0.28125   ]
Mean: 0.2799374236874237
Standard deviation: 0.10929419828981898
MfI1_beste
Accuracy  MfI1_beste
Scores: [0.23809524 0.15       0.41176471 0.2        0.38461538]
Mean: 0.2768950657185951
Standard deviation: 0.10326095331762476
beste




Accuracy  beste
Scores: [0.20930233 0.225      0.23076923 0.24324324 0.32352941]
Mean: 0.24636884227171504
Standard deviation: 0.04009345286299267
Accuracy  Prog1_beste
Scores: [0.58974359 0.71794872 0.57894737 0.65789474 0.72972973]
Mean: 0.6548528285370392
Standard deviation: 0.06260630423637735
F1  Prog1_beste
Scores: [0.27272727 0.52173913 0.42857143 0.43478261 0.54545455]
Mean: 0.4406549971767363
Standard deviation: 0.09585176075683574
AUC  Prog1_beste
Scores: [0.46753247 0.66558442 0.56902357 0.5976431  0.68888889]
Mean: 0.5977344877344877
Standard deviation: 0.07834491537951772
Accuracy  MfI1_beste
Scores: [0.61111111 0.64705882 0.52941176 0.64705882 0.70588235]
Mean: 0.6281045751633987
Standard deviation: 0.05796025954874606
F1  MfI1_beste
Scores: [0.53333333 0.57142857 0.55555556 0.5        0.66666667]
Mean: 0.5653968253968255
Standard deviation: 0.0560243261421099
AUC  MfI1_beste
Scores: [0.6038961  0.63571429 0.55714286 0.61428571 0.70714286]
Mean: 0.6236363636363637
Standar