In [1]:
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

import math
import numpy as np

import functions as func

# Length of the zero-filled placeholder lists that the analysis code below
# uses to pre-allocate its per-scenario prediction slots (`[0] * LENGTH`).
LENGTH = 1000   

In [2]:
# Regression scenarios 1. - 3.
# For every target in `labels`: load the data, cross-validate a decision-tree
# regressor on the training split, refit it on the full training split and
# record RMSE / MAE for both the training and the test split.
dt_reg = DecisionTreeRegressor(random_state=42)

# Helper variables: one slot per entry in `labels`. The zero-filled
# placeholders are overwritten inside the loop (predictions by the arrays
# returned from predict(), metrics by scalar scores).
predictions_reg_train = [[0] * LENGTH for _ in range(4)]
predictions_reg_test = [[0] * LENGTH for _ in range(4)]

rmse_reg_train = [[0] for _ in range(4)]
rmse_reg_test = [[0] for _ in range(4)]
mae_reg_train = [[0] for _ in range(4)]
mae_reg_test = [[0] for _ in range(4)]

# `others[i]` lists the three entries of `labels` that are not `labels[i]`;
# they are handed to the loader together with the current label.
# NOTE(review): presumably the loader drops these columns from the features —
# confirm in functions.load_data_noten.
labels = ["Prog1_scaled", "Prog1_beste", "MfI1_beste", "beste"]
others = [
    ["Prog1_beste", "MfI1_beste", "beste"],
    ["MfI1_beste", "beste", "Prog1_scaled"],
    ["Prog1_beste", "beste", "Prog1_scaled"],
    ["Prog1_beste", "MfI1_beste", "Prog1_scaled"],
]

# `scenario` indexes the result slots, `other_count` indexes `others`.
# Both advance once per label in this loop.
scenario = 0
other_count = 0

for label in labels:
    print("=============")
    # Load data
    (data, X, y, train_X, test_X, train_y, test_y,
     names, feature_names, numcols, numrows) = func.load_data_noten(label, *others[other_count])

    # Cross-validation on the training data.
    # NOTE(review): the "RMSE" run uses scoring="neg_mean_squared_error"; the
    # recorded output shows positive values, so func.cross_val presumably
    # converts them itself — confirm in functions.cross_val.
    print("RMSE ", label)
    func.cross_val(dt_reg, train_X, train_y, scoring="neg_mean_squared_error")
    print("MAE ", label)
    func.cross_val(dt_reg, train_X, train_y, scoring="neg_mean_absolute_error")

    # Fit and store predictions
    dt_reg.fit(train_X, train_y)
    predictions_reg_train[scenario] = dt_reg.predict(train_X)
    predictions_reg_test[scenario] = dt_reg.predict(test_X)

    # Calculate performance.
    # math.sqrt is used explicitly: only `import math` is in scope, so a bare
    # sqrt(...) raises NameError on a fresh kernel.
    rmse_reg_train[scenario] = math.sqrt(mean_squared_error(train_y, predictions_reg_train[scenario]))
    rmse_reg_test[scenario] = math.sqrt(mean_squared_error(test_y, predictions_reg_test[scenario]))

    mae_reg_train[scenario] = mean_absolute_error(train_y, predictions_reg_train[scenario])
    mae_reg_test[scenario] = mean_absolute_error(test_y, predictions_reg_test[scenario])

    scenario += 1
    other_count += 1


print("RMSE Trainingsdaten:", rmse_reg_train)
print("RMSE Testdaten:",rmse_reg_test)
print("MAE Trainingsdaten:",mae_reg_train)
print("MAE Testdaten:",mae_reg_test)

# Classification: a single decision-tree classifier is refit for each of the
# seven scenarios (three on the grade labels, four on the pass/fail labels).
dt_class = DecisionTreeClassifier(random_state=42)

# Helper variables: one slot per scenario. The zero-filled placeholders are
# replaced inside the loops by prediction arrays / scalar scores; slots that a
# loop never writes keep their placeholder value.
predictions_class_train = [[0] * LENGTH for _ in range(7)]
predictions_class_test = [[0] * LENGTH for _ in range(7)]

probs_class_train = [[0] * LENGTH for _ in range(7)]
probs_class_test = [[0] * LENGTH for _ in range(7)]

acc_train = [[0] for _ in range(7)]
acc_test = [[0] for _ in range(7)]
f1_train = [[0] for _ in range(7)]
f1_test = [[0] for _ in range(7)]
auc_train = [[0] for _ in range(7)]
auc_test = [[0] for _ in range(7)]

# `others_class[i]` holds the three entries of `labels_class` other than
# `labels_class[i]`; they are passed to the loader alongside the label.
labels_class = ["Prog1_beste", "MfI1_beste", "beste", "beide"]
others_class = [
    ["MfI1_beste", "beste", "beide"],
    ["Prog1_beste", "beste", "beide"],
    ["Prog1_beste", "MfI1_beste", "beide"],
    ["Prog1_beste", "MfI1_beste", "beste"],
]

# `scenario` is the running result-slot index across both loops;
# `other_count` indexes the matching "other labels" list.
scenario = 0
other_count = 0

# Scenarios 1. - 3. (reuses `labels` / `others` defined for the regression runs)
for label in labels:
    # The continuous label (after normalization and scaling) is not suitable
    # for accuracy, so it gets no classification run and no result slot.
    if label != "Prog1_scaled":
        print("==============")
        # Load data
        (data, X, y, train_X, test_X, train_y, test_y,
         names, feature_names, numcols, numrows) = func.load_data_noten(label, *others[other_count])

        # Cross-validation on the training data
        print("ACC ",label)
        func.cross_val(dt_class, train_X, train_y, scoring="accuracy")

        # Fit and store predictions
        dt_class.fit(train_X, train_y)
        predictions_class_train[scenario] = dt_class.predict(train_X)
        predictions_class_test[scenario] = dt_class.predict(test_X)

        # Accuracy on both splits
        acc_train[scenario] = accuracy_score(train_y, predictions_class_train[scenario])
        acc_test[scenario] = accuracy_score(test_y, predictions_class_test[scenario])
        scenario += 1
    other_count += 1

# Scenarios 4. - 7. (`scenario` keeps counting, only `other_count` restarts)
other_count = 0
for label in labels_class:
    print("================")
    # Load data
    (data, X, y, train_X, test_X, train_y, test_y,
     names, feature_names, numcols, numrows) = func.load_data_bestanden(label, *others_class[other_count])

    # Cross-validation on the training data
    print("ACC ",label)
    func.cross_val(dt_class, train_X, train_y, scoring="accuracy")

    # Fit and store predictions
    dt_class.fit(train_X, train_y)
    predictions_class_train[scenario] = dt_class.predict(train_X)
    predictions_class_test[scenario] = dt_class.predict(test_X)

    # Accuracy on both splits
    acc_train[scenario] = accuracy_score(train_y, predictions_class_train[scenario])
    acc_test[scenario] = accuracy_score(test_y, predictions_class_test[scenario])

    # F1 and ROC-AUC are skipped for "beide".
    # NOTE(review): presumably because that target is not binary — confirm.
    if label != "beide":
        # Cross-validated F1
        print("F1 ",label)
        func.cross_val(dt_class, train_X, train_y, scoring="f1")

        # Cross-validated ROC-AUC
        print("AUC ",label)
        func.cross_val(dt_class, train_X, train_y, scoring="roc_auc")

        # F1 from the hard predictions
        f1_train[scenario] = f1_score(train_y, predictions_class_train[scenario])
        f1_test[scenario] = f1_score(test_y, predictions_class_test[scenario])

        # ROC-AUC from column 1 of predict_proba
        probs_class_train[scenario] = dt_class.predict_proba(train_X)[:, 1]
        probs_class_test[scenario] = dt_class.predict_proba(test_X)[:, 1]
        auc_train[scenario] = roc_auc_score(train_y, probs_class_train[scenario])
        auc_test[scenario] = roc_auc_score(test_y, probs_class_test[scenario])

    scenario += 1
    other_count += 1

print("Accuracy Trainingsdaten:", acc_train)
print("Accuracy Testdaten:",acc_test)
print("F1 Trainingsdaten:", f1_train)
print("F1 Testdaten:",f1_test)
print("AUC Trainingsdaten:", auc_train)
print("AUC Testdaten:",auc_test)

RMSE  Prog1_scaled
Scores: [184.00092167 155.10115123 190.72411617 171.80971503 168.08663293]
Mean: 173.94450740707444
Standard deviation: 12.462991953847148
MAE  Prog1_scaled
Scores: [-153.10405752 -116.18889289 -145.41953498 -141.91007028 -129.40523404]
Mean: -137.20555794220428
Standard deviation: 12.999303112246004
RMSE  Prog1_beste
Scores: [187.65762605 159.52231324 205.02888115 200.7502376  182.15089176]
Mean: 187.02198995878138
Standard deviation: 16.081522093046797
MAE  Prog1_beste
Scores: [-154.87179487 -112.63157895 -166.31578947 -164.34210526 -140.        ]
Mean: -147.6322537112011
Standard deviation: 19.819198441552
RMSE  MfI1_beste
Scores: [135.2364185  189.03781134 160.73361228 162.08204168 144.91376746]
Mean: 158.4007302540195
Standard deviation: 18.309061332815048
MAE  MfI1_beste
Scores: [ -92.22222222 -141.76470588 -127.05882353 -127.05882353 -117.64705882]
Mean: -121.15032679738563
Standard deviation: 16.39439790374566
RMSE  beste
Scores: [182.22977887 154.75373323 19




ACC  Prog1_beste
Scores: [0.66666667 0.61538462 0.60526316 0.68421053 0.62162162]
Mean: 0.638629317576686
Standard deviation: 0.031005219069474278
F1  Prog1_beste
Scores: [0.43478261 0.34782609 0.34782609 0.4        0.36363636]
Mean: 0.3788142292490118
Standard deviation: 0.033859101908442206
AUC  Prog1_beste
Scores: [0.59253247 0.5487013  0.53367003 0.55387205 0.57592593]
Mean: 0.5609403559403561
Standard deviation: 0.020813720069212838
ACC  MfI1_beste
Scores: [0.77777778 0.82352941 0.47058824 0.70588235 0.58823529]
Mean: 0.673202614379085
Standard deviation: 0.12874323923916475
F1  MfI1_beste
Scores: [0.77777778 0.8        0.4        0.54545455 0.58823529]
Mean: 0.6222935234699939
Standard deviation: 0.14982225952383552
AUC  MfI1_beste
Scores: [0.81818182 0.82857143 0.46428571 0.66428571 0.60714286]
Mean: 0.6764935064935066
Standard deviation: 0.13652396264559094
ACC  beste
Scores: [0.69230769 0.66666667 0.76923077 0.55263158 0.76315789]
Mean: 0.6887989203778677
Standard deviation: 