In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import scipy

In [None]:
# Read the pre-split train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (
        "./data/multi/train_hydraulic.csv",
        "./data/multi/test_hydraulic.csv",
    )
)

In [None]:
# Column holding the class label that the models predict.
class_target_name = "Risk Level"

# Columns removed from a frame to obtain the feature matrix: the label itself
# plus three non-feature columns (presumably identifiers / a saved CSV index).
drop_cols = [
    class_target_name,
    "Turbine_ID",
    "Timestamp",
    "Unnamed: 0",
]

In [None]:
# Per-class row counts of the raw splits, before any undersampling.
print("\nTest unbalanced:\n", test[class_target_name].value_counts(), sep="\n")
print("\nTrain unbalanced:")
# Bare last expression: the notebook renders the train counts as the cell output.
train[class_target_name].value_counts()

In [None]:
# Undersample the "low" class (presumably the over-represented one) by dropping
# a fixed number of randomly chosen "low" rows from each split.
# The sampling is seeded so that Restart & Run All always removes the same rows.
test_balanced = test.drop(
    test[test[class_target_name] == "low"].sample(n=126000, random_state=0).index
)
train_balanced = train.drop(
    train[train[class_target_name] == "low"].sample(n=211000, random_state=0).index
)

In [None]:
# Per-class row counts after undersampling the "low" class.
print("\nTest balanced:\n", test_balanced[class_target_name].value_counts(), sep="\n")
print("\nTrain balanced:")
# Bare last expression: the notebook renders the train counts as the cell output.
train_balanced[class_target_name].value_counts()

In [None]:
class FillMethod():
    """Impute missing values in a train/test pair and undersample one class.

    The constructor fills NaN values with the chosen strategy, prints the
    number of NaN values that remain in the train and test frames, drops a
    fixed number of randomly chosen rows of ``majority_label`` from each frame
    and finally shuffles both frames. The results are exposed as ``self.train``
    and ``self.test`` and through the ``get_*`` accessors.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the feature columns and the target column.
    method : str
        ``"median"``, ``"ffill"`` (alias ``"pad"``) or ``"bfill"``
        (alias ``"backfill"``).
    target_name : str, optional
        Name of the label column. Defaults to the notebook-level
        ``class_target_name``.
    drop_columns : list of str, optional
        Columns removed by ``get_train_x`` / ``get_test_x``. Defaults to the
        notebook-level ``drop_cols``.
    n_drop_train, n_drop_test : int
        Number of ``majority_label`` rows to remove from each frame.
    majority_label : str
        Label value of the class that is undersampled.
    random_state : int or None
        Seed for the undersampling and the shuffle. A fixed seed makes every
        fill method drop the same rows, so their scores are comparable.

    Raises
    ------
    ValueError
        If ``method`` is not supported, or if more rows should be dropped than
        there are rows of ``majority_label``.

    Notes
    -----
    * ``"median"`` fills BOTH frames with the medians of the numeric columns
      of ``train``, so no statistic of the test data is used for imputation.
    * ``"ffill"`` cannot fill leading NaN values and ``"bfill"`` cannot fill
      trailing ones; the printed counts show how many NaN values remain.
    """

    _FFILL_NAMES = ("ffill", "pad")
    _BFILL_NAMES = ("bfill", "backfill")

    def __init__(self, train, test, method, *, target_name=None, drop_columns=None,
                 n_drop_train=211000, n_drop_test=126000, majority_label="low",
                 random_state=0) -> None:
        self.method = method
        # Fall back to the notebook-level configuration only when the caller
        # did not pass explicit values.
        self.target_name = class_target_name if target_name is None else target_name
        self.drop_columns = list(drop_cols if drop_columns is None else drop_columns)

        if method == "median":
            # numeric_only=True: the frames also hold string columns, for which
            # a median is undefined (and raises in recent pandas versions).
            train_medians = train.median(numeric_only=True)
            filled_train = train.fillna(train_medians)
            filled_test = test.fillna(train_medians)
        elif method in self._FFILL_NAMES:
            # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
            filled_train = train.ffill()
            filled_test = test.ffill()
        elif method in self._BFILL_NAMES:
            filled_train = train.bfill()
            filled_test = test.bfill()
        else:
            raise ValueError(
                "Unsupported fill method {!r}; expected 'median', 'ffill' or 'bfill'".format(method)
            )
        # Remaining NaN counts (train first, then test).
        print(filled_train.isna().sum().sum())
        print(filled_test.isna().sum().sum())

        # create class balance
        self.test = self._undersample(filled_test, n_drop_test, majority_label, random_state)
        self.train = self._undersample(filled_train, n_drop_train, majority_label, random_state)

    def _undersample(self, frame, n_drop, majority_label, random_state):
        """Drop ``n_drop`` random rows of ``majority_label`` and shuffle the rest."""
        majority_rows = frame[frame[self.target_name] == majority_label]
        if n_drop > len(majority_rows):
            raise ValueError(
                "Cannot drop {} rows of class {!r}: only {} available".format(
                    n_drop, majority_label, len(majority_rows)
                )
            )
        dropped_index = majority_rows.sample(n=n_drop, random_state=random_state).index
        return frame.drop(dropped_index).sample(frac=1, random_state=random_state)

    def get_train_x(self):
        """Return the training features (train frame without the dropped columns)."""
        return self.train.drop(columns=self.drop_columns)

    def get_train_y(self):
        """Return the training labels."""
        return self.train[self.target_name]

    def get_test_x(self):
        """Return the test features (test frame without the dropped columns)."""
        return self.test.drop(columns=self.drop_columns)

    def get_test_y(self):
        """Return the test labels."""
        return self.test[self.target_name]

    def get_method(self):
        """Return the fill method name passed to the constructor."""
        return self.method

In [None]:
# Build one FillMethod (imputed + undersampled train/test pair) per strategy.
fill_methods = ["median", "ffill", "bfill"]
fillers = [FillMethod(train, test, fill_method) for fill_method in fill_methods]

In [None]:
# For every filler print, one value per line: the NaN count left in its train
# frame, the NaN count left in its test frame, and the fill method name.
for filler in fillers:
    print(
        filler.train.isna().sum().sum(),
        filler.test.isna().sum().sum(),
        filler.get_method(),
        sep="\n",
    )

In [None]:
def train_run(X, y, X_test, y_test, model):
    """Fit ``model`` on the training data and score it on both splits.

    The passed ``model`` is fitted in place. All metrics use weighted averaging.

    Args:
        X: training features.
        y: training labels.
        X_test: test features.
        y_test: test labels.
        model: estimator providing ``fit`` and ``predict``.

    Returns:
        dict with the keys ``"f1_train"`` (F1 on the training data) and
        ``"f1"``, ``"precision"``, ``"recall"`` (all on the test data).
    """
    model.fit(X, y)
    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X)

    return {
        "f1_train": f1_score(y, train_predictions, average='weighted'),
        "f1": f1_score(y_test, test_predictions, average='weighted'),
        "precision": precision_score(y_test, test_predictions, average='weighted'),
        "recall": recall_score(y_test, test_predictions, average='weighted'),
    }

In [None]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a parameter search.

    For every rank from 1 to ``n_top`` all candidates holding that rank are
    printed with their mean/std validation score and their parameters.

    Args:
        results: mapping with the entries ``"rank_test_score"``,
            ``"mean_test_score"``, ``"std_test_score"`` and ``"params"``
            (the notebook passes ``GridSearchCV.cv_results_``).
        n_top: highest rank that is still reported.
    """
    ranks = results["rank_test_score"]
    for rank in range(1, n_top + 1):
        # Several candidates can share a rank; a rank can also be absent.
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results["mean_test_score"][idx]
            score_std = results["std_test_score"][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, score_std))
            print("Parameters: {0}".format(results["params"][idx]))
            print("")

In [None]:
def train_runs(fillers, param_grid=None):
    """Tune and evaluate one decision tree per fill strategy.

    For every filler the labels are integer-encoded, a ``GridSearchCV`` over
    ``param_grid`` (scoring ``"f1_weighted"``) is run on the training data, the
    search results are printed via ``report`` and the best estimator is scored
    with ``train_run``.

    Args:
        fillers: iterable of objects providing ``get_train_x``, ``get_train_y``,
            ``get_test_x``, ``get_test_y`` and ``get_method``.
        param_grid: parameter grid for the search. Defaults to
            ``{"max_depth": [20]}``. A wider grid that was previously tried::

                {
                    "max_depth": [5, 10, 20, 30, 50],
                    "min_samples_leaf": [1, 2, 5, 10],
                    "class_weight": ["balanced", None],
                    "criterion": ["gini", "entropy", "log_loss"],
                }

    Returns:
        list of dicts, one per filler, with the keys ``"type"`` (fill method),
        ``"f1_train"``, ``"f1"``, ``"precision"``, ``"recall"`` and ``"clf"``
        (the fitted best estimator).

    Raises:
        ValueError: if the test labels contain a class that does not occur in
            the training labels (raised by ``LabelEncoder.transform``).
    """
    if param_grid is None:
        param_grid = {
            "max_depth": [20],
        }

    best_scores = []

    for filler in fillers:
        X_train = filler.get_train_x()
        X_test = filler.get_test_x()

        # Fit the encoder on the training labels only and reuse that mapping
        # for the test labels. Refitting on the test labels would silently
        # assign different codes whenever the two label sets differ.
        le = preprocessing.LabelEncoder()
        y_train = le.fit_transform(filler.get_train_y())
        y_test = le.transform(filler.get_test_y())

        base_estimator = DecisionTreeClassifier(random_state=0)
        search = GridSearchCV(base_estimator, param_grid, scoring="f1_weighted").fit(X_train, y_train)
        clf = search.best_estimator_
        report(search.cv_results_)

        # train_run fits clf again on the full training data before scoring.
        scores = train_run(X_train, y_train, X_test, y_test, model=clf)

        best_scores.append({
            "type": filler.get_method(),
            "f1_train": scores["f1_train"],
            "f1": scores["f1"],
            "precision": scores["precision"],
            "recall": scores["recall"],
            "clf": clf,
        })

    return best_scores

In [None]:
# %run ./utility/model_loader.py -i

In [None]:
# Tune, fit and score one decision tree per fill strategy; yields one result dict per filler.
best_scores = train_runs(fillers)

In [None]:
# Show the collected results: fill method, train/test F1, precision, recall and the fitted classifier.
best_scores

In [None]:
# Predict the test split of the first filler ("median") with the classifier trained for it.
# The predictions are integer class codes, because the classifier was fitted on LabelEncoder output.
y_pred = best_scores[0]["clf"].predict(fillers[0].get_test_x())

In [None]:
# Distribution of the predicted (integer-encoded) classes on the test split.
plt.hist(y_pred, bins=5)
plt.title("Predicted class distribution (test split)")
plt.xlabel("Encoded risk level")
plt.ylabel("Number of samples")
# plt.show() renders the figure and keeps the hist() return tuple out of the cell output.
plt.show()

In [None]:
# Distribution of the (undersampled) training labels of the first filler ("median").
# NOTE(review): the predictions plotted earlier are for the test split; if the two
# plots are meant to be compared, get_test_y() may be the intended input - confirm.
plt.hist(fillers[0].get_train_y(), bins=5)
plt.title("Risk level distribution (training split, median fill)")
plt.xlabel("Risk level")
plt.ylabel("Number of samples")
# plt.show() renders the figure and keeps the hist() return tuple out of the cell output.
plt.show()