In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
import scipy

In [None]:
# Read the cleaned gearbox train/test splits ("," is already read_csv's default separator).
train = pd.read_csv("./data/first_clean/train_gearbox.csv")
test = pd.read_csv("./data/first_clean/test_gearbox.csv")

In [None]:
# Show five random training rows whose "RUL (Target)" value is below 1.
train.loc[train["RUL (Target)"].lt(1)].sample(5)

In [None]:
# Names of the regression and classification target columns.
reg_target_name, class_target_name = "RUL (Target)", "Failure (Target)"

# Columns removed from a frame to obtain the model's feature matrix.
drop_cols = [
    reg_target_name,
    class_target_name,
    "Turbine_ID",
    "Timestamp",
    "Unnamed: 0",
]

In [None]:
def fill_nans_w_method(train, test, methods: list, train_frac: float = 0.1, random_state=None) -> dict:
    """Impute missing values with each requested strategy and split the result.

    For every entry of ``methods`` both frames are imputed, row-sampled and split into
    a feature frame (every column except the module-level ``drop_cols``), the
    regression target (column ``reg_target_name``) and the ``Turbine_ID`` /
    ``Timestamp`` identifier columns.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.
        methods: Strategy names. ``"median"`` fills numeric columns with the
            per-column median of ``train``; ``"ffill"``/``"pad"`` and
            ``"bfill"``/``"backfill"`` propagate the neighbouring valid row.
        train_frac: Fraction of training rows kept after imputation. The default
            of 0.1 subsamples the training data to shorten training time.
        random_state: Forwarded to ``DataFrame.sample`` for both frames; ``None``
            (the default) leaves the sampling unseeded.

    Returns:
        A dict with the keys ``"X"``, ``"y"``, ``"train_timestamps"``, ``"X_test"``,
        ``"y_test"`` and ``"test_timestamps"``; each maps a strategy name to the
        corresponding DataFrame/Series.

    Raises:
        ValueError: If a strategy name is not one of the supported ones.
    """
    id_cols = ["Turbine_ID", "Timestamp"]

    method_dict = {
        "X": {},
        "y": {},
        "train_timestamps": {},
        "X_test": {},
        "y_test": {},
        "test_timestamps": {}
    }

    for method in methods:
        if method == "median":
            # numeric_only=True: the frames also hold string columns, on which
            # median() raises in pandas >= 2.0.
            # The training medians are reused for the test frame so that no
            # test-set statistic enters the imputation.
            medians = train.median(numeric_only=True)
            filled_train = train.fillna(medians)
            filled_test = test.fillna(medians)
        elif method in ("ffill", "pad"):
            # fillna(method=...) is deprecated in pandas; ffill()/bfill() are the
            # supported spellings of the same operation.
            filled_train, filled_test = train.ffill(), test.ffill()
        elif method in ("bfill", "backfill"):
            filled_train, filled_test = train.bfill(), test.bfill()
        else:
            raise ValueError(
                f"Invalid fill method {method!r}; expected 'median', 'ffill' or 'bfill'."
            )

        # Subsample the training rows; frac=1 only shuffles the test rows.
        filled_train = filled_train.sample(frac=train_frac, random_state=random_state)
        filled_test = filled_test.sample(frac=1, random_state=random_state)

        method_dict["X"][method] = filled_train.drop(columns=drop_cols)
        method_dict["y"][method] = filled_train[reg_target_name]
        method_dict["train_timestamps"][method] = filled_train[id_cols]
        method_dict["X_test"][method] = filled_test.drop(columns=drop_cols)
        method_dict["y_test"][method] = filled_test[reg_target_name]
        method_dict["test_timestamps"][method] = filled_test[id_cols]

    return method_dict

In [None]:

# Imputation strategies to compare.
fill_methods = ["ffill", "bfill", "median"]

# One imputed and split copy of the train/test data per strategy.
method_dict = fill_nans_w_method(train, test, fill_methods)

In [None]:
def discretize(values, threshold=0.99):
    """Map continuous values to binary labels: 0 below ``threshold``, 1 otherwise.

    Args:
        values: Scalar or array-like of numbers (list, ndarray, Series, ...).
        threshold: Values strictly below it become 0; everything else becomes 1.

    Returns:
        An integer ``numpy.ndarray`` with the same shape as ``values``. Empty input
        yields an empty array.
    """
    arr = np.asarray(values)
    # Test ``arr < threshold`` (not ``>=``) so NaN, for which every comparison is
    # False, maps to 1 exactly like the rule ``0 if x < threshold else 1``.
    return np.where(arr < threshold, 0, 1)

In [None]:
def train_run(X, y, X_test, y_test, clf):
    """Fit ``clf`` on the training data and score it on both splits.

    The regression error is measured on the raw test predictions; the
    classification metrics are computed after passing targets and predictions
    through ``discretize``.

    Returns:
        dict with the keys ``"f1_train"``, ``"mse"``, ``"f1"``, ``"precision"``
        and ``"recall"``.
    """
    clf.fit(X, y)
    test_pred = clf.predict(X_test)
    train_pred = clf.predict(X)

    # Binary versions of the true values and predictions for both splits.
    train_true_bin = discretize(y)
    train_pred_bin = discretize(train_pred)
    test_true_bin = discretize(y_test)
    test_pred_bin = discretize(test_pred)

    return {
        "f1_train": f1_score(train_true_bin, train_pred_bin),
        "mse": mean_squared_error(y_test, test_pred),
        "f1": f1_score(test_true_bin, test_pred_bin),
        "precision": precision_score(test_true_bin, test_pred_bin),
        "recall": recall_score(test_true_bin, test_pred_bin),
    }

In [None]:
# Helper that prints the top-ranked entries of a search's cv_results_ mapping
def report(results, n_top=3):
    """Print mean/std validation score and parameters for ranks 1..``n_top``.

    ``results`` is indexed by the keys ``"rank_test_score"``, ``"mean_test_score"``,
    ``"std_test_score"`` and ``"params"``. Every candidate sharing a rank is printed.
    """
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results["rank_test_score"] == rank):
            mean_score = results["mean_test_score"][idx]
            std_score = results["std_test_score"][idx]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][idx]}")
            print()

In [None]:
def train_runs(method_dict):
    """Grid-search, fit and score one decision-tree regressor per fill strategy.

    Args:
        method_dict: Mapping with the keys ``"X"``, ``"y"``, ``"X_test"`` and
            ``"y_test"``; each maps a fill-strategy name to the matching data.

    Returns:
        A list with one dict per strategy, holding ``"type"`` (the strategy name),
        the ``"f1_train"``, ``"mse"``, ``"f1"``, ``"precision"`` and ``"recall"``
        values returned by ``train_run``, and ``"clf"`` (the best estimator found
        by the grid search).
    """
    best_scores = []

    # Iterate over the strategies actually present in the argument instead of the
    # module-level ``fill_methods`` list, so the function works for any
    # ``method_dict`` and cannot hit a KeyError when the two differ.
    for fe_type in method_dict["X"]:
        X_train = method_dict["X"][fe_type]
        X_test = method_dict["X_test"][fe_type]
        y_train = method_dict["y"][fe_type]
        y_test = method_dict["y_test"][fe_type]

        base_estimator = DecisionTreeRegressor(random_state=0)
        param_grid = {
            "max_depth": [20],  # wider search candidate: scipy.stats.randint(5, 60)
            "min_samples_leaf": [1],  # wider search candidate: scipy.stats.randint(1, 100)
            "criterion": ["squared_error"],
        }
        search = GridSearchCV(base_estimator, param_grid, scoring="neg_mean_squared_error").fit(X_train, y_train)
        clf = search.best_estimator_
        report(search.cv_results_)

        scores = train_run(X_train, y_train, X_test, y_test, clf=clf)

        metric_names = ("f1_train", "mse", "f1", "precision", "recall")
        best_scores.append({
            "type": fe_type,
            **{name: scores[name] for name in metric_names},
            "clf": clf,
        })

    return best_scores

In [None]:
# %run ./utility/model_loader.py -i

In [None]:
# Train and score one model per fill strategy held in method_dict.
best_scores = train_runs(method_dict=method_dict)

In [None]:
# Display the per-strategy score dicts returned by train_runs.
best_scores

In [None]:
# Find the fill strategy whose model has the lowest test MSE.
best_mse = float('inf')
best_type = None
for score in best_scores:
    print(score["mse"])
    if score["mse"] < best_mse:
        # Record the new minimum. Without this update every finite score passed
        # the comparison and the last strategy always won.
        best_mse = score["mse"]
        best_type = score["type"]
        print(best_type)

# Rebuild that strategy's full data set: features, identifier columns and
# regression target side by side, with the training rows stacked on the test rows.
best_data = None
if best_type is not None:
    best_data_train = pd.concat(
        [method_dict["X"][best_type], method_dict["train_timestamps"][best_type], method_dict["y"][best_type]],
        axis=1,
    )
    best_data_test = pd.concat(
        [method_dict["X_test"][best_type], method_dict["test_timestamps"][best_type], method_dict["y_test"][best_type]],
        axis=1,
    )
    best_data = pd.concat([best_data_train, best_data_test])

In [None]:
# Preview the combined train + test rows of the selected fill strategy.
best_data.head()

In [None]:
# Time-ordered rows of turbine "T06" from the selected data set.
data_sorted = best_data.sort_values(by="Timestamp")
data_sorted = data_sorted[data_sorted["Turbine_ID"] == "T06"]

# Predict on the feature columns only. This statement was commented out although
# the plotting cells further down read ``y_pred``, so a fresh "Restart & Run All"
# stopped there with a NameError.
# NOTE(review): uses the first trained model, exactly as the commented-out line
# did — confirm that this is the model meant to be plotted.
y_pred = best_scores[0]["clf"].predict(data_sorted.drop([reg_target_name, "Turbine_ID", "Timestamp"], axis=1))

data_sorted.head()

In [None]:
# Predictions (top) and true target values (bottom), one point per row of data_sorted.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.scatter(range(len(y_pred)), y_pred)
ax1.set(title="Predicted RUL (Target)", ylabel="prediction")
y_label = data_sorted["RUL (Target)"]
ax2.scatter(range(len(y_label)), y_label)
ax2.set(title="Actual RUL (Target)", xlabel="row position (sorted by Timestamp)", ylabel="RUL (Target)")
# Keep the two titled panels from overlapping.
fig.tight_layout()

In [None]:
# Distribution of predictions (top) and true targets (bottom); values equal to 1
# are left out of both histograms.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.hist(y_pred[y_pred != 1], bins=20)
ax1.set(title="Predicted RUL (Target), values != 1", ylabel="count")
ax2.hist(y_label[y_label != 1], bins=20)
ax2.set(title="Actual RUL (Target), values != 1", xlabel="RUL (Target)", ylabel="count")
# Keep the two titled panels from overlapping.
fig.tight_layout()

In [None]:
# Estimator stored in the first entry of best_scores.
my_clf = best_scores[0]["clf"]

In [None]:
# Predict on the rows whose "RUL (Target)" value is below 1 and keep the true
# values for comparison.
# ``X_compounded`` / ``y_compounded`` are not defined in any visible cell, so this
# cell raised a NameError on a fresh kernel; the "ffill" training split from
# ``method_dict`` is used instead.
# NOTE(review): assumed to be the data the undefined names referred to — confirm.
my_data = pd.concat([method_dict["X"]["ffill"], method_dict["y"]["ffill"]], axis=1)
my_sample = my_data[my_data["RUL (Target)"] < 1]
prediction = my_clf.predict(my_sample.drop("RUL (Target)", axis=1))
actual = my_sample["RUL (Target)"]

In [None]:
# Overlay predicted and actual values on log-scaled counts. Both histograms are
# semi-transparent and labelled so the second one does not hide the first.
plt.hist(prediction, bins=20, log=True, alpha=0.6, label="prediction")
plt.hist(actual, bins=20, log=True, alpha=0.6, label="actual")
plt.xlabel("RUL (Target)")
plt.ylabel("count (log scale)")
plt.legend();