In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, f1_score, precision_score, recall_score
from sklearn import tree

In [None]:
# Read the train and test gearbox CSVs (comma-separated) from the
# first_clean data directory.
train, test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in (
        "./data/first_clean/train_gearbox.csv",
        "./data/first_clean/test_gearbox.csv",
    )
)

In [None]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"
# Columns excluded from the feature matrix: both targets, the identifier and
# timestamp columns, and "Unnamed: 0" (presumably a saved index column -- verify).
drop_cols = [reg_target_name, class_target_name, "Turbine_ID", "Timestamp", "Unnamed: 0"]

# Fixed seed so the 10% training subsample, and every score derived from it,
# is reproducible after a kernel restart.
RANDOM_SEED = 42

# `sample` returns rows in random order. Sorting by index restores the original
# file order so that the order-dependent ffill/bfill imputation applied to
# X_train later fills from neighbouring rows instead of arbitrary ones.
train_sample = train.sample(frac=0.1, random_state=RANDOM_SEED).sort_index()

# Features and classification labels are both derived from the same frame so
# they stay row-aligned.
X_train = train_sample.drop(columns=drop_cols)
y_train = train_sample[class_target_name]
X_valid = test.drop(columns=drop_cols)
y_valid = test[class_target_name]

In [None]:

# Imputed copies of the train / validation feature matrices, keyed by the
# name of the missing-value strategy used to build them.
X_compounded = {}
X_test_compounded = {}

# Rows are deliberately NOT shuffled here: scikit-learn pairs feature rows and
# labels by position, so shuffling the features independently of y_train /
# y_valid would train and score the models against the wrong labels.

# Forward fill: propagate the previous row's value into each gap.
# NOTE(review): gaps at the very start of a column stay NaN after ffill (and
# gaps at the very end after bfill) -- confirm the data has none, or that the
# installed scikit-learn version accepts NaN inputs.
X_compounded["forward_fill"] = X_train.ffill()
X_test_compounded["forward_fill"] = X_valid.ffill()

# Backward fill: propagate the next row's value into each gap.
X_compounded["backward_fill"] = X_train.bfill()
X_test_compounded["backward_fill"] = X_valid.bfill()

# Median fill: column medians are estimated on the training features only and
# reused for the validation features, so no validation-set statistics leak
# into preprocessing and both sets receive the same transformation.
train_medians = X_train.median()
X_compounded["median_fill"] = X_train.fillna(train_medians)
X_test_compounded["median_fill"] = X_valid.fillna(train_medians)

In [None]:
def train_run(X, y, X_test, y_test, depth=5, name=""):
    """Fit one decision tree and report its train / test classification scores.

    Parameters
    ----------
    X, y : training features and their labels (row-aligned).
    X_test, y_test : evaluation features and their labels (row-aligned).
    depth : value passed to ``DecisionTreeClassifier(max_depth=...)``.
    name : label used only in the printed score lines.

    Returns
    -------
    (scores, clf) : ``scores`` is a dict with the test-set ``"f1"``,
        ``"precision"`` and ``"recall"``; ``clf`` is the fitted classifier.
    """
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X, y)

    # Predict on the inputs exactly as they were passed to `fit`; converting
    # them to bare arrays would drop the column names the tree was fitted with.
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X)

    # Train F1 is scored against the labels passed in as `y`, not against a
    # notebook-level variable, so the function is correct for any caller.
    f1_train = f1_score(y, y_train_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)

    print("f1 for {}: {}".format(name, f1))
    print("f1 train for {}: {}".format(name, f1_train))
    print("precision for {}: {}".format(name, precision))
    print("recall for {}: {}".format(name, recall))

    scores = {
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

    return scores, clf

In [None]:
def train_runs(X_compound, X_test_compound, y_train, y_test, depths=[5]):
    """Train a tree per (feature variant, depth) and keep the best depth per variant.

    Parameters
    ----------
    X_compound : dict mapping a variant name to its training features.
    X_test_compound : dict with the same keys, holding the evaluation features.
    y_train, y_test : labels, shared by every variant.
    depths : iterable of ``max_depth`` values to try for each variant.

    Returns
    -------
    list of dict, one per variant in iteration order of ``X_compound``, with
    keys ``"type"``, ``"best_config"`` (``{"depth": ...}``), ``"f1"``,
    ``"precision"``, ``"recall"`` and ``"clf"`` (the fitted classifier with
    the highest test F1; ties keep the earliest depth).
    """
    best_scores = []

    for fe_type in X_compound:
        X_train = X_compound[fe_type]
        X_test = X_test_compound[fe_type]
        f1_best = 0
        precision_best = 0
        recall_best = 0
        best_config = {}
        best_clf = None
        for depth in depths:
            scores, clf = train_run(X_train, y_train, X_test, y_test, depth=depth, name=fe_type)
            # Always record the first run, then replace it only on a strictly
            # better F1. Without the `is None` check a variant whose F1 is 0 at
            # every depth would come back with no classifier and no config.
            if best_clf is None or scores["f1"] > f1_best:
                f1_best = scores["f1"]
                best_clf = clf
                precision_best = scores["precision"]
                recall_best = scores["recall"]
                best_config["depth"] = depth
        best_scores.append({
            "type": fe_type, "best_config": best_config,
            "f1": f1_best, "precision": precision_best, "recall": recall_best, "clf": best_clf})

    return best_scores

In [None]:
# Try three tree depths on every imputation variant and keep the best one per variant.
best_scores = train_runs(
    X_compound=X_compounded,
    X_test_compound=X_test_compounded,
    y_train=y_train,
    y_test=y_valid,
    depths=[40, 80, 120],
)

In [None]:
# Select the result by imputation name rather than by list position, so the
# classifier is always paired with the feature variant it was trained on.
fill_name = "forward_fill"
best_for_fill = next(entry for entry in best_scores if entry["type"] == fill_name)
y_pred = best_for_fill["clf"].predict(X_compounded[fill_name])

In [None]:
# Distribution of the labels predicted on the forward-filled training features.
fig, ax = plt.subplots()
ax.hist(y_pred)
ax.set(
    title="Predicted labels (forward-filled training features)",
    xlabel="Predicted class",
    ylabel="Number of samples",
)
plt.show()