In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the gearbox train/test splits from comma-separated files under ./data/first_clean.
# NOTE(review): presumably these are written by an earlier cleaning notebook — confirm.
train = pd.read_csv("./data/first_clean/train_gearbox.csv", sep=",")
test = pd.read_csv("./data/first_clean/test_gearbox.csv", sep=",")

In [None]:
# --- Configuration ---
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"
# Columns that are never model inputs: both targets plus identifier / bookkeeping columns.
drop_cols = [reg_target_name, class_target_name, "Turbine_ID", "Timestamp", "Unnamed: 0", "index_y"]
train_frac = 0.2
fill_method = "bfill"
# Fixed seed so the subsample and the shuffles below are identical on every run.
random_seed = 0

# `fillna(method=...)` is deprecated in recent pandas; dispatch to the dedicated
# fill methods instead (both spellings of each direction are accepted).
_fillers = {
    "bfill": pd.DataFrame.bfill,
    "backfill": pd.DataFrame.bfill,
    "ffill": pd.DataFrame.ffill,
    "pad": pd.DataFrame.ffill,
}
_fill = _fillers[fill_method]

# --- Impute, then subsample (train) / shuffle (test) ---
# NOTE(review): the fill runs over the whole frame in file order, without grouping,
# so values can be carried across Turbine_ID boundaries — confirm this is acceptable.
filled_train = _fill(train).sample(frac=train_frac, random_state=random_seed)
filled_test = _fill(test).sample(frac=1, random_state=random_seed)

# --- Feature matrices and classification target ---
X_train = filled_train.drop(columns=drop_cols)
y_train = filled_train[class_target_name]
X_test = filled_test.drop(columns=drop_cols)
y_test = filled_test[class_target_name]

In [None]:
def get_redundant_pairs(X_train):
    """Return the (column, column) label pairs on or below the diagonal.

    For every column at position ``i`` the result contains ``(cols[i], cols[j])``
    for all ``j <= i``, i.e. each self-pair plus one ordering of every
    two-column combination.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose column labels are paired up.

    Returns
    -------
    set of tuple
        Label pairs to discard from an unstacked correlation matrix.
    """
    names = list(X_train.columns)
    return {
        (later, earlier)
        for position, later in enumerate(names)
        for earlier in names[: position + 1]
    }

def get_top_abs_correlations(X_train, thresholds=[0.5]):
    """Rank feature pairs by absolute correlation and filter them per threshold.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; ``X_train.corr()`` supplies the pairwise correlations.
    thresholds : list of float
        Cut-offs; a pair is kept for a cut-off when its absolute correlation is
        strictly greater than it.

    Returns
    -------
    list of pandas.Series
        One Series per threshold, indexed by (column, column) pairs and sorted
        by absolute correlation in descending order.
    """
    pairwise = X_train.corr().abs().unstack()
    # Remove self-pairs and the mirrored duplicate of every pair.
    unique_pairs = pairwise.drop(labels=get_redundant_pairs(X_train))
    ranked = unique_pairs.sort_values(ascending=False)
    return [ranked[ranked > cutoff] for cutoff in thresholds]

def get_indexes_to_drop(corr_df, X_train, y_train):
    """Pick, for each correlated feature pair, the member to discard.

    The member with the weaker relationship to the target is dropped. Strength
    is measured as the absolute correlation with ``y_train``, so a strongly
    negatively correlated feature is treated as just as informative as a
    strongly positively correlated one. This matches how the pairs themselves
    are selected (by absolute correlation).

    Parameters
    ----------
    corr_df : pandas.Series
        Series whose index entries are (column, column) pairs; only the index
        is used.
    X_train : pandas.DataFrame
        Feature matrix containing every column named in ``corr_df.index``.
    y_train : pandas.Series
        Target the features are correlated against.

    Returns
    -------
    set
        Column labels to drop. On a tie the first member of the pair is dropped.
    """
    # Cache so each column's target correlation is computed once, even when the
    # column appears in many pairs.
    strength_by_col = {}

    def target_strength(col):
        if col not in strength_by_col:
            corr = X_train[col].corr(y_train)
            # An undefined correlation (NaN) carries no evidence of usefulness.
            strength_by_col[col] = 0.0 if pd.isna(corr) else abs(corr)
        return strength_by_col[col]

    indexes_to_drop = set()
    for pair in corr_df.index:
        first, second = pair[0], pair[1]
        if target_strength(first) > target_strength(second):
            indexes_to_drop.add(second)
        else:
            indexes_to_drop.add(first)

    return indexes_to_drop

def corr_filter(X_train, y_train, thresholds=[0.5]):
    """Build one correlation-filtered copy of ``X_train`` per threshold.

    For each threshold, the feature pairs returned by
    ``get_top_abs_correlations`` are passed to ``get_indexes_to_drop`` and the
    selected columns are removed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix.
    y_train : pandas.Series
        Target used to decide which member of a pair to drop.
    thresholds : list of float
        Absolute-correlation cut-offs.

    Returns
    -------
    list of pandas.DataFrame
        Filtered frames, in the same order as ``thresholds``.
    """
    return [
        X_train.drop(labels=get_indexes_to_drop(pairs, X_train, y_train), axis=1)
        for pairs in get_top_abs_correlations(X_train, thresholds)
    ]

In [None]:
def mutual_info(X_train, y_train, num_cols=[4], random_state=0):
    """Select the top-k features by mutual information with the target.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix.
    y_train : array-like
        Class labels.
    num_cols : list of int
        Each entry ``k`` yields one frame holding the ``k`` highest-scoring
        columns (all columns if ``k`` exceeds the column count).
    random_state : int or None, default 0
        Forwarded to ``mutual_info_classif`` so the ranking is reproducible
        between runs; pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    list of pandas.DataFrame
        One column subset of ``X_train`` per entry in ``num_cols``, with
        columns ordered from highest to lowest score.
    """
    scores = mutual_info_classif(X_train, y_train, random_state=random_state)
    # argsort is ascending; reverse it to rank from most to least informative.
    ranked_cols = np.array(X_train.columns)[np.argsort(scores)[::-1]]
    return [X_train[ranked_cols[0:k]] for k in num_cols]

In [None]:
# Number of training rows whose class label equals 1.
int(y_train.eq(1).sum())

In [None]:
# Hyper-parameters of the two feature-selection strategies (full sweeps kept for reference).
corr_filter_thresholds = [0.6]# [0.5, 0.6, 0.7, 0.8, 0.9]
mutual_info_cols = [10]# [10, 20, 30, 40, 50, 60, 70, 80, 90]

# Training variants: the untouched baseline plus one frame per hyper-parameter value.
X_compounded = {
    "baseline": [X_train],
    "corr_filter": corr_filter(X_train, y_train, thresholds=corr_filter_thresholds),
    "mutual_info": mutual_info(X_train, y_train, num_cols=mutual_info_cols),
}

# Test variants: each test frame is restricted to the columns of its training counterpart.
X_test_compounded = {
    "baseline": [X_test],
    "corr_filter": [X_test[variant.columns] for variant in X_compounded["corr_filter"]],
    "mutual_info": [X_test[variant.columns] for variant in X_compounded["mutual_info"]],
}

In [None]:
def train_run(X, y, X_test, y_test, clf):
    """Fit ``clf`` on the training data and score it on both splits.

    Parameters
    ----------
    X, y : training features and labels; ``clf`` is (re)fitted on them.
    X_test, y_test : held-out features and labels.
    clf : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        ``"f1_train"`` (F1 on the training data) and ``"f1"``, ``"precision"``,
        ``"recall"`` (all on the test data).
    """
    clf.fit(X, y)
    pred_test, pred_train = clf.predict(X_test), clf.predict(X)

    train_f1 = f1_score(y, pred_train)
    test_metrics = {
        label: metric(y_test, pred_test)
        for label, metric in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    }

    return {
        "f1_train": train_f1,
        "f1": test_metrics["f1"],
        "precision": test_metrics["precision"],
        "recall": test_metrics["recall"],
    }

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a grid-search result.

    Parameters
    ----------
    results : mapping
        Must provide ``"rank_test_score"``, ``"mean_test_score"``,
        ``"std_test_score"`` and ``"params"`` (the layout of
        ``GridSearchCV.cv_results_``).
    n_top : int
        Ranks ``1 .. n_top`` are printed; tied candidates are all listed.
    """
    rank = 1
    while rank <= n_top:
        for idx in np.flatnonzero(results["rank_test_score"] == rank):
            header = "Model with rank: {0}".format(rank)
            summary = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results["mean_test_score"][idx],
                results["std_test_score"][idx],
            )
            chosen = "Parameters: {0}".format(results["params"][idx])
            # The trailing empty item yields the blank separator line.
            print(header, summary, chosen, "", sep="\n")
        rank += 1

In [None]:
def train_runs(X_compounded, X_test_compounded):
    """Grid-search, refit and score one classifier per dataset variant.

    Reads the notebook-level names ``y_train``, ``y_test``,
    ``corr_filter_thresholds`` and ``mutual_info_cols``.

    Parameters
    ----------
    X_compounded : dict
        Maps a feature-selection type to a list of training frames.
    X_test_compounded : dict
        Same layout; entry ``[fe_type][i]`` is the test frame matching
        ``X_compounded[fe_type][i]``.

    Returns
    -------
    list of dict
        One entry per variant with keys ``"type"``, ``"config"``
        (``{"h_param", "iter"}`` for "corr_filter" / "mutual_info", otherwise
        ``None``), ``"f1_train"``, ``"f1"``, ``"precision"``, ``"recall"`` and
        ``"clf"`` (the fitted best estimator).
    """
    # Hyper-parameter list that produced each variant, per feature-selection type.
    # Types without an entry (e.g. "baseline") get ``config = None``.
    h_params_by_type = {
        "corr_filter": corr_filter_thresholds,
        "mutual_info": mutual_info_cols,
    }

    best_scores = []

    for fe_type, train_variants in X_compounded.items():
        h_params = h_params_by_type.get(fe_type)

        for i, X_train_loc in enumerate(train_variants):
            X_test_loc = X_test_compounded[fe_type][i]

            # Alternative estimator kept for reference:
            # base_estimator = DecisionTreeClassifier(random_state=0)
            # param_grid = {
            #     "max_depth": [5, 10, 20],
            #     "min_samples_leaf": [1, 10]
            # }
            # The "saga" solver shuffles the data, so it is seeded to make the
            # fitted model (and therefore every score below) reproducible.
            base_estimator = LogisticRegression(
                max_iter=300, penalty="l2", class_weight="balanced", random_state=0
            )
            param_grid = {
                # "penalty": ["l1", "l2", "elasticnet"],
                # "class_weight": ["balanced", None],
                "solver": ["saga"]
            }
            sh = GridSearchCV(base_estimator, param_grid, scoring="f1").fit(X_train_loc, y_train)
            clf = sh.best_estimator_
            report(sh.cv_results_)

            scores = train_run(X_train_loc, y_train, X_test_loc, y_test, clf=clf)

            config = None
            if h_params is not None:
                config = {"h_param": h_params[i], "iter": i}

            best_scores.append({
                "type": fe_type, "config": config, "f1_train": scores["f1_train"], "f1": scores["f1"],
                "precision": scores["precision"], "recall": scores["recall"], "clf": clf})

    return best_scores

In [None]:
# Train and evaluate one model per dataset variant; `best_scores` is a list with
# one result dict (scores, config, fitted classifier) per variant.
best_scores = train_runs(X_compounded, X_test_compounded)

In [None]:
# Print a three-line summary (variant, estimator, metrics) for every trained model.
for score in best_scores:
    setting = score["config"]["h_param"] if score["config"] else None
    metric_values = [score[key] for key in ("f1_train", "f1", "precision", "recall")]
    print("Typ: {} mit Einstellung {}:".format(score["type"], setting))
    print(score["clf"])
    print("f1 train: {}, f1: {}, precision: {}, recall: {}".format(*metric_values))

In [None]:
# Pick the variant with the highest test F1 (the first one wins on ties).
# Starting from -inf guarantees a model is selected even when every F1 is 0.
best_f1 = float("-inf")
best_data = None
my_clf = None
for score in best_scores:
    if score["f1"] > best_f1:
        best_f1 = score["f1"]
        my_clf = score["clf"]
        # Variants without a config (the baseline) have exactly one dataset, at position 0.
        # Named `best_idx` rather than `iter` so the builtin is not shadowed notebook-wide.
        best_idx = score["config"]["iter"] if score["config"] is not None else 0
        best_data = X_compounded[score["type"]][best_idx]

In [None]:
# Join the selected training features with their labels, then predict on the
# feature part and keep the true labels alongside for comparison.
my_data = pd.concat([best_data, y_train], axis="columns")
my_sample = my_data
features_only = my_sample.drop(columns=[class_target_name])
prediction = my_clf.predict(features_only)
actual = my_sample[class_target_name]

In [None]:
# Per-feature importance of the selected model.
# Tree-based estimators expose `feature_importances_`; linear models such as
# LogisticRegression do not, so fall back to the absolute coefficients
# (averaged over classes, which is a no-op for the binary case).
# NOTE(review): coefficient magnitudes are only comparable between features
# that share a scale — confirm whether the inputs are standardised.
if hasattr(my_clf, "feature_importances_"):
    importances = my_clf.feature_importances_
else:
    importances = np.abs(my_clf.coef_).mean(axis=0)

fig, ax = plt.subplots()
ax.bar(best_data.columns, importances)
ax.tick_params(axis="x", rotation=90)
ax.set_xlabel("feature")
ax.set_ylabel("importance")
ax.set_title("Feature importance of the selected model")
plt.show()

In [None]:
# Number of rows in `actual` whose label equals 1.
int(actual.eq(1).sum())

In [None]:
# Histogram (20 bins) of the labels predicted by `my_clf` for the selected
# training features; drawn on pyplot's current figure.
plt.hist(prediction, bins=20)
# plt.hist(actual, bins=20)

In [None]:
# Export layout: the targets and identifier columns followed by the selected features.
# "Unnamed: 0" is left out of the export. A new list is built instead of mutating
# `drop_cols`, so this cell can be re-run and the earlier split cell still sees
# the full drop list.
meta_cols = [col for col in drop_cols if col != "Unnamed: 0"]
out_cols = meta_cols + best_data.columns.to_list()
train_out = filled_train[out_cols]
test_out = filled_test[out_cols]

In [None]:
# Preview the first rows of the test frame that will be exported.
test_out.head()

In [None]:
# Order the export frame by its "Timestamp" column for a sanity-check plot.
# NOTE(review): the CSV is read without date parsing, so this presumably sorts
# strings lexicographically — confirm the timestamp format sorts chronologically.
check_df = test_out.sort_values(by="Timestamp")

In [None]:
# Plot the classification target for turbine "T06" (x-axis is the frame's index).
is_t06 = check_df["Turbine_ID"] == "T06"
plt.plot(check_df.loc[is_t06, class_target_name])

In [None]:
# Create the export directory if it does not exist yet (no error when it already does).
os.makedirs("./data/feature_selected", exist_ok=True)

In [None]:
# Write the feature-selected train/test frames to CSV.
# `to_csv` also writes the DataFrame index as an unnamed first column by default.
# NOTE(review): that is presumably where an "Unnamed: 0" column comes from when
# such a file is read back — confirm downstream readers expect it.
train_out.to_csv("./data/feature_selected/train_gearbox_classif.csv")
test_out.to_csv("./data/feature_selected/test_gearbox_classif.csv")