In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn import tree

In [None]:
# Load the gearbox regression train and test CSVs (comma-separated, which is
# the delimiter pandas assumes when none is given).
train = pd.read_csv("./data/first_clean/train_simple_regression_gearbox.csv")
test = pd.read_csv("./data/first_clean/test_simple_regression_gearbox.csv")

In [None]:
# Peek at the first five rows of the training frame.
train.head(n=5)

In [None]:
# Columns that are not model inputs: the regression target plus the two
# identifier-like columns that are removed from both splits.
non_feature_columns = ["RUL (Target)", "Turbine_ID", "Timestamp"]

X_train, y_train = train.drop(columns=non_feature_columns), train["RUL (Target)"]
X_test, y_test = test.drop(columns=non_feature_columns), test["RUL (Target)"]

In [None]:
def get_redundant_pairs(X_train):
    """Return the diagonal and lower-triangular label pairs of the correlation matrix.

    For columns ``c0, c1, ..., cn`` the result contains every pair
    ``(ci, cj)`` with ``j <= i``: each column paired with itself and with
    every column that precedes it.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose column labels define the pairs.

    Returns
    -------
    set of tuple
        ``(later_column, earlier_or_same_column)`` label pairs.
    """
    names = list(X_train.columns)
    return {
        (later, earlier)
        for position, later in enumerate(names)
        for earlier in names[: position + 1]
    }

def get_top_abs_correlations(X_train, threshold=0.5):
    """Return feature pairs whose absolute correlation exceeds ``threshold``.

    The absolute correlation matrix of ``X_train`` is flattened to a Series
    indexed by ``(column, column)`` pairs; self-pairs and mirrored duplicates
    (as listed by ``get_redundant_pairs``) are removed so each unordered pair
    appears once.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame to analyse.
    threshold : float, default 0.5
        Only pairs with absolute correlation strictly greater than this
        value are kept.

    Returns
    -------
    pandas.Series
        Absolute correlations sorted in descending order, indexed by pair.
    """
    pairwise = X_train.corr().abs().unstack()
    redundant = get_redundant_pairs(X_train)
    ranked = pairwise.drop(labels=redundant).sort_values(ascending=False)
    is_strong = ranked > threshold
    return ranked[is_strong]

def get_indexes_to_drop(corr_df, X_train, y_train):
    """Choose which column of each correlated feature pair to discard.

    For every pair in ``corr_df.index`` the column that is less strongly
    correlated with the target is marked for removal. Strength is the
    *absolute* correlation, so a feature with a strong negative relationship
    to the target is not discarded in favour of a weakly positive one.

    Parameters
    ----------
    corr_df : pandas.Series
        Series whose index holds ``(column_a, column_b)`` pairs.
    X_train : pandas.DataFrame
        Feature frame containing every column named in ``corr_df.index``.
    y_train : pandas.Series
        Target values used to rank the two columns of each pair.

    Returns
    -------
    set
        Column labels to drop. On a tie (or when the comparison is undefined,
        e.g. a NaN correlation) the first column of the pair is dropped.
    """
    # A column can appear in many pairs; compute its target correlation once.
    target_strength = {}

    def _strength(column):
        if column not in target_strength:
            target_strength[column] = abs(X_train[column].corr(y_train))
        return target_strength[column]

    indexes_to_drop = set()

    for first, second in corr_df.index:
        if _strength(first) > _strength(second):
            indexes_to_drop.add(second)
        else:
            indexes_to_drop.add(first)

    return indexes_to_drop

def corr_filter(X_train, y_train, threshold=0.5):
    """Drop one feature from every pair correlated above ``threshold``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame to filter.
    y_train : pandas.Series
        Target values, used to decide which column of a pair is removed.
    threshold : float, default 0.5
        Absolute pairwise correlation above which a pair is treated as
        redundant.

    Returns
    -------
    pandas.DataFrame
        ``X_train`` without the columns selected for removal.
    """
    correlated_pairs = get_top_abs_correlations(X_train, threshold)
    redundant_columns = get_indexes_to_drop(correlated_pairs, X_train, y_train)
    return X_train.drop(labels=redundant_columns, axis=1)

In [None]:
def mutual_info(X_train, y_train, num_cols=4, random_state=10):
    """Keep the ``num_cols`` features with the highest mutual information.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame to rank.
    y_train : array-like
        Regression target.
    num_cols : int, default 4
        Number of top-ranked columns to keep. If it exceeds the number of
        columns, all columns are returned (in ranked order).
    random_state : int or None, default 10
        Seed forwarded to ``mutual_info_regression``. The estimator is
        randomised, so without a fixed seed the ranking (and therefore the
        selected columns) can differ between runs. Pass ``None`` to restore
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``X_train``, ordered from highest to lowest
        mutual-information score.
    """
    # Named `scores` rather than `mutual_info` so the function name is not shadowed.
    scores = mutual_info_regression(X_train, y_train, random_state=random_state)
    ranking = np.argsort(scores)[::-1]
    selected_cols = np.array(X_train.columns)[ranking][:num_cols]
    return X_train[selected_cols]

In [None]:
# Hyperparams for grid search
# (wider grids kept for reference:
#  thresholds [0.5, 0.6, 0.7, 0.8, 0.9]; column counts [10, 20, 30, 40, 50, 60, 70, 80, 90])
corr_filter_thresholds = [0.8, 0.9]
mutual_info_cols = [80, 90]

# Create correlation filter datasets: one filtered frame per threshold
X_train_corr_filters = [
    corr_filter(X_train, y_train, threshold=threshold)
    for threshold in corr_filter_thresholds
]

# Create mutual information datasets: one frame per number of kept columns
X_train_mutuals = [
    mutual_info(X_train, y_train, num_cols=num_cols)
    for num_cols in mutual_info_cols
]

# Candidate training sets grouped by feature-selection strategy; the
# unfiltered frame is the single "baseline" candidate.
X_compounded = {
    "baseline": [X_train],
    "corr_filter": X_train_corr_filters,
    "mutual_info": X_train_mutuals,
}

In [None]:
def train_run(X, y, n_splits=5):
    """Cross-validate a decision-tree regressor and collect per-fold results.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.array`` before row indexing.
    y : array-like
        Target values; converted with ``np.array`` before row indexing.
    n_splits : int, default 5
        Number of ``KFold`` folds.

    Returns
    -------
    tuple
        ``(mean_mse, models)`` where ``mean_mse`` is the mean of the per-fold
        mean squared errors and ``models`` is the list of fitted
        ``DecisionTreeRegressor`` instances, one per fold.
    """
    # Convert once up front; the fold loop only slices these arrays.
    features = np.array(X)
    targets = np.array(y)

    folds = KFold(n_splits=n_splits)
    fold_mses = []
    fold_models = []

    for fit_rows, eval_rows in folds.split(X, y):
        model = DecisionTreeRegressor(random_state=10)
        model.fit(features[fit_rows], targets[fit_rows])
        fold_models.append(model)

        predictions = model.predict(features[eval_rows])
        fold_mses.append(mean_squared_error(targets[eval_rows], predictions))
        # Progress output: the MSEs accumulated so far.
        print(fold_mses)

    return np.mean(fold_mses), fold_models


In [None]:
def train_runs(X_compound, y_train):
    """Evaluate every candidate dataset and keep the best one per strategy.

    Parameters
    ----------
    X_compound : dict
        Maps a feature-selection strategy name to a list of candidate
        training frames.
    y_train : array-like
        Target values shared by all candidates.

    Returns
    -------
    list of dict
        One single-key dict per strategy, in iteration order of
        ``X_compound``: ``{strategy: {"best_config", "score", "clfs"}}``.
        ``best_config`` is the index of the candidate with the LOWEST mean
        cross-validated MSE, ``score`` is that MSE and ``clfs`` are the
        per-fold models trained on that same candidate. All three are
        ``None`` when a strategy has no candidates.
    """
    best_scores = []

    for fe_type, datasets in X_compound.items():
        best_score = None
        best_config = None
        best_clfs = None
        for i, dataset in enumerate(datasets):
            mse, clfs = train_run(dataset, y_train)
            # MSE is an error measure: lower is better.
            if best_score is None or mse < best_score:
                best_score = mse
                best_config = i
                # Keep the models that belong to the winning configuration,
                # not whichever dataset happened to be evaluated last.
                best_clfs = clfs
        best_scores.append(
            {fe_type: {"best_config": best_config, "score": best_score, "clfs": best_clfs}}
        )

    return best_scores

In [None]:
# Run the comparison over every candidate dataset of every strategy.
best_scores = train_runs(X_compound=X_compounded, y_train=y_train)

In [None]:
import pygraphviz as graphviz

In [None]:
# The first entry of best_scores belongs to "baseline", the first key of
# X_compounded; take its first fold's model and draw it.
baseline_models = best_scores[0]["baseline"]["clfs"]
some_tree = baseline_models[0]
# NOTE(review): despite the name, this holds whatever tree.plot_tree returns,
# which is presumably not DOT source — confirm before passing it to graphviz.
dot_data = tree.plot_tree(some_tree)