In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
from pandas import DataFrame
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score

In [2]:
train = pd.read_csv("./data/feature_selected/train_gearbox.csv", sep=",")
test = pd.read_csv("./data/feature_selected/test_gearbox.csv", sep=",")

In [3]:
reg_target_name = "RUL (Target)"
class_target_name = "Failure (Target)"
drop_cols = [reg_target_name, class_target_name, "Turbine_ID", "Timestamp", "Unnamed: 0", "index_y"]

fill_method="ffill"

filled_train = train.fillna(method=fill_method).sample(frac=0.1)
filled_test = test.fillna(method=fill_method).sample(frac=1)

X_train = filled_train.drop(columns=drop_cols)
y_train = filled_train[reg_target_name]
X_test = filled_test.drop(columns=drop_cols)
y_test = filled_test[reg_target_name]

In [4]:
configuration = {
    "max_depth": 20, 
    "min_samples_leaf":100,
    "random_state":0
}

In [5]:
def feature_importance_gini_importance(data_x:DataFrame, data_y: DataFrame):
    rf = DecisionTreeRegressor()
    rf.fit(data_x, data_y)
    fiArr = rf.feature_importances_
    fiArr = normalize(fiArr)
    names = data_x.columns.array
    fi = {}
    for i in range(len(names)):
        fi[names[i]] = fiArr[i]
    return fi

def feature_importance_permutation_based_importance(data_x:DataFrame, data_y:DataFrame):
    rf = DecisionTreeRegressor()
    rf.fit(data_x, data_y)
    fiArr = permutation_importance(rf, data_x, data_y)["importances_mean"]
    fiArr = normalize(fiArr)
    names = data_x.columns.array    
    fi = {}
    for i in range(len(names)):
        fi[names[i]] = fiArr[i]
    return fi

def get_unimportant_features(fi: dict[str, dict], threshold: float, names: dict[str]):
    methods = ["gi", "pbi"]        
    cols_to_drop = []
    for feature in names:
        if((not fi["gi"][feature]>threshold) | (not fi["pbi"][feature]>threshold)):
            cols_to_drop.append(feature)
    return cols_to_drop

def normalize(values: list):
    highestValue = 0
    for value in values:
        if(value>highestValue):
            highestValue=value
    for value in values:
        value=value/highestValue
    return values

fi = {
    "gi": feature_importance_gini_importance(X_train,y_train),
    "pbi": feature_importance_permutation_based_importance(X_train, y_train)
}


In [6]:
drop_cols_high = get_unimportant_features(fi, 0.10, X_train.columns.array)
drop_cols_medium = get_unimportant_features(fi, 0.05, X_train.columns.array)
drop_cols_low = get_unimportant_features(fi, 0.01, X_train.columns.array)

x_train={}
x_test={}

x_train["high_drop"] = X_train.drop(columns = drop_cols_high)
x_test["high_drop"] = X_test.drop(columns = drop_cols_high)
x_train["medium_drop"] = X_train.drop(columns = drop_cols_medium)
x_test["medium_drop"] = X_test.drop(columns = drop_cols_medium)
x_train["low_drop"] = X_train.drop(columns = drop_cols_low)
x_test["low_drop"] = X_test.drop(columns = drop_cols_low)
x_train["no_drop"] = X_train
x_test["no_drop"] = X_test

In [7]:
def get_eval_scores(inputs_train:DataFrame, targets_train:DataFrame, inputs_test:DataFrame, targets_test:DataFrame):
    dtr = DecisionTreeRegressor()
    dtr.fit(inputs_train, targets_train)
    eval_predict = dtr.predict(inputs_test)
    mae = mean_absolute_error(targets_test, eval_predict)
    r2 = r2_score(targets_test, eval_predict)
    return mae, r2

eval_items = {"no_drop", "low_drop", "medium_drop", "high_drop"}
mae = {}
r2 = {}
for item in eval_items:
    mae[item], r2[item] = get_eval_scores(x_train[item], y_train, x_test[item], y_test)
print("Mean Absolute Error:")
print(mae)
print("R2 Score:")
print(r2)


Mean Absolute Error:
{'medium_drop': 0.06502440594501455, 'high_drop': 0.05963964540974167, 'low_drop': 0.06649980982995893, 'no_drop': 0.06708901974398859}
R2 Score:
{'medium_drop': -0.41428430154780704, 'high_drop': -0.30398012547763487, 'low_drop': -0.434008216414856, 'no_drop': -0.4617962263129638}
