In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, accuracy_score
from sklearn.feature_selection import mutual_info_regression
from sdv.tabular import CTGAN
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
import matplotlib.pyplot as plt

In [2]:
# Name of the label column predicted by the classifier.
class_target_name = "Risk Level"

# Columns removed from both splits before modelling.
drop_cols = ["Turbine_ID", "Timestamp", "Unnamed: 0", "index_y"]

# Primary imputation direction; the name is kept at module level so that
# any later cell reading it keeps working.
fill_method = "ffill"


def load_split(path: str) -> DataFrame:
    """Read one CSV split, drop the columns in `drop_cols`, and impute gaps.

    Missing values are forward-filled first and then backward-filled, so
    NaNs in the leading rows (which a forward fill cannot reach) are
    covered as well. Both splits go through this same function so they
    are preprocessed identically.

    Parameters
    ----------
    path : str
        Location of the comma-separated file to load.

    Returns
    -------
    DataFrame
        The loaded frame without `drop_cols`, after ffill + bfill.
    """
    frame = pd.read_csv(path, sep=",").drop(columns=drop_cols)
    # .ffill()/.bfill() are the direct equivalents of fillna(method=...),
    # which is deprecated in recent pandas releases.
    return frame.ffill().bfill()


train = load_split("./data/multi/train_hydraulic.csv")
test = load_split("./data/multi/test_hydraulic.csv")

In [3]:
def split_x_y(data:DataFrame ,target_name: str):
    """Separate a frame into its feature columns and its target column.

    Parameters
    ----------
    data : DataFrame
        Frame holding both the features and the target column.
    target_name : str
        Label of the column to use as the target.

    Returns
    -------
    tuple
        `(x, y)`: `x` is `data` without the target column, `y` is the
        target column itself. `data` is not modified.

    Raises
    ------
    KeyError
        If `target_name` is not a column of `data`.
    """
    features = data.drop(columns=target_name)
    return features, data[target_name]

def checkNaN(data: DataFrame):
    """Return the total number of missing cells in `data`, as a string.

    The count is taken over every column of the frame (per-column null
    counts summed together) and converted with `str` before returning.
    """
    missing_total = data.isna().sum().sum()
    return str(missing_total)

def get_eval_scores(inputs_train:DataFrame, targets_train:DataFrame, inputs_test:DataFrame, targets_test:DataFrame, args):
    """Fit a decision tree on the training data and score it on the test data.

    Parameters
    ----------
    inputs_train, targets_train
        Features and labels the classifier is fitted on.
    inputs_test, targets_test
        Features the fitted classifier predicts on, and the labels those
        predictions are compared against.
    args : mapping
        Must provide the keys "max_depth", "min_samples_leaf",
        "random_state", "max_features" and "criterion"; each is passed to
        `DecisionTreeClassifier` under the same keyword.

    Returns
    -------
    The value of `accuracy_score(targets_test, predictions)`.
    """
    tree_params = {
        name: args[name]
        for name in ("max_depth", "min_samples_leaf", "random_state", "max_features", "criterion")
    }
    classifier = DecisionTreeClassifier(**tree_params)
    classifier.fit(inputs_train, targets_train)
    predictions = classifier.predict(inputs_test)
    return accuracy_score(targets_test, predictions)

def train_run(data: DataFrame, test:DataFrame, args):
    """Train on `data`, evaluate on `test`, and return the error rate.

    Both frames are split into features and labels on the module-level
    `class_target_name` column. The returned value is `1 - accuracy`, so
    lower is better, which is the orientation a minimiser expects.

    Parameters
    ----------
    data : DataFrame
        Frame used for fitting; must contain the target column.
    test : DataFrame
        Frame used for scoring; must contain the target column.
    args : mapping
        Hyperparameters forwarded unchanged to `get_eval_scores`.
    """
    fit_features, fit_labels = split_x_y(data, class_target_name)
    eval_features, eval_labels = split_x_y(test, class_target_name)
    accuracy = get_eval_scores(fit_features, fit_labels, eval_features, eval_labels, args)
    return 1 - accuracy

def get_args():
    """Build the hyperopt search space for the decision-tree hyperparameters.

    Returns
    -------
    dict
        Maps each hyperparameter name read by `get_eval_scores` to a
        hyperopt expression. All but "random_state" are `hp.choice`
        nodes over a fixed list of candidates; "random_state" is drawn
        with `hp.randint` using an upper bound of 3000.
    """
    depth_options = range(1, 100)
    leaf_size_options = range(1, 15)
    feature_count_options = range(1, 50)
    criterion_options = ["gini", "entropy"]

    return {
        "max_depth": hp.choice("max_depth", depth_options),
        "min_samples_leaf": hp.choice("min_samples_leaf", leaf_size_options),
        "random_state": hp.randint("random_state", 3000),
        "max_features": hp.choice("max_features", feature_count_options),
        "criterion": hp.choice("criterion", criterion_options),
    }
    
def f(args):
    """Objective function for the hyperparameter search.

    Evaluates one sampled hyperparameter set `args` by calling
    `train_run` on the module-level `train` and `test` frames, and
    returns the result in the dict form hyperopt expects: the value to
    minimise under 'loss' plus a 'status' flag.
    """
    # NOTE(review): the loss is measured on the `test` frame, so the search
    # selects hyperparameters against that split; the reported best loss is
    # therefore not an unbiased estimate. Consider a separate validation split.
    loss = train_run(train, test, args)
    return {'loss': loss, 'status': STATUS_OK}

In [4]:
# Run the TPE search over the space from get_args(), minimising f.
trials = Trials()
args_ = get_args()
best = fmin(f, args_, algo=tpe.suggest, max_evals=300, trials=trials)

# fmin reports every hp.choice parameter as an index into its option list
# (e.g. 'criterion': 1, 'max_features': 0), not as the chosen value.
# space_eval maps the raw result back onto the search space so the printed
# hyperparameters can be passed to the classifier directly.
best_params = space_eval(args_, best)

print ("best:" + str(best))
print("best params:" + str(best_params))

100%|██████████| 300/300 [20:44<00:00,  4.15s/trial, best loss: 0.14338980251665812]
best:{'criterion': 1, 'max_depth': 5, 'max_features': 0, 'min_samples_leaf': 7, 'random_state': 373}
