In [None]:
import numpy as np
import pandas as pd
import pickle
import time

from joblib import dump, load
from sklearn.impute import KNNImputer
from sklearn.utils import shuffle
from sklearn.feature_selection import VarianceThreshold, f_regression, mutual_info_regression, SelectFromModel, RFECV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression

# Render floats in pandas output with thousands separators and 15 decimal
# places, right-aligned in a 20-character-wide field.
pd.options.display.float_format = '{:20,.15f}'.format

In [None]:
from importlib import reload

# Import the module objects themselves so they can be handed to reload().
import process_class, output, run_models

# Re-execute every local module so on-disk edits are picked up without
# restarting the kernel. Each of the three modules is reloaded exactly once.
reload(process_class)
reload(output)
reload(run_models)

# Bind the public names only after reloading, so they refer to the objects
# defined by the freshly executed module code.
from process_class import Process
from output import output_metrics
from run_models import run_nn, run_lgb

In [None]:
def process_data(raw_data):
    """Run one raw train/test/val split through the ``Process`` pipeline.

    Parameters
    ----------
    raw_data : mapping
        Must provide the keys ``'X_train'``, ``'X_test'``, ``'X_val'``,
        ``'y_train'``, ``'y_test'`` and ``'y_val'``.

    Returns
    -------
    tuple
        ``(data, process)``. ``data`` is a dict with the same six keys,
        filled from ``process.return_processed()``. ``process`` is the object
        produced by the chained ``skew_X().skew_y().fill_nan()`` calls.
    """
    # Missing values are imputed with a distance-weighted 30-nearest-neighbour imputer.
    knn_imputer = KNNImputer(n_neighbors=30, weights='distance')

    pipeline = Process(
        raw_data['X_train'],
        raw_data['X_test'],
        raw_data['X_val'],
        raw_data['y_train'],
        raw_data['y_test'],
        raw_data['y_val'],
        imputer='func',
        imputer_func=knn_imputer,
    )
    process = pipeline.skew_X().skew_y().fill_nan()

    # Unpack explicitly so a result that is not exactly six items still raises.
    X_train, X_test, X_val, y_train, y_test, y_val = process.return_processed()
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'X_val': X_val,
        'y_train': y_train,
        'y_test': y_test,
        'y_val': y_val,
    }
    return data, process

In [None]:
# NOTE: pickle.load can execute arbitrary code. The files below are assumed to
# be artefacts written by this project; never point these paths at untrusted data.
def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Unprocessed splits, plus their concatenation over train/test/val.
raw_data = _read_pickle('processed/dataset_all_no_process_data.pickle')
X_raw = pd.concat([raw_data[split] for split in ('X_train', 'X_test', 'X_val')])
y_raw = pd.concat([raw_data[split] for split in ('y_train', 'y_test', 'y_val')])

# Previously processed splits and the object pickled alongside them.
data = _read_pickle('processed/dataset_all_data.pickle')
process = _read_pickle('processed/dataset_all_process.pickle')

# Concatenated processed features / targets used by the selectors below.
X = pd.concat([data[split] for split in ('X_train', 'X_test', 'X_val')])
y = np.concatenate([data[split] for split in ('y_train', 'y_test', 'y_val')])

In [None]:
def get_mins_from_result(results):
    """Print, per model and metric, the result key with the best test score.

    ``results`` maps a key (a threshold in this notebook) to a dict whose
    ``'nn'`` and ``'lgb'`` entries are sequences; element 0 of each holds a
    ``"test"`` dict of metric values. For ``smape``, ``mape``, ``mae`` and
    ``rmse`` the smallest value wins; for ``adj_r2`` the largest. Each printed
    line has the form ``<model> <metric> (<key>, <value>)``.
    """
    # (metric names, reducer) pairs, listed in print order.
    metric_groups = (
        (('smape', 'mape', 'mae', 'rmse'), min),
        (('adj_r2',), max),
    )
    for model_name in ('nn', 'lgb'):
        for metric_names, pick_best in metric_groups:
            for metric_name in metric_names:
                scored = [
                    (key, results[key][model_name][0]["test"][metric_name])
                    for key in results
                ]
                best = pick_best(scored, key=lambda pair: pair[1])
                print(model_name, metric_name, best)

In [None]:
###########################################
# Run the full raw dataset through a variance threshold
# and output metrics to find the best threshold
###########################################

In [None]:
# Sweep VarianceThreshold cut-offs: for each one keep only the raw columns
# that pass, re-run preprocessing, and train/evaluate both models.
results = {}
# NOTE(review): the grid previously had 0.75 between 0.05 and 0.1, breaking an
# otherwise ascending sequence; 0.075 is assumed to be the intended value.
for threshold in [0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5]:
    print('#########################################')
    print(threshold)
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(X_raw)
    # Integer positions of the columns whose variance exceeds the threshold.
    features = selector.get_support(indices=True)
    print(f'{len(features)}\tfeatures')
    # Shallow copy: only the three X_* entries are rebound below.
    d = raw_data.copy()
    for name in ['X_train', 'X_test', 'X_val']:
        d[name] = d[name][d[name].columns[features]]
    # Use run-local names so the module-level `data` / `process` loaded from
    # disk are not overwritten; later cells still read `data['X_train']`.
    data_run, process_run = process_data(d)
    results[threshold] = {}
    results[threshold]['features'] = features
    results[threshold]['nn'] = run_nn(data_run, process_run)
    results[threshold]['lgb'] = run_lgb(data_run, process_run)

In [None]:
# Print, for each model and metric, the variance threshold with the best test score.
get_mins_from_result(results)

In [None]:
# Refit the variance filter at the 0.01 cut-off on the raw features and list
# the names of the columns that survive it.
selector = VarianceThreshold(threshold=0.01).fit(X_raw)
features_VT = [X_raw.columns[pos] for pos in selector.get_support(indices=True)]
print(len(features_VT), features_VT, sep='\n')

In [None]:
####################################
# f_regression
####################################
results = {}

# Univariate F-statistic of every column of X against y, rescaled so that the
# highest-scoring column has score 1.0.
f_test, _ = f_regression(X, y)
f_test /= np.max(f_test)

# Feature counts already evaluated; a threshold is only run when it changes
# the number of selected features.
num_features_list = []
threshold = 0.000001
while threshold < 0.2:
    # f_test is positionally aligned with X's columns, so the names are taken
    # from X itself rather than from another frame that may have been rebound.
    features = [col for col, score in zip(X.columns, f_test) if score >= threshold]
    num_features = len(features)
    if num_features not in num_features_list:
        num_features_list.append(num_features)
        print('#########################################')
        print(f'{threshold}\tthreshold')
        print(f'{num_features}\tnum features')

        # Shallow copy: only the three X_* entries are rebound below.
        d = raw_data.copy()
        for name in ['X_train', 'X_test', 'X_val']:
            d[name] = d[name][features]
        data_run, process_run = process_data(d)
        results[threshold] = {}
        results[threshold]['features'] = features
        results[threshold]['nn'] = run_nn(data_run, process_run)
        results[threshold]['lgb'] = run_lgb(data_run, process_run)
    # Fine steps below 0.01, coarse steps above.
    if threshold < 0.01:
        threshold += 0.001
    else:
        threshold += 0.01

In [None]:
# Print, for each model and metric, the F-score threshold with the best test score.
get_mins_from_result(results)

In [None]:
# Final F-test feature set: columns whose normalised F-score is at least 0.003.
# f_test was computed on X, so the names are read from X.columns to keep the
# positions aligned with the scores.
features_FTEST = [col for col, score in zip(X.columns, f_test) if score >= 0.003]
print(len(features_FTEST))
print(features_FTEST)

In [None]:
####################################
# mutual info regression
####################################
results_minfo = {}

# Estimated mutual information between every column of X and y, rescaled so
# that the highest-scoring column has score 1.0. The estimator is randomised,
# so it is seeded (0, like the other estimators in this notebook) to make the
# scores and the selected feature sets reproducible between runs.
mi = mutual_info_regression(X, y, random_state=0)
mi /= np.max(mi)

# Feature counts already evaluated; a threshold is only run when it changes
# the number of selected features.
num_features_list = []
threshold = 0.0
while threshold < 0.4:
    # mi is positionally aligned with X's columns, so the names are taken from
    # X itself rather than from another frame that may have been rebound.
    features = [col for col, score in zip(X.columns, mi) if score >= threshold]
    num_features = len(features)
    if num_features not in num_features_list:
        num_features_list.append(num_features)
        print('#########################################')
        print(f'{threshold}\tthreshold')
        print(f'{num_features}\tnum features')

        # Shallow copy: only the three X_* entries are rebound below.
        d = raw_data.copy()
        for name in ['X_train', 'X_test', 'X_val']:
            d[name] = d[name][features]
        data_run, process_run = process_data(d)
        results_minfo[threshold] = {}
        results_minfo[threshold]['features'] = features
        results_minfo[threshold]['nn'] = run_nn(data_run, process_run)
        results_minfo[threshold]['lgb'] = run_lgb(data_run, process_run)
    # Fine steps below 0.01, coarse steps above.
    if threshold < 0.01:
        threshold += 0.001
    else:
        threshold += 0.01

In [None]:
# Print, for each model and metric, the mutual-information threshold with the best test score.
get_mins_from_result(results_minfo)

In [None]:
# Final mutual-information feature set: columns whose normalised score is at
# least 0.006. mi was computed on X, so the names are read from X.columns to
# keep the positions aligned with the scores.
features_MIR = [col for col, score in zip(X.columns, mi) if score >= 0.006]
print(len(features_MIR))
print(features_MIR)

In [None]:
####################################
# Select from model
####################################

def get_RFR_model():
    """Return a new, unfitted RandomForestRegressor with this notebook's settings.

    NOTE(review): ``criterion='mae'`` was renamed ``'absolute_error'`` in newer
    scikit-learn releases; confirm against the version this notebook pins.
    """
    return RandomForestRegressor(
        n_estimators=40,
        max_depth=15,
        min_samples_split=0.001,
        min_samples_leaf=0.0005,
        bootstrap=True,
        max_samples=0.95,
        criterion='mae', 
        random_state=0, 
        n_jobs=-1,
    )

results_select = {}
# Feature counts already evaluated; a threshold is only run when it changes
# the number of selected features.
num_features_list = []

# The forest is seeded and always trained on the same (X, y), so its feature
# importances do not depend on the selection threshold. Fit it once and reuse
# it for every threshold instead of refitting inside the loop.
model = get_RFR_model()
model.fit(X, y)

threshold = 0.0
while threshold < 1.0:
    start_time = time.time()
    # prefit=True: the selector reads importances from the already-fitted model.
    # The threshold is expressed as a multiple of the mean importance.
    selector = SelectFromModel(estimator=model, prefit=True, threshold=f'{threshold}*mean')
    features = [X.columns[i] for i in selector.get_support(indices=True)]
    num_features = len(features)
    if num_features not in num_features_list:
        num_features_list.append(num_features)
        print('#########################################')
        print(f'{threshold}\tthreshold')
        print(f'{num_features}\tnum features')
        # Shallow copy: only the three X_* entries are rebound below.
        d = raw_data.copy()
        for name in ['X_train', 'X_test', 'X_val']:
            d[name] = d[name][features]
        data_run, process_run = process_data(d)
        results_select[threshold] = {}
        results_select[threshold]['selector'] = selector
        results_select[threshold]['features'] = features
        results_select[threshold]['nn'] = run_nn(data_run, process_run)
        results_select[threshold]['lgb'] = run_lgb(data_run, process_run)
    # Fine steps below 0.1, coarse steps above.
    if threshold < 0.1:
        threshold += 0.01
    else:
        threshold += 0.1
    print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Print, for each model and metric, the SelectFromModel threshold with the best test score.
get_mins_from_result(results_select)

In [None]:
####################################################################################
# RFECV feature selection with LR
####################################################################################

# Cross-validated recursive feature elimination with LinearRegression as the
# estimator, fitted on the concatenated processed data, using all CPU cores.
selector = RFECV(LinearRegression(), n_jobs=-1).fit(X, y)

In [None]:
# Names of the columns kept by `selector`.
# NOTE(review): this relies on `selector` still being the LinearRegression
# RFECV fitted in the preceding cell; confirm execution order.
features_RFECV_LR = [X.columns[pos] for pos in selector.get_support(indices=True)]
print(len(features_RFECV_LR), features_RFECV_LR, sep='\n')

# Restrict the raw feature splits to those columns, preprocess them again and
# evaluate both models on the result.
d = raw_data.copy()
d.update({name: d[name][features_RFECV_LR] for name in ['X_train', 'X_test', 'X_val']})
data_run, process_run = process_data(d)
run_nn(data_run, process_run)
run_lgb(data_run, process_run)

In [None]:
####################################################################################
# RFECV feature selection with AdaBoostRegressor
####################################################################################

# AdaBoost ensemble built on top of the notebook's random-forest configuration.
model = get_RFR_model()
abr = AdaBoostRegressor(
    base_estimator=model,
    loss='exponential',
    random_state=0
)

selector_adaboost = RFECV(abr, n_jobs=-1)
# Fit the AdaBoost-based selector itself, not the `selector` left over from
# the LinearRegression cell, which would leave this one unfitted.
selector_adaboost = selector_adaboost.fit(X, y)

In [None]:
# Names of the columns kept by the AdaBoost-based RFECV selector.
features_RFECV_adaboost = [
    X.columns[pos] for pos in selector_adaboost.get_support(indices=True)
]
print(len(features_RFECV_adaboost), features_RFECV_adaboost, sep='\n')

# Restrict the raw feature splits to those columns, preprocess them again and
# evaluate both models on the result.
d = raw_data.copy()
d.update({name: d[name][features_RFECV_adaboost] for name in ['X_train', 'X_test', 'X_val']})
data_run, process_run = process_data(d)
run_nn(data_run, process_run)
run_lgb(data_run, process_run)

In [None]:
#########################################
# RFECV feature selection with tuned RF #
#########################################

In [None]:
# Reuse the shared random-forest configuration instead of repeating every
# hyper-parameter here, so the two definitions cannot drift apart. The only
# difference for this run is progress logging (verbose=1).
model = get_RFR_model().set_params(verbose=1)
# Cross-validated recursive feature elimination driven by that forest.
selector = RFECV(model, verbose=10, n_jobs=-1)

In [None]:
# Fit the RFECV selector on the processed training split only (the earlier
# selectors in this notebook were fitted on the concatenated X / y).
selector = selector.fit(data['X_train'], data['y_train'])

In [None]:
# Persist the fitted selector in both joblib and pickle formats.
# Only `dump` / `load` are imported from joblib in this notebook, so `dump` is
# called directly; `joblib.dump` would raise NameError.
dump(selector, 'rfecv_selector_joblib_2.sav')
# Use a context manager so the file is flushed and closed deterministically.
with open('rfecv_selector_pickle_2.sav', 'wb') as handle:
    pickle.dump(selector, handle)

In [None]:
# RFECV RFR
# NOTE: pickle.load can execute arbitrary code; only load selector files that
# were written by this project.
with open('rfecv_selector_pickle_2.sav', 'rb') as handle:
    rfecv_rfr = pickle.load(handle)

# Names of the columns kept by the random-forest RFECV selector.
features_RFECV_RFR = [X.columns[pos] for pos in rfecv_rfr.get_support(indices=True)]
print(len(features_RFECV_RFR), features_RFECV_RFR, sep='\n')

# Disabled evaluation run, kept for reference.
# NOTE(review): as written it subsets by features_RFECV_LR; features_RFECV_RFR
# is presumably what this cell should use if it is re-enabled.
# d = raw_data.copy()
# for name in ['X_train', 'X_test', 'X_val']:
#     d[name] = d[name][features_RFECV_LR]
# data_run, process_run = process_data(d)
# run_nn(data_run, process_run)
# run_lgb(data_run, process_run)

In [None]:
# RFECV ETR
# Load a previously saved selector from its joblib file.
rfecv_etr = load('rfecv_selector_joblib_etr.sav')

# Names of the columns kept by that selector.
features_RFECV_ETR = [X.columns[pos] for pos in rfecv_etr.get_support(indices=True)]
print(len(features_RFECV_ETR), features_RFECV_ETR, sep='\n')

# Restrict the raw feature splits to those columns, preprocess them again and
# evaluate both models on the result.
d = raw_data.copy()
d.update({name: d[name][features_RFECV_ETR] for name in ['X_train', 'X_test', 'X_val']})
data_run, process_run = process_data(d)
run_nn(data_run, process_run)
run_lgb(data_run, process_run)

In [None]:
# Gather the feature list produced by every selection method and save them together.
feature_sets = [
    features_VT,
    features_FTEST,
    features_MIR,
    features_RFECV_LR,
    features_RFECV_adaboost,
    features_RFECV_RFR,
    features_RFECV_ETR,
]
dump(feature_sets, 'feature_sets.joblib')

# Columns of X that appear in every one of the feature sets, in X's column order.
present_always = [
    col
    for col in X.columns
    if all(col in feature_set for feature_set in feature_sets)
]