In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.impute import KNNImputer
from sklearn.utils import shuffle
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor

# Render floats in a 20-character field with thousands separators and 15 decimals.
pd.set_option('display.float_format', '{:20,.15f}'.format)

In [3]:
from importlib import reload
# module references for reload
import process_class, output, run_models

from process_class import Process
from output import output_metrics
from run_models import run_nn, run_lgb

# Reload every project module imported above so that on-disk edits are picked
# up without restarting the kernel.  `output` used to be reloaded twice while
# `run_models` was skipped, which left `run_nn` / `run_lgb` stale.
reload(process_class)
reload(output)
reload(run_models)

# reimport in case changed
from process_class import Process
from output import output_metrics
from run_models import run_nn, run_lgb

In [4]:
def process_data(raw_data):
    """Run the six raw splits through the ``Process`` pipeline.

    Parameters
    ----------
    raw_data : mapping
        Must provide the keys ``'X_train'``, ``'X_test'``, ``'X_val'``,
        ``'y_train'``, ``'y_test'`` and ``'y_val'``.

    Returns
    -------
    tuple
        ``(data, process)`` where ``data`` is a dict with the same six keys
        holding the values returned by ``process.return_processed()`` and
        ``process`` is the object produced by the
        ``skew_X().skew_y().fill_nan()`` call chain.
    """
    split_names = ('X_train', 'X_test', 'X_val', 'y_train', 'y_test', 'y_val')

    # Distance-weighted KNN imputer (30 neighbours) handed over to Process.
    knn_imputer = KNNImputer(n_neighbors=30, weights='distance')

    process = Process(
        *(raw_data[name] for name in split_names),
        imputer='func',
        imputer_func=knn_imputer,
    )
    process = process.skew_X().skew_y().fill_nan()

    # Unpack into exactly six names so a result of the wrong size still raises.
    X_train, X_test, X_val, y_train, y_test, y_val = process.return_processed()
    data = {
        'X_train': X_train,
        'X_test': X_test,
        'X_val': X_val,
        'y_train': y_train,
        'y_test': y_test,
        'y_val': y_val,
    }
    return data, process

In [5]:
###########################################
# Run the raw "all" dataset through a range of variance thresholds
# and output model metrics for each one to find the best result
###########################################

In [6]:
# Load the pickled `raw_data` object.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('processed/dataset_all_no_process_data.pickle', 'rb') as pickle_file:
    raw_data = pickle.load(pickle_file)

In [7]:
# Stack the train / test / val entries back into one X object and one y object.
X_raw = pd.concat([raw_data[f'X_{split}'] for split in ('train', 'test', 'val')])
y_raw = pd.concat([raw_data[f'y_{split}'] for split in ('train', 'test', 'val')])

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Candidate variance cut-offs in ascending order.  The fifth entry used to be
# 0.75, which broke the otherwise ascending sequence (0.05 -> 0.75 -> 0.1) and
# looks like a typo for 0.075.
VARIANCE_THRESHOLDS = [0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5]

# threshold -> {'features': kept column positions, 'nn': ..., 'lgb': ...}
results = {}
for threshold in VARIANCE_THRESHOLDS:
    print('#########################################')
    print(threshold)

    # The selector is fitted on the concatenation of all three splits (X_raw).
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(X_raw)
    features = selector.get_support(indices=True)
    print(f'{len(features)}\tfeatures')

    # Shallow copy: only the three X entries are rebound below, so the frames
    # stored in `raw_data` itself stay unfiltered for the next iteration.
    d = raw_data.copy()
    for name in ['X_train', 'X_test', 'X_val']:
        d[name] = d[name][d[name].columns[features]]

    # NOTE: `data` and `process` are rebound on every iteration and remain
    # visible to later cells with the values of the last threshold.
    data, process = process_data(d)
    results[threshold] = {}
    results[threshold]['features'] = features
    results[threshold]['nn'] = run_nn(data, process)
    results[threshold]['lgb'] = run_lgb(data, process)

In [None]:
# Print the stored nn / lgb entries for every threshold that was run.
for th, scores in results.items():
    print(th)
    print(f'nn: {scores["nn"]}')
    print(f'lgb: {scores["lgb"]}')

In [None]:
# `sel` is not defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel.  Use `selector`, the VarianceThreshold fitted in
# the threshold sweep above (after that loop it is the one for the last
# threshold tried).
# TODO(review): confirm that this is the selector the mask was meant to come from.
ind = selector.get_support()
not_ind = [not val for val in ind]

In [None]:
# Print one 0/1 flag per input column of the fitted selector's support mask.
# The original read the mask from `sel`, a name that is never defined in this
# notebook (NameError on a fresh kernel); `selector` is the fitted object.
for kept in selector.get_support():
    print(int(kept))

In [None]:
# Load the pickled `data` and `process` objects; both names are rebound here.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('processed/dataset_all_data.pickle', 'rb') as data_file:
    data = pickle.load(data_file)

with open('processed/dataset_all_process.pickle', 'rb') as process_file:
    process = pickle.load(process_file)

In [None]:
# Stack the three splits: X via pandas concat, y via numpy concatenate.
X = pd.concat([data[f'X_{split}'] for split in ('train', 'test', 'val')])
y = np.concatenate([data[f'y_{split}'] for split in ('train', 'test', 'val')])

In [None]:
####################################
# Select from model
####################################

from sklearn.feature_selection import SelectFromModel

# Random forest whose feature importances drive the selection.
# NOTE: scikit-learn 1.0 renamed criterion 'mae' to 'absolute_error' and later
# releases removed the old spelling; adjust it when upgrading.
model = RandomForestRegressor(
    # ensemble size and per-tree sampling
    n_estimators=40,
    bootstrap=True,
    max_samples=0.95,
    # tree shape
    max_depth=15,
    min_samples_split=0.001,
    min_samples_leaf=0.0005,
    criterion='mae',
    # execution
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

# Keep the features whose importance is at least 0.1 times the mean importance.
selector = SelectFromModel(
    estimator=model,
    prefit=False,
    threshold="0.1*mean",
)
selector.fit(X, y)

In [None]:
# Numeric cut-off that the fitted selector resolved from its threshold setting.
selector.threshold_

In [None]:
# Support mask of the fitted selector, plus its element-wise negation as a list.
ind = selector.get_support()
not_ind = [False if kept else True for kept in ind]

In [None]:
# Print the support mask as one 0/1 flag per line.
for kept in selector.get_support():
    print(int(kept))

In [None]:
####################################
# f_regression
####################################

from sklearn.feature_selection import f_regression, mutual_info_regression

# F-statistic of every column of X against y (the p-values are discarded),
# scaled in place so that the largest score equals 1.
f_test, _ = f_regression(X, y)
f_test /= np.max(f_test)

# Mutual information of every column with y, scaled the same way.  The
# estimator is randomized, so it is seeded here to make the scores (and the
# threshold counts derived from them below) identical on every run.
mi = mutual_info_regression(X, y, random_state=0)
mi /= np.max(mi)

In [None]:
# Shape of the scaled F-score array.
f_test.shape

In [None]:
# Print 1 for every scaled F-score that reaches 0.2, otherwise 0.
for flag in (f_test >= 0.2).astype(int).tolist():
    print(flag)

In [None]:
# Number of scaled F-scores that reach 0.2.
int(np.count_nonzero(f_test >= 0.2))

In [None]:
# Print 1 for every scaled mutual-information score that reaches 0.2, otherwise 0.
for flag in (mi >= 0.2).astype(int).tolist():
    print(flag)

In [None]:
# Number of scaled mutual-information scores that reach 0.05.
# NOTE(review): the cell that prints the 0/1 flags for `mi` uses 0.2 as the
# cut-off while this count uses 0.05 - confirm which one is intended.
int(np.count_nonzero(mi >= 0.05))

In [None]:
# One record per column of X: its scaled F-score, its scaled MI score and its name.
koefs = [
    {'f': f_test[position], 'mi': mi[position], 'col': column}
    for position, column in enumerate(X.columns)
]

In [None]:
# Two independent copies of `koefs`, each ordered from highest to lowest score.
f_sorted = list(koefs)
f_sorted.sort(key=lambda record: record['f'], reverse=True)

mi_sorted = list(koefs)
mi_sorted.sort(key=lambda record: record['mi'], reverse=True)

In [None]:
# Records ordered by descending 'f' score.
f_sorted

In [None]:
# Records ordered by descending 'mi' score.
mi_sorted

In [None]:
####################################################################################
# RFECV feature selection with LR
####################################################################################

In [None]:
from sklearn.linear_model import LinearRegression

# RFECV wrapped around a default LinearRegression, using all cores and verbose logging.
selector = RFECV(
    estimator=LinearRegression(),
    verbose=10,
    n_jobs=-1,
)

In [None]:
# Fit the selector on X / y, i.e. on the concatenation of all three splits.
selector = selector.fit(X, y)

In [None]:
# Print the support mask of the fitted selector as one 0/1 flag per line.
for flag in [int(kept) for kept in selector.get_support()]:
    print(flag)

In [None]:
####################################################################################
# RFECV feature selection with AdaBoostRegressor
####################################################################################

In [None]:
# Parameter distributions for the randomized hyper-parameter search.
# NOTE(review): bootstrap / max_depth / min_samples_leaf / min_samples_split
# are RandomForestRegressor parameters; AdaBoostRegressor does not accept them.
from scipy.stats import randint as sp_randint

random_grid = dict(
    bootstrap=[True, False],
    max_depth=sp_randint(5, 300),
    min_samples_leaf=sp_randint(1, 20),
    min_samples_split=sp_randint(10, 100),
    n_estimators=[100, 500, 1000],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import AdaBoostRegressor

# This cell could not run as written: AdaBoostRegressor does not accept
# `criterion`, `n_jobs` or `verbose` (TypeError), and the search was handed an
# undefined name `rf` (NameError).  Those keyword arguments and every key of
# `random_grid` are RandomForestRegressor parameters, so the estimator being
# tuned is built explicitly as a random forest.
# TODO(review): if an AdaBoost search was really intended, it needs its own
# estimator and its own parameter distributions.
rf = RandomForestRegressor(
    criterion='mae',
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

# 100 random draws from `random_grid`, 3-fold CV, scored by negative MAE; the
# best configuration is refitted on the whole training split.
abr_random = RandomizedSearchCV(
    scoring='neg_mean_absolute_error',
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    n_jobs=-1,
    cv=3,
    refit=True,
    verbose=True,
    random_state=42,
)
abr_random.fit(
    data['X_train'],
    data['y_train'],
)

In [None]:
#########################################
# RFECV feature selection with tuned RF #
#########################################

In [None]:
# Random forest used as the estimator inside RFECV.
# NOTE: scikit-learn 1.0 renamed criterion 'mae' to 'absolute_error' and later
# releases removed the old spelling; adjust it when upgrading.
model = RandomForestRegressor(
    # ensemble size and per-tree sampling
    n_estimators=40,
    bootstrap=True,
    max_samples=0.95,
    # tree shape
    max_depth=15,
    min_samples_split=0.001,
    min_samples_leaf=0.0005,
    criterion='mae',
    # execution
    random_state=0,
    n_jobs=-1,
    verbose=1,
)
selector = RFECV(
    model,
    verbose=10,
    n_jobs=-1,
)

In [None]:
# Fit the RFECV selector on the training split only.
selector = selector.fit(data['X_train'], data['y_train'])

In [None]:
import joblib
import pickle

# Persist the fitted selector twice: once with joblib and once with pickle.
joblib.dump(selector, 'rfecv_selector_joblib_2.sav')

# Open the target inside a context manager so the file is flushed and closed
# even if pickling fails; the original passed a bare open() handle that was
# never closed.
with open('rfecv_selector_pickle_2.sav', 'wb') as handle:
    pickle.dump(selector, handle)

In [None]:
# Number of features the fitted RFECV selector ended up selecting.
selector.n_features_

In [None]:
# Hard-coded boolean mask, printed as one 0/1 flag per line.
# NOTE(review): presumably pasted from an earlier `get_support()` output -
# confirm which run it belongs to.
support_snapshot = [
     True,  True,  True,  True, False, False, False, False, False,
     True,  True, False,  True, False, False, False,  True, False,
    False, False,  True, False, False, False, False, False, False,
    False, False, False, False, False, False,  True,  True,  True,
     True,  True, False,  True,  True,  True, False, False,  True,
     True,  True,  True,  True,  True, False,  True,  True,  True,
     True, False,  True, False,  True, False, False, False, False,
    False, False, False,  True, False, False, False,  True, False,
    False, False, False, False, False, False, False, False, False,
    False, False, False, False, False,  True, False, False, False,
    False,  True,  True,  True, False, False, False,  True,  True,
     True,  True, False,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
     True,  True,  True,  True,  True,  True,  True,  True,  True,
]
for a in map(int, support_snapshot):
    print(a)


In [None]:
# Support mask of the fitted selector, plus its element-wise negation as a list.
ind = selector.get_support()
not_ind = [False if kept else True for kept in ind]

In [None]:
# Display the support mask.
ind

In [None]:
# Names of the training columns picked out by the mask `ind`.
list(data['X_train'].columns[ind])

In [None]:
# Re-apply the wide float format, then print summary statistics of the
# training features after casting them to float.
pd.set_option('display.float_format', '{:20,.15f}'.format)
print(
    data['X_train']
    .astype(float)
    .describe()
)

In [None]:
# Pair every training column with the entry of `selector.ranking_` at the same position.
rank = [
    (column, selector.ranking_[position])
    for position, column in enumerate(data['X_train'].columns)
]
# rank = [(list(data['X_train'].columns)[i], list(selector.estimator_.feature_importances_)[i]) for i in range(len(data['X_train'].columns))]

In [None]:
# (column, rank) pairs ordered by ascending rank value.
sorted(rank, key=lambda tup: tup[1])