# IMPORTS

In [1]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
import random
import gc
import time
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgb 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# UTILS

In [6]:
def reduce_memory_usage(df, verbose=True):
    # NOTE: Original author of this function is unknown
    # if you know the *original author*, please let me know.
    numerics = ["int8", "int16", "int32",
                "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [2]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(95)

In [3]:
from sklearn.feature_selection import chi2, f_classif, f_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile, VarianceThreshold

# From https://github.com/abhishekkrthakur/approachingalmost
class UnivariateFeatureSelction:
    def __init__(self, n_features, problem_type, scoring, return_cols=True):
        """
        Custom univariate feature selection wrapper on
        different univariate feature selection models from
        scikit-learn.
        :param n_features: SelectPercentile if float else SelectKBest
        :param problem_type: classification or regression
        :param scoring: scoring function, string
        """
        self.n_features = n_features
        
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif
            }
        else:
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression
            }
        if scoring not in valid_scoring:
            raise Exception("Invalid scoring function")
            
        if isinstance(n_features, int):
            self.selection = SelectKBest(
                valid_scoring[scoring],
                k=n_features
            )
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                percentile=int(n_features * 100)
            )
        else:
            raise Exception("Invalid type of feature")
    
    def fit(self, X, y):
        return self.selection.fit(X, y)
    
    def transform(self, X):
        return self.selection.transform(X)
    
    def fit_transform(self, X, y):
        return self.selection.fit_transform(X, y)
    
    def return_cols(self, X):
        if isinstance(self.n_features, int):
            mask = SelectKBest.get_support(self.selection)
            selected_features = []
            features = list(X.columns)
            for bool, feature in zip(mask, features):
                if bool:
                    selected_features.append(feature)
                    
        elif isinstance(self.n_features, float):
            mask = SelectPercentile.get_support(self.selection)
            selected_features = []
            features = list(X.columns)
            for bool, feature in zip(mask, features):
                if bool:
                    selected_features.append(feature)
        else:
            raise Exception("Invalid type of feature")
        
        return selected_features

# CONFIG

In [4]:
config = {
    "DATA_PATH" : "D:/Documents/GitHub/ml-pipeline/data/TPS-AUG2021/train.csv",
    "TARGET_VAR" : "loss"
}

# DATA & FEATURE ENGINEERING

In [5]:
df = pd.read_csv(config["DATA_PATH"])
target = df[config["TARGET_VAR"]]
df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [7]:
df = reduce_memory_usage(df)

Mem. usage decreased to 50.78 Mb (73.9% reduction)


In [8]:
from sklearn import preprocessing

def standardize(dataframe):
    scaler = preprocessing.StandardScaler()
    features = dataframe.columns[1:101]
    dataframe[features] = scaler.fit_transform(dataframe[features])
    return dataframe

def feature_engineering(dataframe, train=False):
    dataframe = standardize(dataframe)
    features = dataframe.columns[1:101]
    return dataframe, features

In [9]:
df, features = feature_engineering(df)
df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-1.668043,0.179768,0.498751,-1.767612,-0.585083,1.54894,-0.731716,1.096422,-0.16317,...,-0.576275,0.288248,-0.637099,1.049029,-0.066265,-0.660381,1.707198,1.113989,0.43266,15
1,1,0.888178,2.208234,-0.431875,-0.732342,3.908334,-0.864061,-0.021866,0.556285,-0.129647,...,-0.592941,-0.988376,-0.360471,-0.977104,-0.39051,-1.790598,-0.433717,1.219759,-1.22274,3
2,2,-0.627971,-0.763705,-0.40824,-0.545932,-0.364294,2.102522,-0.727626,-1.094732,0.178108,...,-0.570408,0.985879,1.358963,0.439278,-0.101566,-0.224925,0.916342,0.186238,0.753606,6
3,3,-0.976082,-0.810879,-0.547095,1.131143,-0.611684,-0.753131,0.113133,1.239258,-0.643564,...,-0.570146,-1.478323,-0.595876,0.744984,-1.27233,0.234602,0.351752,-0.186978,0.35104,2
4,4,-0.232281,-0.740119,0.65127,-0.17989,-0.579809,0.74853,-0.503565,-0.749195,0.077553,...,-0.537083,-0.469888,-0.327926,-0.946368,-0.017774,-0.248995,1.278369,0.322227,-0.173265,1


# TRAINING A BASELINE WITH ALL FEATURES

In [10]:
metric = mean_squared_error
train_x, valid_x, train_y, valid_y = train_test_split(df[features], target, test_size=0.2, random_state=95)

In [11]:
XGB_REGRESSION = {
    "objective": "reg:squarederror",
    "n_estimators" : 10000,
    "max_depth": 10,
    "learning_rate": 0.006,
    "colsample_bytree": 0.5,
    "subsample": 0.6,
    "reg_alpha" : 0.006221417528979453,
    "reg_lambda": 3.178956727410822e-07,
    "min_child_weight": 123,
    "n_jobs": 2,
    "seed": 95,
    'tree_method': "gpu_hist",
    "gpu_id": 0,
    'predictor': 'gpu_predictor'
}

model = xgb.XGBRegressor(**XGB_REGRESSION)

In [13]:
from sklearn.linear_model import LinearRegression

LIN_REGRESSION_PARAM = {}
model_lin = LinearRegression()

In [16]:
training_parameters = {
    "eval_set": [(valid_x, valid_y)],
    "early_stopping_rounds": 200,
    "verbose": 100,
}
model_lin.fit(train_x, train_y, **training_parameters)
preds = model.predict(valid_x)
score = metric(valid_y, preds, squared=False)
print(f"VALIDATION SCORE : {score}")

TypeError: fit() got an unexpected keyword argument 'eval_set'

In [12]:
training_parameters = {
    "eval_set" : [(valid_x, valid_y)],
    "early_stopping_rounds" : 200,
    "verbose" : 100,
}
model.fit(train_x, train_y, **training_parameters)
preds = model.predict(valid_x)
score = metric(valid_y, preds, squared=False)
print(f"VALIDATION SCORE : {score}")

[0]	validation_0-rmse:10.05850
[100]	validation_0-rmse:8.57067
[200]	validation_0-rmse:8.06694
[300]	validation_0-rmse:7.90497
[400]	validation_0-rmse:7.85175
[500]	validation_0-rmse:7.83231
[600]	validation_0-rmse:7.82348
[700]	validation_0-rmse:7.81768
[800]	validation_0-rmse:7.81383
[900]	validation_0-rmse:7.81009
[1000]	validation_0-rmse:7.80735
[1100]	validation_0-rmse:7.80505
[1200]	validation_0-rmse:7.80292
[1300]	validation_0-rmse:7.80109


KeyboardInterrupt: 

# UNIVARIATE FEATURE SELECTION

In [52]:
ufs = UnivariateFeatureSelction(
        n_features=0.8,
        problem_type="regression",
        scoring="f_regression"
)
columns = df[features].columns.values.tolist()
ufs.fit(df[columns], df[config["TARGET_VAR"]].values.ravel())
selected_features = ufs.return_cols(df[columns])

In [53]:
features

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20',
       'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40',
       'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50',
       'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60',
       'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70',
       'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80',
       'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90',
       'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99'],
      dtype='object')

In [54]:
selected_features

['f0',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f21',
 'f22',
 'f23',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30',
 'f31',
 'f32',
 'f33',
 'f34',
 'f35',
 'f36',
 'f37',
 'f38',
 'f39',
 'f40',
 'f41',
 'f42',
 'f43',
 'f44',
 'f45',
 'f46',
 'f47',
 'f48',
 'f49',
 'f50',
 'f51',
 'f52',
 'f53',
 'f54',
 'f55',
 'f56',
 'f57',
 'f58',
 'f59',
 'f60',
 'f61',
 'f62',
 'f63',
 'f64',
 'f65',
 'f66',
 'f68',
 'f69',
 'f70',
 'f72',
 'f73',
 'f74',
 'f75',
 'f76',
 'f77',
 'f78',
 'f79',
 'f81',
 'f82',
 'f84',
 'f85',
 'f88',
 'f90',
 'f91',
 'f92',
 'f93',
 'f94',
 'f95',
 'f96',
 'f97',
 'f98',
 'f99']

# TRAIN A MODEL WITH SELECTED FEATURES

In [42]:
train_x_selected, valid_x_selected, train_y_selected, valid_y_selected = train_test_split(df[selected_features], target, test_size=0.2, random_state=95)

In [43]:
model_selected = xgb.XGBRegressor(**XGB_REGRESSION)

In [44]:
model_selected.fit(train_x_selected, train_y_selected, eval_set=[(valid_x_selected , valid_y_selected)], early_stopping_rounds=200, verbose=False)
preds_selected = model_selected.predict(valid_x_selected)
score_selected = metric(valid_y_selected, preds_selected, squared=False)
print(f"VALIDATION SCORE : {score_selected}")

VALIDATION SCORE : 7.795086006255788


# FEATURE SELECTION USING RECURSIVE FEATURE ELIMINATION (RFE)

In [55]:
from sklearn.feature_selection import RFE
rfe = RFE(model_selected, n_features_to_select = 0.8)
rfe.fit(train_x, train_y)
preds_selected_rfe = rfe.predict(valid_x)
score_selected_rfe = metric(valid_y, preds_selected_rfe, squared=False)
print(f"VALIDATION SCORE : {score_selected_rfe}")

In [None]:
from operator import itemgetter
features = X_train.columns.to_list()
for x, y in (sorted(zip(rfe.ranking_ , features), key=itemgetter(0))):
    print(x, y)

# FEATURE SELECTION USING THE BORUTA SHAP METHOD

In [9]:
from BorutaShap import BorutaShap
selector = BorutaShap(model=model_selected, importance_measure="shap", classification=False)
selector.fit(train_x, train_y, n_trials=50, random_state=95)
selector.plot(which_features='all', figsize=(16,12))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\chopp\anaconda3\envs\ML-37\lib\site-packages\IPython\core\interactiveshell.py", line 3427, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-e78fa15da944>", line 2, in <module>
    selector = BorutaShap(model=model_selected, importance_measure="shap", classification=False)
NameError: name 'model_selected' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\chopp\anaconda3\envs\ML-37\lib\site-packages\IPython\core\interactiveshell.py", line 2054, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\chopp\anaconda3\envs\ML-37\lib\site-packages\IPython\core\ultratb.py", line 1101, in get_records
    return _fixed_getinner