# IMPORTS

In [1]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
import random
import gc
import time
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgb 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# UTILS

In [2]:
def seed_everything(seed):
    """
    Seed the random number generators this notebook relies on so that
    results are reproducible.

    The seed is applied to Python's ``random`` module and to NumPy's global
    generator, and its string form is stored in the ``PYTHONHASHSEED``
    environment variable.

    Arguments:
        seed {int} -- Value of the seed
    """
    # NOTE(review): exporting PYTHONHASHSEED here presumably only influences
    # interpreters started after this point -- confirm if hash order matters.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(95)

In [3]:
from sklearn.feature_selection import chi2, f_classif, f_regression
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile, VarianceThreshold

# From https://github.com/abhishekkrthakur/approachingalmost
class UnivariateFeatureSelction:
    """
    Wrapper that picks a scikit-learn univariate selector (``SelectKBest`` or
    ``SelectPercentile``) from the type of ``n_features`` and a scoring
    function from ``problem_type`` / ``scoring``.
    """

    def __init__(self, n_features, problem_type, scoring, return_cols=True):
        """
        Custom univariate feature selection wrapper on
        different univariate feature selection models from
        scikit-learn.
        :param n_features: int -> ``SelectKBest(k=n_features)``;
            float -> ``SelectPercentile`` where the value is treated as a
            fraction and multiplied by 100 (e.g. 0.9 -> percentile 90)
        :param problem_type: "classification" selects the classification
            scorers; any other value selects the regression scorers
        :param scoring: scoring function, string; must be one of the names
            valid for the chosen problem type
        :param return_cols: accepted for backward compatibility but not used.
            It is deliberately not stored on the instance because an
            attribute of that name would hide the ``return_cols`` method.
        :raises ValueError: if ``scoring`` is not valid for the problem type,
            or if ``n_features`` is neither an int nor a float
        """
        self.n_features = n_features

        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif
            }
        else:
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression
            }
        if scoring not in valid_scoring:
            # ValueError is a subclass of Exception, so existing
            # `except Exception` handlers keep working.
            raise ValueError("Invalid scoring function")
        score_func = valid_scoring[scoring]

        if isinstance(n_features, int):
            self.selection = SelectKBest(score_func, k=n_features)
        elif isinstance(n_features, float):
            # Round instead of truncating: 0.29 * 100 evaluates to
            # 28.999999999999996, which int() alone would turn into 28.
            self.selection = SelectPercentile(
                score_func,
                percentile=int(round(n_features * 100))
            )
        else:
            raise ValueError("Invalid type of feature")

    def fit(self, X, y):
        """Fit the underlying selector on X, y and return what its ``fit`` returns."""
        return self.selection.fit(X, y)

    def transform(self, X):
        """Reduce X with the (already fitted) underlying selector."""
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit the underlying selector on X, y and return the reduced X."""
        return self.selection.fit_transform(X, y)

    def return_cols(self, X):
        """
        Return the names of the columns kept by the fitted selector.

        :param X: object exposing ``.columns`` (the notebook passes the
            DataFrame slice the selector was fitted on); column order must
            match the order used at fit time
        :return: list of the column labels whose support-mask entry is true
        """
        # Both selector types expose get_support(), so no per-type branch
        # is needed.
        support_mask = self.selection.get_support()
        return [
            column
            for column, is_selected in zip(X.columns, support_mask)
            if is_selected
        ]

# CONFIG

In [4]:
# Notebook-wide settings.
# DATA_PATH can be overridden without editing the notebook by setting the
# ML_PIPELINE_DATA_PATH environment variable before starting the kernel; the
# original machine-specific location is kept as the default so existing runs
# behave exactly as before.
config = {
    "DATA_PATH" : os.environ.get(
        "ML_PIPELINE_DATA_PATH",
        "D:/Documents/GitHub/ml-pipeline/data/TPS-MAR2021/train.csv",
    ),
    # Name of the label column in the training CSV.
    "TARGET_VAR" : "target"
}

# DATA & FEATURE ENGINEERING

In [5]:
# Read the training CSV whose location is given by config["DATA_PATH"].
df = pd.read_csv(config["DATA_PATH"])
# Keep the label column as its own Series; it is what the train/validation
# splits further down use as `y`.
target = df[config["TARGET_VAR"]]
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [6]:
def cat_encoding(dataframe, cat_columns=None):
    """
    Label-encode categorical columns and return the encoded frame.

    The input frame is left untouched: encoding is applied to a copy, so a
    frame displayed or reused by an earlier cell does not silently change.

    Arguments:
        dataframe {pd.DataFrame} -- Frame holding the columns to encode
        cat_columns {iterable of column labels} -- Columns to encode. When
            omitted, the columns at positions 1..19 are used, which in this
            notebook's data are ``cat0`` .. ``cat18``.

    Returns:
        pd.DataFrame -- Copy of ``dataframe`` in which every selected column
        is replaced by the output of a ``LabelEncoder`` fitted on that column
    """
    if cat_columns is None:
        # Original positional default, kept for backward compatibility.
        cat_columns = dataframe.columns[1:20]

    encoded = dataframe.copy()
    for feature in cat_columns:
        # A fresh encoder per column: each column gets its own label mapping.
        encoded[feature] = LabelEncoder().fit_transform(encoded[feature])
    return encoded

def feature_engineering(dataframe):
    """
    Run the feature-engineering steps and report which columns are features.

    Arguments:
        dataframe {pd.DataFrame} -- Frame to process

    Returns:
        tuple -- ``(frame, features)`` where ``frame`` is the result of
        ``cat_encoding(dataframe)`` and ``features`` is the Index of its
        columns at positions 1..30
    """
    encoded = cat_encoding(dataframe)
    # Positional slice: skips column 0 and stops before column 31.
    feature_columns = encoded.columns[1:31]
    return encoded, feature_columns

In [7]:
# Label-encode the categorical columns and collect the feature column names;
# `df` is rebound to the frame returned by feature_engineering.
df, features = feature_engineering(df)
# Preview the encoded frame.
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,0,8,0,1,1,33,0,44,54,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,0,8,0,0,4,33,8,48,3,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,0,10,0,0,4,33,0,30,38,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,0,10,0,2,4,33,0,50,3,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,0,8,6,1,4,33,2,32,54,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


# FEATURE SELECTION

In [8]:
# Plain list of the candidate feature names.
columns = list(features)

# A float n_features makes the wrapper use SelectPercentile; 0.9 maps to
# percentile 90, scored with f_regression.
ufs = UnivariateFeatureSelction(n_features=0.9, problem_type="regression", scoring="f_regression")

# Fit on the candidate columns against the flattened target values, then
# read back the names of the columns the selector kept.
ufs.fit(df[columns], df[config["TARGET_VAR"]].values.ravel())
selected_features = ufs.return_cols(df[columns])

In [10]:
# Candidate feature columns returned by feature_engineering.
features

Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16',
       'cat17', 'cat18', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10'],
      dtype='object')

In [9]:
# Columns retained by the univariate selector.
selected_features

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat9',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18',
 'cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10']

# METRIC

In [16]:
# Scoring function applied to the validation predictions of both models below.
metric = roc_auc_score

# TRAIN A MODEL WITH ALL FEATURES

In [18]:
# Hold out 20% of the rows for validation, using every candidate feature.
train_x, valid_x, train_y, valid_y = train_test_split(
    df[features],
    target,
    test_size=0.2,
    random_state=95,
)

In [17]:
# Keyword arguments shared by both LGBMClassifier instances in this notebook.
LGBM_parameters = dict(
    # Task and the metric reported on the evaluation set.
    objective='binary',
    metric='auc',
    # Upper bound on boosting rounds; the training cells also request
    # early stopping after 100 rounds without improvement.
    n_estimators=10000,
    random_state=95,
    cat_smooth=30,
    # Regularisation strengths.
    reg_alpha=0.0031293275223408185,
    reg_lambda=0.04787145507141445,
    # Column / row sampling.
    # NOTE(review): subsample presumably only takes effect together with a
    # positive subsample_freq -- confirm against the LightGBM docs.
    colsample_bytree=0.3,
    subsample=0.6,
    learning_rate=0.008,
    # Tree shape.
    max_depth=100,
    num_leaves=584,
    min_child_samples=173,
)

In [19]:
# Classifier configured from LGBM_parameters; fitted on the all-features split.
model = lgb.LGBMClassifier(**LGBM_parameters)

In [20]:
# Early stopping and periodic logging are supplied as callbacks: the
# `early_stopping_rounds=` / `verbose=` keyword arguments of `fit` were
# removed in LightGBM 4.0, so the previous call fails on current releases.
# (`lgb.log_evaluation` requires lightgbm >= 3.3.)
model.fit(
    train_x,
    train_y,
    eval_set=[(valid_x, valid_y)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        lgb.log_evaluation(period=100),           # print the eval metric every 100 rounds
    ],
)
# Second column of predict_proba, passed to the metric as the prediction score.
preds = model.predict_proba(valid_x)[:, 1]
score = metric(valid_y, preds)
print(f"VALIDATION SCORE : {score}")

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.885724
[200]	valid_0's auc: 0.887945
[300]	valid_0's auc: 0.889507
[400]	valid_0's auc: 0.890755
[500]	valid_0's auc: 0.891713
[600]	valid_0's auc: 0.8925
[700]	valid_0's auc: 0.893206
[800]	valid_0's auc: 0.893656
[900]	valid_0's auc: 0.894049
[1000]	valid_0's auc: 0.894348
[1100]	valid_0's auc: 0.894527
[1200]	valid_0's auc: 0.894675
[1300]	valid_0's auc: 0.894742
[1400]	valid_0's auc: 0.894802
[1500]	valid_0's auc: 0.894859
[1600]	valid_0's auc: 0.89481
Early stopping, best iteration is:
[1522]	valid_0's auc: 0.894873
VALIDATION SCORE : 0.8948733219676132


# TRAIN A MODEL WITH SELECTED FEATURES

In [21]:
# Same test_size and random_state as the all-features split, but restricted
# to the columns kept by the univariate selector.
train_x_selected, valid_x_selected, train_y_selected, valid_y_selected = train_test_split(
    df[selected_features],
    target,
    test_size=0.2,
    random_state=95,
)

In [22]:
# Second classifier with the same LGBM_parameters; fitted on the selected-features split.
model_selected = lgb.LGBMClassifier(**LGBM_parameters)

In [23]:
# Early stopping and periodic logging are supplied as callbacks: the
# `early_stopping_rounds=` / `verbose=` keyword arguments of `fit` were
# removed in LightGBM 4.0, so the previous call fails on current releases.
# (`lgb.log_evaluation` requires lightgbm >= 3.3.)
model_selected.fit(
    train_x_selected,
    train_y_selected,
    eval_set=[(valid_x_selected, valid_y_selected)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        lgb.log_evaluation(period=100),           # print the eval metric every 100 rounds
    ],
)
# Second column of predict_proba, passed to the metric as the prediction score.
preds_selected = model_selected.predict_proba(valid_x_selected)[:, 1]
score_selected = metric(valid_y_selected, preds_selected)
print(f"VALIDATION SCORE : {score_selected}")

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.880149
[200]	valid_0's auc: 0.882729
[300]	valid_0's auc: 0.884905
[400]	valid_0's auc: 0.886494
[500]	valid_0's auc: 0.887854
[600]	valid_0's auc: 0.888948
[700]	valid_0's auc: 0.889783
[800]	valid_0's auc: 0.890449
[900]	valid_0's auc: 0.89085
[1000]	valid_0's auc: 0.891243
[1100]	valid_0's auc: 0.891457
[1200]	valid_0's auc: 0.891643
[1300]	valid_0's auc: 0.891764
[1400]	valid_0's auc: 0.89188
[1500]	valid_0's auc: 0.891937
[1600]	valid_0's auc: 0.891942
Early stopping, best iteration is:
[1551]	valid_0's auc: 0.891963
VALIDATION SCORE : 0.8919626274881871
