In [11]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
import random
import gc
import time
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# UTILS

In [13]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(95)

# CONFIG

In [15]:
config = {
    "DATA_PATH" : "D:/Documents/GitHub/ml-pipeline/data/TPS-MAR2021/train.csv",
    "TARGET_VAR" : "target",
}

# DATA & FEATURE ENGINEERING

In [17]:
df = pd.read_csv(config["DATA_PATH"])
target = df[config["TARGET_VAR"]]
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [19]:
def cat_encoding(dataframe):
    cat = dataframe.columns[1:20]
    for feature in cat:
        le = LabelEncoder()
        le.fit(dataframe[feature])
        dataframe[feature] = le.transform(dataframe[feature])
    return dataframe

def feature_engineering(dataframe):
    dataframe = cat_encoding(dataframe)
    features = dataframe.columns[1:31]
    return dataframe, features

In [21]:
df, features = feature_engineering(df)
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,0,8,0,1,1,33,0,44,54,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,0,8,0,0,4,33,8,48,3,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,0,10,0,0,4,33,0,30,38,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,0,10,0,2,4,33,0,50,3,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,0,8,6,1,4,33,2,32,54,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


## GET PCA FEATURES

In [23]:
cont_features = df[features].columns[19:]

In [25]:
pca = PCA(n_components=4)
pca.fit(df[cont_features])
print(pca.explained_variance_ratio_)

[0.41382892 0.20012941 0.10590638 0.0827102 ]


In [27]:
pca_values = pca.transform(df[cont_features])
pca_values

array([[ 0.7583725 ,  0.3144808 , -0.12439139, -0.06128659],
       [-0.11459219, -0.09886094, -0.00098513, -0.1732633 ],
       [-0.10473512, -0.22005765, -0.35083298,  0.00784454],
       ...,
       [-0.20756834,  0.25664666, -0.37682105,  0.05553589],
       [ 0.14421467,  0.43126352,  0.16308763, -0.03884611],
       [ 0.96459898, -0.1866586 , -0.06461013,  0.21870288]])

In [29]:
df_pca = pd.DataFrame(pca_values, columns=["PCA_1", "PCA_2", "PCA_3", "PCA_4"])
df_pca

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4
0,0.758373,0.314481,-0.124391,-0.061287
1,-0.114592,-0.098861,-0.000985,-0.173263
2,-0.104735,-0.220058,-0.350833,0.007845
3,0.985506,-0.067565,-0.226378,-0.096329
4,-0.513774,-0.226829,-0.021793,0.324951
...,...,...,...,...
299995,0.337956,-0.353694,0.210301,0.014814
299996,0.311800,0.287399,0.117082,-0.082754
299997,-0.207568,0.256647,-0.376821,0.055536
299998,0.144215,0.431264,0.163088,-0.038846


In [31]:
no_cont = df.columns.difference(cont_features)
df_train = df[no_cont].join(df_pca)
df_train.columns

Index(['cat0', 'cat1', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
       'cat16', 'cat17', 'cat18', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6',
       'cat7', 'cat8', 'cat9', 'id', 'target', 'PCA_1', 'PCA_2', 'PCA_3',
       'PCA_4'],
      dtype='object')

## SPLIT TRAIN AND VALID SETS

In [33]:
train_x, valid_x, train_y, valid_y = train_test_split(df[features], target, test_size=0.2, random_state=95)

## METRIC

In [35]:
auc = roc_auc_score

# TRAIN MODELS WITH NORMAL FEATURES

## MODEL PARAMETERS

In [36]:
LGBM_parameters = {
    'objective': 'binary',
    'metric' : 'auc',
    'n_estimators' : 1000,
    'random_state' : 95,
    'cat_smooth' : 30,
    'reg_alpha': 0.0031293275223408185,
    'reg_lambda': 0.04787145507141445,
    'colsample_bytree': 0.3,
    'subsample': 0.6,
    'learning_rate': 0.008,
    'max_depth': 100,
    'num_leaves': 584,
    'min_child_samples': 173
}

In [37]:
model = lgb.LGBMClassifier(**LGBM_parameters)
model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=100, verbose=100)
preds = model.predict_proba(valid_x)[:, 1]
score = auc(valid_y, preds)
print(f"Validation score : {score}")

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.885724
[200]	valid_0's auc: 0.887945
[300]	valid_0's auc: 0.889507
[400]	valid_0's auc: 0.890755
[500]	valid_0's auc: 0.891713
[600]	valid_0's auc: 0.8925
[700]	valid_0's auc: 0.893206
[800]	valid_0's auc: 0.893656
[900]	valid_0's auc: 0.894049
[1000]	valid_0's auc: 0.894348
Did not meet early stopping. Best iteration is:
[998]	valid_0's auc: 0.894351
Validation score : 0.8943508333455539


# TRAIN MODELS WITH PCA FEATURES

In [47]:
pca_features = df_train.columns.difference(["id", "target"])
pca_features

Index(['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'cat0', 'cat1', 'cat10', 'cat11',
       'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat2',
       'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9'],
      dtype='object')

In [48]:
train_pca_x, valid_pca_x, train_pca_y, valid_pca_y = train_test_split(df_train[pca_features], target, test_size=0.2, random_state=95)

In [50]:
model_pca = lgb.LGBMClassifier(**LGBM_parameters)
model_pca.fit(train_pca_x, train_pca_y, eval_set=[(valid_pca_x, valid_pca_y)], early_stopping_rounds=100, verbose=100)
preds_pca = model_pca.predict_proba(valid_pca_x)[:, 1]
score_pca = auc(valid_pca_y, preds_pca)
print(f"Validation score : {score_pca}")

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.882307
[200]	valid_0's auc: 0.884257
[300]	valid_0's auc: 0.885562
[400]	valid_0's auc: 0.886554
[500]	valid_0's auc: 0.887406
[600]	valid_0's auc: 0.887988
[700]	valid_0's auc: 0.888396
[800]	valid_0's auc: 0.888687
[900]	valid_0's auc: 0.888917
[1000]	valid_0's auc: 0.889027
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.889027
Validation score : 0.8890274820315557
