In [16]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
import random
import gc
import time
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# UTILS

In [17]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

seed_everything(95)

# CONFIG

In [18]:
config = {
    "DATA_PATH" : "D:/Documents/GitHub/ml-pipeline/data/TPS-AUG2021/train.csv",
    "TARGET_VAR" : "loss",
}

# DATA & FEATURE ENGINEERING

In [19]:
df = pd.read_csv(config["DATA_PATH"])
target = df[config["TARGET_VAR"]]
df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [20]:
def standardize(dataframe):
    scaler = preprocessing.StandardScaler()
    features = dataframe.columns[1:101]
    dataframe[features] = scaler.fit_transform(dataframe[features])
    return dataframe

def feature_engineering(dataframe):
    dataframe = standardize(dataframe)
    features = dataframe.columns[1:101]
    return dataframe, features

In [21]:
df, features = feature_engineering(df)
df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-1.668045,0.179768,0.498854,-1.767452,-0.585085,1.548611,-0.731714,1.096083,-0.163171,...,-0.576275,0.287884,-0.636568,1.048811,-0.066279,-0.659995,1.7062,1.113531,0.432237,15
1,1,0.88751,2.208234,-0.431857,-0.732528,3.907814,-0.863906,-0.021877,0.556142,-0.129652,...,-0.592941,-0.988442,-0.360043,-0.977224,-0.390478,-1.79077,-0.433968,1.21893,-1.222692,3
2,2,-0.62815,-0.763705,-0.408204,-0.546028,-0.364251,2.101452,-0.727663,-1.095,0.178041,...,-0.570408,0.985834,1.35839,0.439352,-0.101587,-0.224966,0.917008,0.185501,0.75375,6
3,3,-0.975889,-0.810879,-0.547278,1.131234,-0.611685,-0.753071,0.113171,1.238672,-0.643465,...,-0.570146,-1.478323,-0.595755,0.744873,-1.272692,0.235002,0.352117,-0.186368,0.350606,2
4,4,-0.232366,-0.740119,0.651167,-0.179911,-0.57981,0.748861,-0.503574,-0.74914,0.077481,...,-0.537094,-0.469704,-0.327684,-0.946548,-0.01782,-0.248517,1.277657,0.321752,-0.172972,1


## GET PCA FEATURES

In [22]:
cont_features = df[features].columns[:]

In [28]:
pca = PCA(n_components=50)
pca.fit(df[cont_features])
print(pca.explained_variance_ratio_)

[0.0122535  0.0118662  0.01157557 0.01149263 0.01141663 0.01121586
 0.0111645  0.0111064  0.01099467 0.01091636 0.01088764 0.01075831
 0.01071898 0.01064432 0.01060345 0.01055166 0.01053659 0.01050269
 0.01045709 0.01040076 0.01037213 0.01032226 0.01024697 0.01021993
 0.01020424 0.01017078 0.01015907 0.01010349 0.01008534 0.01006355
 0.01002127 0.01001707 0.01000924 0.00996051 0.00992925 0.00988012
 0.00986337 0.00981565 0.00981142 0.00979158 0.00977089 0.00973747
 0.00968123 0.00966925 0.00965474 0.00964646 0.00963211 0.0096186
 0.00957679 0.00956646]


In [29]:
sum(pca.explained_variance_ratio_)

0.5176650180769775

In [24]:
pca_values = pca.transform(df[cont_features])
pca_values

array([[ 0.04460108, -1.29655203,  0.27673364, ..., -0.32728838,
         0.97515358, -0.8366478 ],
       [ 1.61853719,  1.06977561,  2.51397395, ...,  0.12203802,
        -0.33364372,  1.72051899],
       [ 2.06351271,  0.02675095, -0.08013007, ..., -0.57305131,
        -1.00748682,  0.74146607],
       ...,
       [ 0.34668657,  0.51476461, -0.53412312, ..., -0.62450074,
        -0.11722809,  1.21783962],
       [-0.32108811, -0.36701198, -0.45253772, ..., -0.57752782,
         0.14354966,  0.71329143],
       [-0.02472797, -0.78844537,  0.53131295, ..., -0.86052798,
         1.33372977, -1.79455201]])

In [25]:
df_pca = pd.DataFrame(pca_values, columns=["PCA_1", "PCA_2"])
df_pca

ValueError: Shape of passed values is (250000, 10), indices imply (250000, 2)

In [None]:
#no_cont = df.columns.difference(cont_features)
df_train = df.join(df_pca)
df_train.columns

Index(['id', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       ...
       'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'loss', 'PCA_1',
       'PCA_2'],
      dtype='object', length=104)

## SPLIT TRAIN AND VALID SETS

In [None]:
train_x, valid_x, train_y, valid_y = train_test_split(df[features], target, test_size=0.2, random_state=95)

## METRIC

In [None]:
rmse = mean_squared_error

# TRAIN MODELS WITH NORMAL FEATURES

## MODEL PARAMETERS

In [None]:
XGB_REGRESSOR = {
    "objective": "reg:squarederror",
    "n_estimators" : 10000,
    "max_depth": 10,
    "learning_rate": 0.008,
    "colsample_bytree": 0.5,
    "subsample": 0.8,
    "reg_alpha" : 0.007915504076304212,
    "min_child_weight": 274,
    "n_jobs": 2,
    "seed": 95,
    'tree_method': "gpu_hist",
    "gpu_id": 0,
    'predictor': 'gpu_predictor'
}

In [None]:
model = xgb.XGBRegressor(**XGB_REGRESSOR)
model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], early_stopping_rounds=200, verbose=100)
preds = model.predict(valid_x)
score = rmse(valid_y, preds, squared=False)
print(f"Validation score : {score}")

[0]	validation_0-rmse:10.05074
[100]	validation_0-rmse:8.33602
[200]	validation_0-rmse:7.94097
[300]	validation_0-rmse:7.85303
[400]	validation_0-rmse:7.82977
[500]	validation_0-rmse:7.82019


KeyboardInterrupt: 

# TRAIN MODELS WITH PCA FEATURES

In [None]:
pca_features = df_train.columns.difference(["id", "loss"])
pca_features

Index(['PCA_1', 'PCA_2', 'f0', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15',
       ...
       'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99'],
      dtype='object', length=102)

In [None]:
train_pca_x, valid_pca_x, train_pca_y, valid_pca_y = train_test_split(df_train[pca_features], target, test_size=0.2, random_state=95)

In [None]:
model_pca = xgb.XGBRegressor(**XGB_REGRESSOR)
model_pca.fit(train_pca_x, train_pca_y, eval_set=[(valid_pca_x, valid_pca_y)], early_stopping_rounds=200, verbose=100)
preds_pca = model_pca.predict(valid_pca_x)
score_pca = rmse(valid_pca_y, preds_pca, squared=False)
print(f"Validation score : {score_pca}")

[0]	validation_0-rmse:10.05065
[100]	validation_0-rmse:8.33586
[200]	validation_0-rmse:7.94105
[300]	validation_0-rmse:7.85382
[400]	validation_0-rmse:7.83084
[500]	validation_0-rmse:7.82129
[600]	validation_0-rmse:7.81553
[700]	validation_0-rmse:7.81085
[800]	validation_0-rmse:7.80708
[900]	validation_0-rmse:7.80411
[1000]	validation_0-rmse:7.80179
[1100]	validation_0-rmse:7.79941
[1200]	validation_0-rmse:7.79774
[1300]	validation_0-rmse:7.79621
[1400]	validation_0-rmse:7.79490
[1500]	validation_0-rmse:7.79359
[1600]	validation_0-rmse:7.79240
[1700]	validation_0-rmse:7.79158
[1800]	validation_0-rmse:7.79098
[1900]	validation_0-rmse:7.79047
[2000]	validation_0-rmse:7.78989
[2100]	validation_0-rmse:7.78913
[2200]	validation_0-rmse:7.78886
[2300]	validation_0-rmse:7.78864
[2400]	validation_0-rmse:7.78837
[2500]	validation_0-rmse:7.78813
[2600]	validation_0-rmse:7.78735
[2700]	validation_0-rmse:7.78710
[2800]	validation_0-rmse:7.78701
[2872]	validation_0-rmse:7.78727
Validation score : 7.