In [None]:
pip install lightgbm xgboost optuna sklego catboost flaml

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from flaml import AutoML
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

pd.set_option('display.max_columns', 500)

Matplotlib is building the font cache; this may take a moment.


In [35]:
## Reading the competition data: training set, test set and submission template
csv_paths = ['Data/train.csv', 'Data/test.csv', 'Data/sample_submission.csv']
train, test, sub = [pd.read_csv(csv_path) for csv_path in csv_paths]

In [4]:
## Preview of the first five training rows
train.head(n = 5)

Unnamed: 0,id,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,...,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2,EC1,EC2,EC3,EC4,EC5,EC6
0,0,323.390782,9.879918,5.875576,5.875576,4.304757,4.304757,2.754513,1.749203,0.0,...,4.794537,35.527357,0,0,1,1,0,0,0,0
1,1,273.723798,7.259037,4.441467,5.834958,3.285046,4.485235,2.201375,1.289775,45.135471,...,13.825658,44.70731,0,0,0,1,1,0,0,0
2,2,521.643822,10.911303,8.527859,11.050864,6.665291,9.519706,5.824822,1.770579,15.645394,...,17.964475,45.66012,0,0,1,1,0,0,1,0
3,3,567.431166,12.453343,7.089119,12.833709,6.478023,10.978151,7.914542,3.067181,95.639554,...,31.961948,87.509997,0,0,1,1,0,0,0,0
4,4,112.770735,4.414719,2.866236,2.866236,1.875634,1.875634,1.03645,0.727664,17.980451,...,9.589074,33.333333,2,2,1,0,1,1,1,0


In [5]:
## Feature columns: every column of train except the row id and the six EC labels
non_feature_cols = ['id', 'EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6']
inputs = train.columns.drop(non_feature_cols)

## The two labels this notebook trains models for
targets = ['EC1', 'EC2']

### EC1:

In [7]:
## Cross-validated FLAML models for EC1.
## For every stratified fold an LGBM and a CatBoost model are tuned with FLAML,
## scored on the held-out part of the fold (ROC-AUC) and used to predict the test set.
##
## NOTE: the result lists created here (aml_*_cv_scores, aml_*_preds, ens_*) are
## rebound by the EC2 training cell. Aggregate the EC1 results before running it.

## Defining input and target variables for training
X = train[inputs]
Y = train['EC1']
X_test = test[inputs]

## Creating lists to store results (one entry per fold)
aml_lgb_cv_scores, aml_lgb_preds = list(), list()
aml_cat_cv_scores, aml_cat_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## One row per FLAML learner:
## (estimator name, log message, list of fold scores, list of test-set predictions)
flaml_runs = [
    ('lgbm', '==> FLAML (LGBM) oof ROC-AUC is ==>', aml_lgb_cv_scores, aml_lgb_preds),
    ('catboost', '==> FLAML (CatBoost) oof ROC-AUC is ==>', aml_cat_cv_scores, aml_cat_preds),
]

## FLAML settings shared by both learners; only 'estimator_list' differs per run
base_settings = {'time_budget': 120,
                 'metric': 'roc_auc',
                 'task': 'classification',
                 "log_file_name": '',
                }

## Performing stratified k fold
skf = StratifiedKFold(n_splits = 3, random_state = 42, shuffle = True)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

    print('---------------------------------------------------------------')

    ## Positive-class probabilities of this fold, keyed by estimator name
    valid_pred, test_pred = dict(), dict()

    for estimator, message, cv_scores, test_preds in flaml_runs:

        ## A fresh AutoML object per learner and fold, restricted to one estimator
        automl = AutoML()
        automl_settings = {**base_settings, 'estimator_list': [estimator]}

        automl.fit(X_train = X_train, y_train = Y_train, **automl_settings, verbose = False)

        valid_pred[estimator] = automl.predict_proba(X_valid)[:, 1]
        test_pred[estimator] = automl.predict_proba(X_test)[:, 1]

        score_fold = roc_auc_score(Y_valid, valid_pred[estimator])
        cv_scores.append(score_fold)
        test_preds.append(test_pred[estimator])

        print('Fold', i+1, message, score_fold)

    #######################
    ## Weighted Ensemble ##
    #######################

    ## Weighted average of the two learners: CatBoost counts twice as much as LGBM
    ens_pred_1 = (valid_pred['lgbm'] + 2 * valid_pred['catboost']) / 3
    ens_pred_2 = (test_pred['lgbm'] + 2 * test_pred['catboost']) / 3

    ens_score_fold = roc_auc_score(Y_valid, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)

    print('Fold', i+1, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

---------------------------------------------------------------
Fold 1 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.6933283691083688
Fold 1 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.7014718908592845
Fold 1 ==> Average Ensemble oof ROC-AUC score is ==> 0.7017584301992932
---------------------------------------------------------------
Fold 2 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.706954190006724
Fold 2 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.7120133138523437
Fold 2 ==> Average Ensemble oof ROC-AUC score is ==> 0.7132354824520912
---------------------------------------------------------------
Fold 3 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.6884766992208511
Fold 3 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.6961391459279268
Fold 3 ==> Average Ensemble oof ROC-AUC score is ==> 0.6979231601875144


In [8]:
## Mean ROC-AUC over the folds for each model, taken from the score lists
## filled by the most recently run training loop
flaml_lgb, flaml_cat, ens_cv_score = [
    np.mean(fold_scores)
    for fold_scores in (aml_lgb_cv_scores, aml_cat_cv_scores, ens_cv_scores)
]

for label, mean_score in [('LGBM: ', flaml_lgb), ('CAT: ', flaml_cat), ('ENSEMBLE: ', ens_cv_score)]:
    print(label, mean_score)

LGBM:  0.6962530861119812
CAT:  0.7032081168798516
ENSEMBLE:  0.7043056909462996


In [19]:
## Averaging the per-fold test-set probabilities into one prediction per test row
## (each list entry is one fold; the mean is taken across folds).
##
## NOTE: aml_lgb_preds / aml_cat_preds / ens_preds are rebound by the EC2 training
## cell, so this cell has to run right after the EC1 training loop. If it is run
## after EC2 has been trained, the *_EC1 names silently hold EC2 predictions.
lgb_preds_EC1 = pd.DataFrame(aml_lgb_preds).mean(axis = 0)
cat_preds_EC1 = pd.DataFrame(aml_cat_preds).mean(axis = 0)
ens_preds_EC1 = pd.DataFrame(ens_preds).mean(axis = 0)

### EC2:

In [10]:
## Cross-validated FLAML models for EC2.
## For every stratified fold an LGBM and a CatBoost model are tuned with FLAML,
## scored on the held-out part of the fold (ROC-AUC) and used to predict the test set.
##
## NOTE: this cell rebinds the result lists (aml_*_cv_scores, aml_*_preds, ens_*)
## that the EC1 training cell also uses. Aggregate the EC1 results before running it.

## Defining input and target variables for training
X = train[inputs]
Y = train['EC2']
X_test = test[inputs]

## Creating lists to store results (one entry per fold)
aml_lgb_cv_scores, aml_lgb_preds = list(), list()
aml_cat_cv_scores, aml_cat_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## One row per FLAML learner:
## (estimator name, log message, list of fold scores, list of test-set predictions)
flaml_runs = [
    ('lgbm', '==> FLAML (LGBM) oof ROC-AUC is ==>', aml_lgb_cv_scores, aml_lgb_preds),
    ('catboost', '==> FLAML (CatBoost) oof ROC-AUC is ==>', aml_cat_cv_scores, aml_cat_preds),
]

## FLAML settings shared by both learners; only 'estimator_list' differs per run
base_settings = {'time_budget': 120,
                 'metric': 'roc_auc',
                 'task': 'classification',
                 "log_file_name": '',
                }

## Performing stratified k fold
skf = StratifiedKFold(n_splits = 3, random_state = 42, shuffle = True)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

    print('---------------------------------------------------------------')

    ## Positive-class probabilities of this fold, keyed by estimator name
    valid_pred, test_pred = dict(), dict()

    for estimator, message, cv_scores, test_preds in flaml_runs:

        ## A fresh AutoML object per learner and fold, restricted to one estimator
        automl = AutoML()
        automl_settings = {**base_settings, 'estimator_list': [estimator]}

        automl.fit(X_train = X_train, y_train = Y_train, **automl_settings, verbose = False)

        valid_pred[estimator] = automl.predict_proba(X_valid)[:, 1]
        test_pred[estimator] = automl.predict_proba(X_test)[:, 1]

        score_fold = roc_auc_score(Y_valid, valid_pred[estimator])
        cv_scores.append(score_fold)
        test_preds.append(test_pred[estimator])

        print('Fold', i+1, message, score_fold)

    #######################
    ## Weighted Ensemble ##
    #######################

    ## Weighted average of the two learners: CatBoost counts twice as much as LGBM
    ens_pred_1 = (valid_pred['lgbm'] + 2 * valid_pred['catboost']) / 3
    ens_pred_2 = (test_pred['lgbm'] + 2 * test_pred['catboost']) / 3

    ens_score_fold = roc_auc_score(Y_valid, ens_pred_1)
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_2)

    print('Fold', i+1, '==> Average Ensemble oof ROC-AUC score is ==>', ens_score_fold)

---------------------------------------------------------------
Fold 1 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.581510571526324
Fold 1 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.5885879548546339
Fold 1 ==> Average Ensemble oof ROC-AUC score is ==> 0.5896543226973174
---------------------------------------------------------------
Fold 2 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.5890642437621681
Fold 2 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.5867380395734733
Fold 2 ==> Average Ensemble oof ROC-AUC score is ==> 0.5869915851383605
---------------------------------------------------------------
Fold 3 ==> FLAML (LGBM) oof ROC-AUC is ==> 0.569117671373827
Fold 3 ==> FLAML (CatBoost) oof ROC-AUC is ==> 0.5814448094687561
Fold 3 ==> Average Ensemble oof ROC-AUC score is ==> 0.5817140625934023


In [11]:
## Mean ROC-AUC over the folds for each model, taken from the score lists
## filled by the most recently run training loop
flaml_lgb, flaml_cat, ens_cv_score = [
    np.mean(fold_scores)
    for fold_scores in (aml_lgb_cv_scores, aml_cat_cv_scores, ens_cv_scores)
]

for label, mean_score in [('LGBM: ', flaml_lgb), ('CAT: ', flaml_cat), ('ENSEMBLE: ', ens_cv_score)]:
    print(label, mean_score)

LGBM:  0.5798974955541064
CAT:  0.5855902679656211
ENSEMBLE:  0.5861199901430267


In [18]:
## Averaging the per-fold test-set probabilities into one prediction per test row
## (each list entry is one fold; the mean is taken across folds).
##
## NOTE: aml_lgb_preds / aml_cat_preds / ens_preds hold the output of whichever
## training loop ran last, so this cell has to run right after the EC2 training loop.
lgb_preds_EC2 = pd.DataFrame(aml_lgb_preds).mean(axis = 0)
cat_preds_EC2 = pd.DataFrame(aml_cat_preds).mean(axis = 0)
ens_preds_EC2 = pd.DataFrame(ens_preds).mean(axis = 0)

### Putting it all together:

In [36]:
## Writing one submission file for every (EC1 model, EC2 model) combination.
## File names follow '<EC1 tag>_<EC2 tag>_baseline.csv'; when both targets use the
## same model the tag appears only once (e.g. 'CAT_baseline.csv').

## Create the output folder if needed so the cell also works on a fresh checkout
out_dir = Path('Submissions')
out_dir.mkdir(parents = True, exist_ok = True)

## Test-set predictions per model, keyed by the tag used in the file names
preds_EC1 = {'LGBM': lgb_preds_EC1, 'CAT': cat_preds_EC1, 'ENS': ens_preds_EC1}
preds_EC2 = {'LGBM': lgb_preds_EC2, 'CAT': cat_preds_EC2, 'ENS': ens_preds_EC2}

## The ENS/ENS combination comes last, so `sub` ends up holding the ensemble
## predictions for both targets once the loops finish
for tag_EC1, pred_EC1 in preds_EC1.items():
    for tag_EC2, pred_EC2 in preds_EC2.items():
        sub['EC1'] = pred_EC1
        sub['EC2'] = pred_EC2

        file_tag = tag_EC1 if tag_EC1 == tag_EC2 else tag_EC1 + '_' + tag_EC2
        sub.to_csv(out_dir / (file_tag + '_baseline.csv'), index = False)

In [37]:
## Preview of the first five rows of the submission frame as last written
sub.head(n = 5)

Unnamed: 0,id,EC1,EC2
0,14838,0.792725,0.793511
1,14839,0.812629,0.807463
2,14840,0.756623,0.747261
3,14841,0.808022,0.811117
4,14842,0.774081,0.757889
