In [None]:
pip install xgboost lightgbm catboost

In [48]:
import pandas as pd
import numpy as np


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

pd.set_option('display.max_columns', 500)

In [49]:
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
greeks = pd.read_csv('Data/greeks.csv')
sub = pd.read_csv('Data/sample_submission.csv')

In [50]:
LE = LabelEncoder()

train['EJ'] = LE.fit_transform(train['EJ'])
test['EJ'] = LE.transform(test['EJ'])

### Exploration

In [None]:
train.head()

In [None]:
test

In [None]:
greeks.head()

In [None]:
print(train.shape)
print(test.shape)
print(greeks.shape)

In [None]:
missing = train.isna().sum().reset_index()
missing.columns = ['columns', 'missing_count']
missing.sort_values('missing_count', ascending = False).head(10)

In [None]:
train['Id'].nunique()

### Modelling

In [51]:
## Defining the input and target variables
X = train.drop(columns = ['Class'])
Y = train['Class']

## Splitting the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify = Y)

## Re-defining the training set
train = pd.concat([X_train, Y_train], axis = 1).reset_index(drop = True)

In [52]:
## Defining the input and target variables
X = train.drop(columns = ['Id', 'Class'], axis = 1)
Y = train['Class']

X_test = X_test.drop(columns = ['Id'], axis = 1)

## Defining lists to store results
lgbm_cv_scores, lgbm_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
cat_cv_scores, cat_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## Performing KFold cross-validation
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
    
for i, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
    
    print('---------------------------------------------')
    
    ## LightGBM
    lgbm_md = LGBMClassifier(objective = 'binary', class_weight = 'balanced', verbosity = -1, 
                            metric = 'binary_logloss').fit(X_train, Y_train)
    
    lgbm_pred_valid = lgbm_md.predict_proba(X_valid)   
    lgbm_pred_test = lgbm_md.predict_proba(X_test)  
    
    lgbm_score_fold = log_loss(Y_valid, lgbm_pred_valid)
    
    lgbm_cv_scores.append(lgbm_score_fold)
    lgbm_preds.append(lgbm_pred_test)
    
    print('Fold', i+1, '==> LightGBM oof log-loss is ==>', lgbm_score_fold)
    
    ## XGBoost
    xgb_md = XGBClassifier(objective = 'binary:logistic', scale_pos_weight = 4.71, verbosity = 0).fit(X_train, Y_train)
        
    xgb_pred_valid = xgb_md.predict_proba(X_valid)   
    xgb_pred_test = xgb_md.predict_proba(X_test)  
    
    xgb_score_fold = log_loss(Y_valid, xgb_pred_valid)
    
    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_test)
    
    print('Fold', i+1, '==> XGBoost oof log-loss is ==>', xgb_score_fold)
    
    ## CatBoost
    cat_md = CatBoostClassifier(auto_class_weights = 'Balanced', verbose = False).fit(X_train, Y_train)
        
    cat_pred_valid = cat_md.predict_proba(X_valid)   
    cat_pred_test = cat_md.predict_proba(X_test)  
    
    cat_score_fold = log_loss(Y_valid, cat_pred_valid)
    
    cat_cv_scores.append(cat_score_fold)
    cat_preds.append(cat_pred_test)
    
    print('Fold', i+1, '==> CatBoost oof log-loss is ==>', cat_score_fold)
    
    ## Ensemble
    X_train_ens = pd.DataFrame({'LGBM': lgbm_pred_valid[:,1].tolist(),  
                                'XGB': xgb_pred_valid[:,1].tolist(), 
                                'CAT': cat_pred_valid[:,1].tolist()})
    X_test_ens = pd.DataFrame({'LGBM': lgbm_pred_test[:,1].tolist(), 
                               'XGB': xgb_pred_test[:,1].tolist(), 
                               'CAT': cat_pred_test[:,1].tolist()})
    
    ens_md = RandomForestClassifier(max_depth = 3, n_estimators = 100, max_features = None).fit(X_train_ens, Y_valid)
    
    ens_pred_valid = ens_md.predict_proba(X_train_ens)
    ens_pred_test = ens_md.predict_proba(X_test_ens)  
    
    ens_score_fold = log_loss(Y_valid, ens_pred_valid)
    
    ens_cv_scores.append(ens_score_fold)
    ens_preds.append(ens_pred_test)
    
    print('Fold', i+1, '==> Ensemble oof log-loss is ==>', ens_score_fold)

---------------------------------------------
Fold 1 ==> LightGBM oof log-loss is ==> 0.23197055888916274
Fold 1 ==> XGBoost oof log-loss is ==> 0.20710055509866743
Fold 1 ==> CatBoost oof log-loss is ==> 0.20926145602751287
Fold 1 ==> Ensemble oof log-loss is ==> 0.07520874031829519
---------------------------------------------
Fold 2 ==> LightGBM oof log-loss is ==> 0.0999441092159285
Fold 2 ==> XGBoost oof log-loss is ==> 0.06309621669389436
Fold 2 ==> CatBoost oof log-loss is ==> 0.12051072686411989
Fold 2 ==> Ensemble oof log-loss is ==> 0.02312977211400287
---------------------------------------------
Fold 3 ==> LightGBM oof log-loss is ==> 0.25235316293518595
Fold 3 ==> XGBoost oof log-loss is ==> 0.18712614108873954
Fold 3 ==> CatBoost oof log-loss is ==> 0.16386238226912275
Fold 3 ==> Ensemble oof log-loss is ==> 0.03498209736996718
---------------------------------------------
Fold 4 ==> LightGBM oof log-loss is ==> 0.2458691130105267
Fold 4 ==> XGBoost oof log-loss is ==> 0.

In [53]:
lgbm_preds_test = np.mean(lgbm_preds, axis = 0).tolist()
xgb_preds_test = np.mean(xgb_preds, axis = 0).tolist()
cat_preds_test = np.mean(cat_preds, axis = 0).tolist()
ens_preds_test = np.mean(ens_preds, axis = 0).tolist()

print('LightGBM test log-loss is ==>', log_loss(Y_test, lgbm_preds_test))
print('XGBoost test log-loss is ==>', log_loss(Y_test, xgb_preds_test))
print('CatBoost test log-loss is ==>', log_loss(Y_test, cat_preds_test))
print('Ensemble test log-loss is ==>', log_loss(Y_test, ens_preds_test))

LightGBM test log-loss is ==> 0.1629458584464075
XGBoost test log-loss is ==> 0.186683319208035
CatBoost test log-loss is ==> 0.1820564525750845
Ensemble test log-loss is ==> 0.18342686305637823


In [None]:
## Saving best predictions as a data-frame
predictions = pd.DataFrame(best_preds_test, , columns = ['class_0', 'class_1'])

## Finalizing submissions data file
sub['class_0'] = predictions['class_0']
sub['class_1'] = predictions['class_1']

sub.to_csv('submission.csv',index = False)