We are gong to use different models for prediction, and ensemble them together

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split, GridSearchCV 
from sklearn.metrics import confusion_matrix, f1_score
from catboost import CatBoostClassifier

import pandas as pd

In [2]:
train_df = pd.read_csv('processed data\processed_train_Agebin.csv')
test_df = pd.read_csv('processed data\processed_test_Agebin.csv')
TARGET = 'Transported'

In [3]:
X = train_df.drop(TARGET, axis=1)
y = train_df[TARGET]

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape

(6954, 18)

In [5]:
y_train

2333    False
2589    False
8302     True
8177     True
500      True
        ...  
5734     True
5191    False
5390    False
860     False
7270    False
Name: Transported, Length: 6954, dtype: bool

In [6]:
# xgb_model = CatBoostClassifier()
# model = xgb_model.fit(X_train, y_train)

# #print("Performance on train data:", model.score(X_train, y_train))

In [7]:
# y_pred_v = model.predict(X_valid)

# y_pred_v

In [8]:
MLA = [
    SVC(),
    XGBClassifier(),
    LGBMClassifier(),
    #CatBoostClassifier()
]

row_index = 0

# Setting up the table to compare the performances of each model
MLA_cols = ['Model', 'Accuracy']
MLA_compare = pd.DataFrame(columns = MLA_cols)

# Iterate and store scores in the table
for model in MLA:
    MLA_compare.loc[row_index, 'Model'] = model.__class__.__name__
    cv_results = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    MLA_compare.loc[row_index, 'Accuracy'] = cv_results.mean()
    
    row_index+=1

# Present table
MLA_compare.sort_values(by=['Accuracy'], ascending=False, inplace=True)
MLA_compare

Unnamed: 0,Model,Accuracy
2,LGBMClassifier,0.797962
1,XGBClassifier,0.790768
0,SVC,0.786454


hmmm, this is the baseline, we can then do the hyperparameter tunning.

In [9]:
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 1001)

In [10]:
from sklearn import preprocessing
X_p = preprocessing.scale(X)

In [11]:
svc = SVC(probability = True)

svc_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

svc_optimal = RandomizedSearchCV(svc, svc_grid, n_iter=24, scoring='accuracy', n_jobs=-1, cv=skf.split(X_p,y), verbose=2, random_state=42)
svc_optimal.fit(X_p, y)
print(svc_optimal.best_score_)
print(svc_optimal.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.8min finished


0.7946638776964658
{'kernel': 'rbf', 'gamma': 'scale', 'C': 1}


In [12]:
# svc = SVC(probability = True)

# svc_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
#             'kernel': ['linear', 'rbf'],
#             'gamma': ['scale', 'auto']}

# svc_optimal = RandomizedSearchCV(svc, svc_grid, n_iter=12, scoring='accuracy', n_jobs=-1, cv=skf.split(X,y), verbose=2, random_state=42)
# svc_optimal.fit(X, y)
# print(svc_optimal.best_score_)
# print(svc_optimal.best_params_)

In [13]:
boost_grid = {
        'n_estimators': [100, 500, 1000],
        #'gamma': [0.5, 1, 1.5, 2, 5],
        'learning_rate': [0.01, 0.05, 0.1, 0.15],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 7, 9]
        }

In [14]:
lgbm = LGBMClassifier()
lgbm_optimal = RandomizedSearchCV(lgbm, param_distributions=boost_grid, n_iter=100, scoring='roc_auc', n_jobs=-1, cv=skf.split(X,y), verbose=2, random_state=42 )
lgbm_optimal.fit(X, y)
print(lgbm_optimal.best_score_)
print(lgbm_optimal.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   33.5s finished


0.8934018320774555
{'subsample': 0.8, 'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


In [15]:
xgb = XGBClassifier()
xgb_optimal = RandomizedSearchCV(xgb, param_distributions=boost_grid, n_iter=100, scoring='roc_auc', n_jobs=-1, cv=skf.split(X,y), verbose=2, random_state=42 )
xgb_optimal.fit(X, y)
print(xgb_optimal.best_score_)
print(xgb_optimal.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.5min finished


0.8935190841275636
{'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


In [16]:
xgb_op = xgb_optimal.best_estimator_
svc_op = svc_optimal.best_estimator_
lgbm_op = lgbm_optimal.best_estimator_

In [17]:
from sklearn.ensemble import VotingClassifier

# Create Hard Voting Classifier
Ensemble_HV = VotingClassifier(estimators= [('SVC', svc_op),
                                           ('XBG', xgb_op),
                                           ('LGBM', lgbm_op)],
                              voting = 'hard')

# Create Soft Voting Classifier
Ensemble_SV = VotingClassifier(estimators= [('SVC', svc_op),
                                           ('XBG', xgb_op),
                                           ('LGBM', lgbm_op)],
                              voting = 'soft')


In [18]:
# Return Accuracy Scores
cv_HV = cross_val_score(Ensemble_HV, X, y, scoring='accuracy')
cv_SV = cross_val_score(Ensemble_SV, X, y, scoring='accuracy')

print('Hard Voting Classifier:' , cv_HV.mean())
print('Soft Voting Classifier:' , cv_SV.mean())

Hard Voting Classifier: 0.7844284408787506
Soft Voting Classifier: 0.794206357766821


In [19]:
Ensemble_HV.fit(X_train, y_train)

VotingClassifier(estimators=[('SVC', SVC(C=1, probability=True)),
                             ('XBG',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            callbacks=None, colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.6,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=-1, grow_policy='depthwise',
                                            importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.01, max_bin=256,
                                            max_cat_to_onehot=4,
                                   

In [20]:
Ensemble_SV.fit(X_train, y_train)

VotingClassifier(estimators=[('SVC', SVC(C=1, probability=True)),
                             ('XBG',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            callbacks=None, colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.6,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0,
                                            gpu_id=-1, grow_policy='depthwise',
                                            importance_type=None,
                                            interaction_constraints='',...
                                            max_cat_to_onehot=4,
                                            max_delta_step=0, max_depth=5,
                                  

In [21]:
y_pred_hv = Ensemble_HV.predict(X_valid)

print("Performance on validation data HV:", f1_score(y_valid, y_pred_hv, average='micro'))

Performance on validation data HV: 0.7958596894767108


In [22]:
y_pred_sv = Ensemble_SV.predict(X_valid)

print("Performance on validation data SV:", f1_score(y_valid, y_pred_sv, average='micro'))

Performance on validation data SV: 0.78953421506613


In [23]:
test_df_copy = test_df.copy()
def predict(model):
    model.fit(X_train, y_train)
    Y_pred = model.predict(test_df)
    submission_df = pd.read_csv('sample_submission.csv')
    submission_df["Transported"] = Y_pred
    submission_df["Transported"] = submission_df["Transported"].astype(bool)
    return submission_df

In [24]:
predict(lgbm_op).to_csv('submission_lgbm_optimal2.csv', index=False)

In [25]:
predict(Ensemble_HV).to_csv('submission_Ensemble_HV2.csv', index=False)
predict(Ensemble_SV).to_csv('submission_Ensemble_SV2.csv', index=False)