In [1]:
import pandas as pd
import numpy as np

In [2]:
knn = pd.get_dummies(pd.read_csv('subs_ens/knn_submission.csv')['label'])
nn = pd.get_dummies(pd.read_csv('subs_ens/submissionNN-2.csv')['label'])
rf = pd.get_dummies(pd.read_csv('subs_ens/RF__submission.csv')['label'])
lgbm = pd.get_dummies(pd.read_csv('subs_ens/submissionLGB.csv')['label'])
xgboost = pd.get_dummies(pd.read_csv('subs_ens/submission_xgboost7.csv')['label'])

Perform ensembling by summing result from models (taking in account some weights).
In case of tie, the class prior will be used to determine the class to use :
- class 0    0.374034
- class 1    0.144082
- class 2    0.192593
- class 3    0.237882

In [3]:
def generateEnsemble(knn, nn, log, lgbm, xgboost, knn_w, nn_w, log_w, lgbm_w, xgboost_w):
    df = knn_w * knn + nn_w * nn + log_w * log + lgbm_w * lgbm + xgboost_w * xgboost
    #reorder column according to class priors
    df = df[[0, 3, 2, 1]].astype(int)

    #identify the max value and boost it by 1000
    #so we can get back the column in correct order and apply a simple argmax
    for index, row in df.head(5).iterrows():
        row[np.argmax(row)] = 1000
        df.iloc[index,:] = row
        
    #go back to initial order
    df = df[[0, 1, 2, 3]]
    
    return np.argmax(df.values, axis=1)
    
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0, 0, 1, 1, 1)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_all_noweights.csv', index=True, index_label='Id')

#use the LB scores as weights
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0, 0.937, 0.955, 0.962, 0.96)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_all_weights.csv', index=True, index_label='Id')

#exclude xgboost
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0, 1, 1, 1, 0)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_all_noxgboost.csv', index=True, index_label='Id')

#exclude lgbm
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0, 1, 1, 0, 1)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_all_nolgbm.csv', index=True, index_label='Id')

#exclude xgboost and weighted
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0.925, 0.937, 0.921, 0.962, 0)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_all_weights_noxgboost.csv', index=True, index_label='Id')

#exclude lgbm and weighted
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0.925, 0.937, 0.921, 0, 0.96)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_all_weights_nolgbm.csv', index=True, index_label='Id')

#lgbm & xgboost
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0, 0, 0, 1, 1)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_lgbm_xgboost.csv', index=True, index_label='Id')

#lgbm & rf
r = generateEnsemble(knn, nn, rf, lgbm, xgboost, 0, 0, 1, 1, 0)
pred_ens = pd.DataFrame(r, columns=['label'])
pred_ens.to_csv('../data/generated/ens_lgbm_rf.csv', index=True, index_label='Id')

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  return bound(*args, **kwds)
