In [1]:
# Importing required libraries

import operator
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing, metrics, ensemble, neighbors, linear_model, tree, model_selection
import time

In [2]:
# Wall-clock reference point; a later cell prints the elapsed seconds since this moment.
start_time = time.time()

In [3]:
# Reading CSVs

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
# Keep the test ids aside: 'id' is dropped from the feature matrix but is
# needed again when the submission frame is built.
test_id=test['id']

In [4]:
# Untouched copies of the raw frames; the feature-engineering section further
# down starts from these rather than from the frames processed in between.
train_copy = train.copy()
test_copy = test.copy()

In [5]:
# Split the raw frames into feature matrices and the target column.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated and is rejected by pandas 2.x.
train_X=train.drop(['id', 'is_pass'], axis=1)
test_X=test.drop(['id'], axis=1)
train_y=train['is_pass']

In [6]:
# Remember which rows had no age / engagement rating before those columns
# are imputed, so the models can still see the "was missing" signal.
for frame in (train_X, test_X):
    frame['age_NA'] = frame.age.isnull()
    frame['rating_NA'] = frame.trainee_engagement_rating.isnull()

In [7]:
# Imputing missing values.

# Missing engagement ratings become the sentinel -1. The result is assigned
# back to the column: calling fillna(inplace=True) on a column selection is
# chained in-place mutation, which can act on a temporary object and leave the
# frame unchanged (it is a no-op under pandas Copy-on-Write).
train_X['trainee_engagement_rating'] = train_X['trainee_engagement_rating'].fillna(-1)
test_X['trainee_engagement_rating'] = test_X['trainee_engagement_rating'].fillna(-1)

def replace_age(data):
    """Fill missing ages with the median age of the row's education group.

    The medians are computed from the rows of `data` where both 'age' and
    'education' are present. Rows whose education is missing, or whose
    education group has no known age, keep a missing age.

    The frame is modified in place and also returned.
    """
    complete_rows = data.loc[:, ['age', 'education']].dropna()
    median_age_by_education = complete_rows.groupby('education')['age'].median()
    group_median = data['education'].map(median_age_by_education)
    data['age'] = data['age'].fillna(group_median)
    return data

# Each frame is imputed from its own rows: replace_age derives the
# per-education medians from whatever frame it is given, so the test set is
# filled with test-set medians rather than the training ones.
train_X = replace_age(train_X)
test_X = replace_age(test_X)

In [8]:
# Label encoding

def encoder(data):
    """Label-encode every object-dtype column of `data`.

    A fresh LabelEncoder is fitted per column on the values found in `data`
    itself. The frame is modified in place and also returned.
    """
    object_columns = [name for name in data.columns if data.dtypes[name] == "object"]
    for name in object_columns:
        label_encoder = preprocessing.LabelEncoder()
        data[name] = label_encoder.fit_transform(data[name])
    return data

# Each call fits its own encoders on the frame it receives, so train and test
# are encoded independently of each other.
# NOTE(review): the integer codes only line up between the two frames if each
# categorical column holds the same set of values in both — confirm, or fit
# the encoders on the combined data.
train_X = encoder(train_X)
test_X = encoder(test_X)

In [9]:
# XGB

def create_feature_map(features):
    """Write an XGBoost feature-map file named 'xgb.fmap' in the working directory.

    One line is written per feature, in the form '<index>\\t<name>\\tq'.
    The file is overwritten on every call.

    Parameters
    ----------
    features : iterable of feature names, in model column order.
    """
    # The context manager closes the file even if a write raises.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, rounds=500, dep=8, eta=0.1):
    """Train an XGBoost binary classifier and score up to two feature sets.

    Parameters
    ----------
    train_X, train_y : training features and binary target.
    test_X : features to predict (the validation fold when `test_y` is given).
    test_y : optional labels for `test_X`. When given, training watches the
        AUC on (train, test) and stops after 100 rounds without improvement.
    test_X2 : optional second feature set to predict (e.g. the hold-out test
        set). When None, no second prediction is made.
    feature_names : optional list of column names. When given, the model dump
        ('xgbmodel.txt'), the feature map ('xgb.fmap') and the normalised
        f-score importances ('imp_feat.txt') are written to the working directory.
    rounds : maximum number of boosting rounds.
    dep : maximum tree depth.
    eta : learning rate.

    Returns
    -------
    (pred_test_y, loss, pred_test_y2)
        Predictions for `test_X`, the AUC on `test_X` (0 when `test_y` is
        None) and predictions for `test_X2` (None when `test_X2` is None).
    """
    params = {
        "objective": "binary:logistic",
        'eval_metric': 'auc',
        "eta": eta,
        "subsample": 1,
        "min_child_weight": 1,
        "colsample_bytree": 0.7,
        "max_depth": dep,
        "silent": 1,
        "seed": 0,
    }
    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
        importance = model.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
        # Normalise so the importances sum to 1.
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    # test_X2 defaults to None; building a DMatrix from None would raise.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
    return pred_test_y, loss, pred_test_y2

In [10]:
# LGM

def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, min_leaf=32, rounds=500, dep=4, eta=0.1):
    """Train a LightGBM binary classifier and score up to two feature sets.

    Parameters
    ----------
    train_X, train_y : training features and binary target.
    test_X : features to predict (the validation fold when `test_y` is given).
    test_y : optional labels for `test_X`. When given, `test_X` is used as the
        validation set with early stopping after 100 rounds.
    test_X2 : optional second feature set to predict. When None, no second
        prediction is made.
    feature_names : accepted for signature parity with runXGB; not used.
    min_leaf : value for LightGBM's `min_data_in_leaf`.
    rounds : maximum number of boosting rounds.
    dep : maximum tree depth.
    eta : learning rate.

    Returns
    -------
    (pred_test_y, loss, pred_test_y2)
        Predictions for `test_X`, the AUC on `test_X` (0 when `test_y` is
        None) and predictions for `test_X2` (None when `test_X2` is None).
    """
    params = {
        "objective": "binary",
        'metric': 'auc',
        "max_depth": dep,
        "min_data_in_leaf": min_leaf,
        "learning_rate": eta,
        "bagging_fraction": 1,
        "feature_fraction": 0.7,
        "bagging_freq": 1,
        "bagging_seed": 0,
        "verbosity": 0,
    }
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
    else:
        # No validation labels: train for the full number of rounds. (The
        # previous code built `lgb.DMatrix(test_X)` here; LightGBM has no
        # DMatrix class, so this branch always raised AttributeError.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # test_X2 defaults to None; predicting on None would raise.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
    return pred_test_y, loss, pred_test_y2

In [11]:
# RF

def runRF(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=2, feat=0.7):
    """Fit a 500-tree random forest and return positive-class probabilities.

    Parameters
    ----------
    train_X, train_y : training features and binary target.
    test_X : features to predict (the validation fold when `test_y` is given).
    test_y : optional labels for `test_X`; when given, train and test AUC are
        computed and printed.
    test_X2 : optional second feature set to predict. When None, no second
        prediction is made.
    depth, leaf, feat : `max_depth`, `min_samples_leaf` and `max_features`
        of the forest.

    Returns
    -------
    (test_preds, test_loss, test_preds2)
        Probabilities for `test_X`, the AUC on `test_X` (0 when `test_y` is
        None) and probabilities for `test_X2` (None when `test_X2` is None).
    """
    model = RandomForestClassifier(n_estimators = 500,
                                   criterion = 'gini',
                                   max_depth = depth,
                                   min_samples_split = 2,
                                   min_samples_leaf = leaf,
                                   max_features =  feat,
                                   n_jobs = -1,
                                   random_state = 0)
    model.fit(train_X, train_y)
    test_preds = model.predict_proba(test_X)[:,1]
    # test_X2 defaults to None; predicting on None would raise.
    test_preds2 = model.predict_proba(test_X2)[:,1] if test_X2 is not None else None
    test_loss = 0
    if test_y is not None:
        # Train predictions are only needed for the diagnostic print below.
        train_preds = model.predict_proba(train_X)[:,1]
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth, leaf, feat : ", depth, leaf, feat)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [12]:
# ET

def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=2, feat=0.7):
    """Fit a 700-tree extra-trees classifier and return positive-class probabilities.

    Parameters
    ----------
    train_X, train_y : training features and binary target.
    test_X : features to predict (the validation fold when `test_y` is given).
    test_y : optional labels for `test_X`; when given, train and test AUC are
        computed and printed.
    test_X2 : optional second feature set to predict. When None, no second
        prediction is made.
    depth, leaf, feat : `max_depth`, `min_samples_leaf` and `max_features`
        of the ensemble.

    Returns
    -------
    (test_preds, test_loss, test_preds2)
        Probabilities for `test_X`, the AUC on `test_X` (0 when `test_y` is
        None) and probabilities for `test_X2` (None when `test_X2` is None).
    """
    model = ensemble.ExtraTreesClassifier(
                                        n_estimators = 700,
                                        criterion = 'gini',
                                        max_depth = depth,
                                        min_samples_split = 2,
                                        min_samples_leaf = leaf,
                                        max_features =  feat,
                                        n_jobs = -1,
                                        random_state = 0)
    model.fit(train_X, train_y)
    test_preds = model.predict_proba(test_X)[:,1]
    # test_X2 defaults to None; predicting on None would raise.
    test_preds2 = model.predict_proba(test_X2)[:,1] if test_X2 is not None else None
    test_loss = 0
    if test_y is not None:
        # Train predictions are only needed for the diagnostic print below.
        train_preds = model.predict_proba(train_X)[:,1]
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth, leaf, feat : ", depth, leaf, feat)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [13]:
# Catboost

def runCatB(train_X, train_y, test_X, test_y=None, test_X2=None, depth=8, eta=0.15):
    """Fit a CatBoost classifier and return positive-class probabilities.

    Parameters
    ----------
    train_X, train_y : training features and binary target.
    test_X : features to predict (the validation fold when `test_y` is given).
    test_y : optional labels for `test_X`. When given, (test_X, test_y) is
        passed as the evaluation set and train/test AUC are printed.
    test_X2 : optional second feature set to predict. When None, no second
        prediction is made.
    depth : tree depth.
    eta : learning rate.

    Returns
    -------
    (test_preds, test_loss, test_preds2)
        Probabilities for `test_X`, the AUC on `test_X` (0 when `test_y` is
        None) and probabilities for `test_X2` (None when `test_X2` is None).
    """
    model = CatBoostClassifier(
                                iterations = 5000,
                                learning_rate = eta,
                                depth = depth,
                                od_type='Iter',
                                od_wait=100,
                                eval_metric = 'AUC',
                                verbose=False,
                                random_seed=0)

    if test_y is not None:
        model.fit(train_X, train_y, eval_set=(test_X, test_y))
    else:
        # Without labels there is no valid evaluation set to hand to CatBoost.
        model.fit(train_X, train_y)
    test_preds = model.predict_proba(test_X)[:,1]
    # test_X2 defaults to None; predicting on None would raise.
    test_preds2 = model.predict_proba(test_X2)[:,1] if test_X2 is not None else None
    test_loss = 0
    if test_y is not None:
        # Train predictions are only needed for the diagnostic print below.
        train_preds = model.predict_proba(train_X)[:,1]
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth : ", depth)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [14]:
def runADA(train_X, train_y, test_X, test_y=None, test_X2=None, depth=16, estimators=100, eta=0.1):
    """Fit AdaBoost over decision trees and return positive-class probabilities.

    Parameters
    ----------
    train_X, train_y : training features and binary target.
    test_X : features to predict (the validation fold when `test_y` is given).
    test_y : optional labels for `test_X`; when given, train and test AUC are
        computed and printed.
    test_X2 : optional second feature set to predict. When None, no second
        prediction is made.
    depth : `max_depth` of each base decision tree.
    estimators : number of boosting stages.
    eta : learning rate.

    Returns
    -------
    (test_preds, test_loss, test_preds2)
        Probabilities for `test_X`, the AUC on `test_X` (0 when `test_y` is
        None) and probabilities for `test_X2` (None when `test_X2` is None).
    """
    model = AdaBoostClassifier(
                                DecisionTreeClassifier(max_depth=depth, max_features=0.7, min_samples_leaf=5),
                                n_estimators = estimators,
                                learning_rate = eta,
                                random_state=0)

    model.fit(train_X, train_y)
    test_preds = model.predict_proba(test_X)[:,1]
    # test_X2 defaults to None; predicting on None would raise.
    test_preds2 = model.predict_proba(test_X2)[:,1] if test_X2 is not None else None
    test_loss = 0
    if test_y is not None:
        # Train predictions are only needed for the diagnostic print below.
        train_preds = model.predict_proba(train_X)[:,1]
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth : ", depth)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [15]:
# Level-1 models. For every model: stratified K-fold, out-of-fold predictions
# for the training rows (val_pred) and fold-averaged predictions for the test
# rows (test_pred). `results` collects the out-of-fold AUC per model.
n_splits = 5
results = pd.DataFrame(index=range(6), columns=['Model', 'AUC Score'])
k=0
col=[]
val_pred=pd.DataFrame()
test_pred=pd.DataFrame()
for model_name in ['XGB', 'LGM', 'RF', 'ET', 'CatBoost', 'ADA']:
    kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X, train_y):
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        # kf.split yields positions, so index the target positionally as well.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

        if model_name=='XGB':
            pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=10000, feature_names=dev_X.columns.tolist(), dep=10, eta=0.15)
        elif model_name=='LGM':
            pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=10000, dep=14, min_leaf=32, eta=0.1)
        elif model_name=='RF':
            pred_val, loss, pred_test = runRF(dev_X, dev_y, val_X, val_y, test_X, depth=30)
        elif model_name=='ET':
            pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, leaf=2, depth=24)
        elif model_name=='CatBoost':
            pred_val, loss, pred_test = runCatB(dev_X, dev_y, val_X, val_y, test_X, depth=8, eta=0.2)
        elif model_name=='ADA':
            pred_val, loss, pred_test = runADA(dev_X, dev_y, val_X, val_y, test_X, estimators=100, depth=16)

        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test
        cv_scores.append(loss)
        print(cv_scores)
    # Average the test predictions over the folds.
    pred_test_full /= float(n_splits)
    col.append(model_name)
    val_pred=pd.concat([val_pred, pd.DataFrame(pred_val_full)], axis=1)
    test_pred=pd.concat([test_pred, pd.DataFrame(pred_test_full)], axis=1)
    val_pred.columns=col
    test_pred.columns=col
    oof_auc = metrics.roc_auc_score(train_y, pred_val_full)
    print(oof_auc)
    # .loc writes into `results` itself; `results['Model'][k] = ...` is chained
    # assignment and may write to a temporary copy.
    results.loc[k, 'Model'] = model_name
    results.loc[k, 'AUC Score'] = oof_auc
    k=k+1

[0]	train-auc:0.743407	test-auc:0.711717
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.848088	test-auc:0.765559
[40]	train-auc:0.877619	test-auc:0.775989
[60]	train-auc:0.892163	test-auc:0.780564
[80]	train-auc:0.902896	test-auc:0.784364
[100]	train-auc:0.915652	test-auc:0.788622
[120]	train-auc:0.924797	test-auc:0.791042
[140]	train-auc:0.932023	test-auc:0.793222
[160]	train-auc:0.938619	test-auc:0.795605
[180]	train-auc:0.943591	test-auc:0.796583
[200]	train-auc:0.948631	test-auc:0.797843
[220]	train-auc:0.952794	test-auc:0.798442
[240]	train-auc:0.956242	test-auc:0.799626
[260]	train-auc:0.959035	test-auc:0.800149
[280]	train-auc:0.963365	test-auc:0.801391
[300]	train-auc:0.967512	test-auc:0.80269
[320]	train-auc:0.969552	test-auc:0.802548
[340]	train-auc:0.971658	test-auc:0.802487
[360]	train-auc:0.9741	test-auc:0.802737
[380]	train-auc:0.976223	test-auc:0.803116
[400]	t

[20]	valid_0's auc: 0.727769
[40]	valid_0's auc: 0.738807
[60]	valid_0's auc: 0.746907
[80]	valid_0's auc: 0.752689
[100]	valid_0's auc: 0.755946
[120]	valid_0's auc: 0.758216
[140]	valid_0's auc: 0.760883
[160]	valid_0's auc: 0.761875
[180]	valid_0's auc: 0.763458
[200]	valid_0's auc: 0.76477
[220]	valid_0's auc: 0.766414
[240]	valid_0's auc: 0.767942
[260]	valid_0's auc: 0.769109
[280]	valid_0's auc: 0.769857
[300]	valid_0's auc: 0.770817
[320]	valid_0's auc: 0.771915
[340]	valid_0's auc: 0.772686
[360]	valid_0's auc: 0.773833
[380]	valid_0's auc: 0.774462
[400]	valid_0's auc: 0.775796
[420]	valid_0's auc: 0.776649
[440]	valid_0's auc: 0.777221
[460]	valid_0's auc: 0.777538
[480]	valid_0's auc: 0.778338
[500]	valid_0's auc: 0.779209
[520]	valid_0's auc: 0.779764
[540]	valid_0's auc: 0.780224
[560]	valid_0's auc: 0.780692
[580]	valid_0's auc: 0.781148
[600]	valid_0's auc: 0.781513
[620]	valid_0's auc: 0.782329
[640]	valid_0's auc: 0.783126
[660]	valid_0's auc: 0.783687
[680]	valid_0's

[320]	valid_0's auc: 0.77105
[340]	valid_0's auc: 0.771624
[360]	valid_0's auc: 0.772455
[380]	valid_0's auc: 0.773263
[400]	valid_0's auc: 0.773907
[420]	valid_0's auc: 0.774469
[440]	valid_0's auc: 0.774783
[460]	valid_0's auc: 0.775412
[480]	valid_0's auc: 0.776556
[500]	valid_0's auc: 0.777109
[520]	valid_0's auc: 0.777702
[540]	valid_0's auc: 0.778482
[560]	valid_0's auc: 0.77932
[580]	valid_0's auc: 0.779894
[600]	valid_0's auc: 0.780356
[620]	valid_0's auc: 0.780833
[640]	valid_0's auc: 0.781195
[660]	valid_0's auc: 0.781741
[680]	valid_0's auc: 0.78213
[700]	valid_0's auc: 0.782157
[720]	valid_0's auc: 0.782476
[740]	valid_0's auc: 0.782621
[760]	valid_0's auc: 0.782875
[780]	valid_0's auc: 0.783268
[800]	valid_0's auc: 0.783736
[820]	valid_0's auc: 0.784567
[840]	valid_0's auc: 0.784658
[860]	valid_0's auc: 0.785061
[880]	valid_0's auc: 0.78519
[900]	valid_0's auc: 0.785457
[920]	valid_0's auc: 0.786012
[940]	valid_0's auc: 0.786458
[960]	valid_0's auc: 0.787209
[980]	valid_0'

Depth :  8
Train and Test loss :  0.9495073204321538 0.7815128174855079
[0.7827274124135906, 0.7799671046942976, 0.7730935692237046, 0.7815128174855079]
Depth :  8
Train and Test loss :  0.9537103936612255 0.7794357127969784
[0.7827274124135906, 0.7799671046942976, 0.7730935692237046, 0.7815128174855079, 0.7794357127969784]
0.7792677843218343
Depth :  16
Train and Test loss :  1.0 0.7812866049941078
[0.7812866049941078]
Depth :  16
Train and Test loss :  1.0 0.7795099999682367
[0.7812866049941078, 0.7795099999682367]
Depth :  16
Train and Test loss :  1.0 0.7731018196687434
[0.7812866049941078, 0.7795099999682367, 0.7731018196687434]
Depth :  16
Train and Test loss :  1.0 0.7771599356271159
[0.7812866049941078, 0.7795099999682367, 0.7731018196687434, 0.7771599356271159]
Depth :  16
Train and Test loss :  1.0 0.7726411845274241
[0.7812866049941078, 0.7795099999682367, 0.7731018196687434, 0.7771599356271159, 0.7726411845274241]
0.7766462784861665


In [16]:
# Out-of-fold AUC of each level-1 model.
results

Unnamed: 0,Model,AUC Score
0,XGB,0.799456
1,LGM,0.78787
2,RF,0.788003
3,ET,0.768532
4,CatBoost,0.779268
5,ADA,0.776646


In [17]:
# First rows of the out-of-fold training predictions, one column per model.
val_pred.head()

Unnamed: 0,XGB,LGM,RF,ET,CatBoost,ADA
0,0.172341,0.269694,0.310238,0.365507,0.22814,0.118497
1,0.526569,0.435421,0.502952,0.480056,0.496453,0.428799
2,0.964663,0.876711,0.925989,0.979604,0.996834,0.9999
3,0.414002,0.343552,0.455548,0.687844,0.632503,0.372227
4,0.775838,0.72406,0.817584,0.778462,0.883979,0.900544


In [18]:
# First rows of the fold-averaged test predictions, one column per model.
test_pred.head()

Unnamed: 0,XGB,LGM,RF,ET,CatBoost,ADA
0,0.507899,0.621235,0.547748,0.56414,0.565631,0.371691
1,0.987997,0.980881,0.984391,0.961018,0.983799,0.998974
2,0.542262,0.604982,0.462229,0.448704,0.483299,0.39511
3,0.052881,0.116347,0.122345,0.332739,0.195817,0.017869
4,0.421241,0.351761,0.682138,0.521596,0.364408,0.414697


In [19]:
# Concatenating predictions with training and test sets respectively
# (column-wise; concat aligns the rows on the frames' index labels).

train_X_with_preds = pd.concat([train_X, val_pred], axis=1)
test_X_with_preds = pd.concat([test_X, test_pred], axis=1)

In [20]:
# Stacking only the predictions with LGM
# Level-2 model: LightGBM trained on the level-1 out-of-fold predictions alone
# (no original features). The splitter uses the same n_splits / shuffle / seed
# settings as the level-1 loop.

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(val_pred.shape[0])
for dev_index, val_index in kf.split(val_pred, train_y):
    dev_X, val_X = val_pred.iloc[dev_index,:], val_pred.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_pred, rounds=10000, dep=4)
    pred_val_full[val_index] = pred_val
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    print(cv_scores)
# Average the test predictions over the 5 folds.
pred_test_full /= 5.
print(metrics.roc_auc_score(train_y, pred_val_full))
# Keep this stacker's outputs under their own names; pred_*_full is reused by later cells.
test_stacked1 = pred_test_full
val_stacked1 = pred_val_full

Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.810128
[40]	valid_0's auc: 0.810195
[60]	valid_0's auc: 0.809861
[80]	valid_0's auc: 0.809785
[100]	valid_0's auc: 0.809812
[120]	valid_0's auc: 0.809577
Early stopping, best iteration is:
[37]	valid_0's auc: 0.810272
0.8102717936804451
[0.8102717936804451]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.806039
[40]	valid_0's auc: 0.806335
[60]	valid_0's auc: 0.806038
[80]	valid_0's auc: 0.806156
[100]	valid_0's auc: 0.806029
[120]	valid_0's auc: 0.805918
Early stopping, best iteration is:
[31]	valid_0's auc: 0.806444
0.8064443935839551
[0.8102717936804451, 0.8064443935839551]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.797804
[40]	valid_0's auc: 0.797759
[60]	valid_0's auc: 0.797588
[80]	valid_0's auc: 0.797291
[100]	valid_0's auc: 0.797011
[120]	valid_0's auc: 0.796817
Early stopping, best iteration is:
[30]	valid_0's auc: 

In [21]:
# Stacking predictions and features with ExtraTrees
# Level-2 model: runET trained on the original features plus the level-1
# out-of-fold predictions.

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_X_with_preds.shape[0])
for dev_index, val_index in kf.split(train_X_with_preds, train_y):
    dev_X, val_X = train_X_with_preds.iloc[dev_index,:], train_X_with_preds.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X_with_preds, depth=10)
    pred_val_full[val_index] = pred_val
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    print(cv_scores)
# Average the test predictions over the 5 folds.
pred_test_full /= 5.
print(metrics.roc_auc_score(train_y, pred_val_full))
# Keep this stacker's outputs under their own names; pred_*_full is reused by later cells.
test_stacked2 = pred_test_full
val_stacked2 = pred_val_full

Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8443827745573647 0.8130480271639717
[0.8130480271639717]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8452398773938994 0.8086516780373387
[0.8130480271639717, 0.8086516780373387]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8468353334954959 0.8003311781582572
[0.8130480271639717, 0.8086516780373387, 0.8003311781582572]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8445552405332121 0.81207884125279
[0.8130480271639717, 0.8086516780373387, 0.8003311781582572, 0.81207884125279]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8456987044250389 0.8060520896921322
[0.8130480271639717, 0.8086516780373387, 0.8003311781582572, 0.81207884125279, 0.8060520896921322]
0.807794590455011


In [22]:
def _category_statistic(train, col, target, option, global_mean, alpha=5):
    """Return the per-category Series that `option` maps onto a column's values."""
    grouped = train.groupby(col)[target]
    if option in (1, 5):
        # Plain target mean per category (option 5 uses it for the test frame).
        return grouped.mean()
    if option == 2:
        # Weight of evidence: log(positives / negatives).
        positives = grouped.sum()
        return np.log(positives / (grouped.count() - positives))
    if option == 3:
        # Number of positive targets.
        return grouped.sum()
    if option == 4:
        # Positives minus negatives.
        return 2 * grouped.sum() - grouped.count()
    # Any other option: mean smoothed towards the global mean.
    counts = grouped.count()
    return (grouped.mean() * counts + global_mean * alpha) / (counts + alpha)


def mean_encoder(train, test, target=None, cols=None, option=1):
    """Add a '<col>_mean_encoded' target-encoding column for each categorical column.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    train : frame holding the categorical columns and the target.
    test : frame holding the same categorical columns; encoded with the
        statistics computed on `train`.
    target : name of the target column; defaults to the last column of `train`.
    cols : columns to encode; defaults to every object-dtype column of
        `train` other than the target.
    option : encoding scheme
        1 - mean of the target per category
        2 - weight of evidence, log(positives / negatives)
        3 - count of positive targets per category
        4 - positives minus negatives per category
        5 - expanding mean on `train` (mean of the target over the *earlier*
            rows of the same category; the first row of a category is 0/0,
            i.e. NaN), plain category mean on `test`
        anything else - category mean smoothed towards the global mean with
            alpha = 5

    Returns
    -------
    (train, test) with the new columns added.

    Notes
    -----
    Categories of `test` that do not occur in `train` are filled with the
    global mean of the target for every option.
    NOTE(review): for options 2-4 that fill value is on a different scale than
    the encoded statistic — confirm this is intended.
    """
    if target is None:
        target = train.columns[-1]
    if cols is None:
        # Exclude the target by name rather than dropping whichever object
        # column happens to come last.
        cols = [c for c in train.select_dtypes(include=['object']).columns if c != target]

    global_mean = train[target].mean()

    for col in cols:
        encoded = col + '_mean_encoded'
        statistic = _category_statistic(train, col, target, option, global_mean)

        if option == 5:
            # Sum and count of the target over the preceding rows of the same category.
            prior_sum = train.groupby(col)[target].cumsum() - train[target]
            prior_count = train.groupby(col).cumcount()
            train[encoded] = prior_sum / prior_count
        else:
            train[encoded] = train[col].map(statistic)

        # Assign the filled Series back instead of calling fillna(inplace=True)
        # on a column selection, which may act on a temporary copy.
        test[encoded] = test[col].map(statistic).fillna(global_mean)

    return train, test

In [23]:
# Columns that the mean-encoded models replace by their '<col>_mean_encoded'
# counterpart (the raw columns are dropped after encoding).
cat_feature = ['program_id', 'program_type', 'program_duration', 'test_id', 'test_type', 'difficulty_level',
               'trainee_id', 'gender', 'education', 'city_tier',  'total_programs_enrolled', 'is_handicapped']

In [24]:
# XGB and LGM on expanding-mean encoded categoricals (mean_encoder option=5).
# The encoding is recomputed inside every fold from the fold's training rows
# only, so the validation rows never contribute to their own encoding.
n_splits = 5
results = pd.DataFrame(index=range(2), columns=['Model', 'AUC Score'])
k=0
col=[]
val_pred=pd.DataFrame()
test_pred=pd.DataFrame()
for model_name in ['XGB', 'LGM']:
    kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X, train_y):
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        # kf.split yields positions, so index the target positionally as well.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        # Fold-local frame with the target attached; it has its own name so
        # the raw `train` frame loaded at the top is not overwritten.
        fold_train = pd.concat([dev_X, dev_y], axis=1)
        # mean_encoder adds columns to the frames it receives; pass copies so
        # neither the validation slice nor the shared test_X is modified.
        fold_train, val_X = mean_encoder(fold_train, val_X.copy(), target='is_pass', cols=cat_feature, option=5)
        fold_train, test_X1 = mean_encoder(fold_train, test_X.copy(), target='is_pass', cols=cat_feature, option=5)
        test_X1 = test_X1.drop(cat_feature, axis=1)
        dev_X = fold_train.drop(cat_feature, axis=1)
        dev_X = dev_X.drop(['is_pass'], axis=1)
        dev_y = fold_train['is_pass']
        val_X = val_X.drop(cat_feature, axis=1)

        if model_name=='XGB':
            pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, feature_names=dev_X.columns.tolist(), dep=8, eta=0.1)
        elif model_name=='LGM':
            pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, dep=14, eta=0.01)

        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test
        cv_scores.append(loss)
        print(cv_scores)
    # Average the test predictions over the folds.
    pred_test_full /= float(n_splits)
    col.append(model_name)
    val_pred=pd.concat([val_pred, pd.DataFrame(pred_val_full)], axis=1)
    test_pred=pd.concat([test_pred, pd.DataFrame(pred_test_full)], axis=1)
    val_pred.columns=col
    test_pred.columns=col
    oof_auc = metrics.roc_auc_score(train_y, pred_val_full)
    print(oof_auc)
    # .loc writes into `results` itself; `results['Model'][k] = ...` is chained
    # assignment and may write to a temporary copy.
    results.loc[k, 'Model'] = model_name
    results.loc[k, 'AUC Score'] = oof_auc
    k=k+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


[0]	train-auc:0.737261	test-auc:0.723093
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.827999	test-auc:0.815891
[40]	train-auc:0.846989	test-auc:0.817317
[60]	train-auc:0.865881	test-auc:0.816545
[80]	train-auc:0.874804	test-auc:0.816541
[100]	train-auc:0.880941	test-auc:0.816307
[120]	train-auc:0.888775	test-auc:0.816467
[140]	train-auc:0.895813	test-auc:0.816278
Stopping. Best iteration:
[48]	train-auc:0.855691	test-auc:0.817719

[0.817718974405488]
[0]	train-auc:0.736907	test-auc:0.718446
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.830587	test-auc:0.810579
[40]	train-auc:0.850129	test-auc:0.8113
[60]	train-auc:0.866706	test-auc:0.810209
[80]	train-auc:0.875212	test-auc:0.808569
[100]	train-auc:0.881846	test-auc:0.806634
[120]	train-auc:0.889043	test-auc:0.805

[100]	valid_0's auc: 0.811586
[120]	valid_0's auc: 0.812754
[140]	valid_0's auc: 0.813799
[160]	valid_0's auc: 0.814741
[180]	valid_0's auc: 0.815536
[200]	valid_0's auc: 0.816165
[220]	valid_0's auc: 0.81676
[240]	valid_0's auc: 0.81721
[260]	valid_0's auc: 0.817587
[280]	valid_0's auc: 0.81796
[300]	valid_0's auc: 0.818299
[320]	valid_0's auc: 0.818688
[340]	valid_0's auc: 0.819066
[360]	valid_0's auc: 0.819399
[380]	valid_0's auc: 0.819767
[400]	valid_0's auc: 0.819946
[420]	valid_0's auc: 0.820248
[440]	valid_0's auc: 0.820491
[460]	valid_0's auc: 0.820636
[480]	valid_0's auc: 0.820754
[500]	valid_0's auc: 0.820884
[520]	valid_0's auc: 0.820995
[540]	valid_0's auc: 0.821103
[560]	valid_0's auc: 0.8212
[580]	valid_0's auc: 0.821278
[600]	valid_0's auc: 0.821328
[620]	valid_0's auc: 0.821351
[640]	valid_0's auc: 0.821392
[660]	valid_0's auc: 0.821428
[680]	valid_0's auc: 0.821466
[700]	valid_0's auc: 0.821465
[720]	valid_0's auc: 0.821523
[740]	valid_0's auc: 0.821572
[760]	valid_0's

In [25]:
# Out-of-fold AUC of the two mean-encoded models.
results

Unnamed: 0,Model,AUC Score
0,XGB,0.811195
1,LGM,0.814176


In [26]:
# First rows of the out-of-fold predictions of the mean-encoded models.
val_pred.head()

Unnamed: 0,XGB,LGM
0,0.201661,0.286641
1,0.625853,0.649665
2,0.913107,0.904516
3,0.307189,0.33072
4,0.660799,0.677801


In [27]:
# combining stacked predictions with mean encoded predictions
# Resulting column order: the mean-encoded model columns of val_pred/test_pred,
# then the first stacker's output, then the second stacker's output.

val_combined = pd.concat([val_pred, pd.DataFrame(val_stacked1)], axis=1)
val_combined = pd.concat([val_combined, pd.DataFrame(val_stacked2)], axis=1)
test_combined = pd.concat([test_pred, pd.DataFrame(test_stacked1)], axis=1)
test_combined = pd.concat([test_combined, pd.DataFrame(test_stacked2)], axis=1)

In [28]:
# Name the four prediction columns identically in both frames so the final
# stacker sees the same feature names at fit and predict time.
val_combined.columns = ['XGB_mean_encoded', 'LGM_mean_encoded', 'Stacked1', 'Stacked2']
test_combined.columns = ['XGB_mean_encoded', 'LGM_mean_encoded', 'Stacked1', 'Stacked2']

In [29]:
# Final stacker inputs: original features plus the four prediction columns.
# The test features are selected by name so that any extra columns present on
# test_X (e.g. '*_mean_encoded' columns) are left out.
train_X_with_stacked_preds = pd.concat([train_X, val_combined], axis=1)
test_X_with_stacked_preds = pd.concat([test_X[['program_id', 'program_type', 'program_duration', 'test_id',
       'test_type', 'difficulty_level', 'trainee_id', 'gender', 'education',
       'city_tier', 'age', 'total_programs_enrolled', 'is_handicapped',
       'trainee_engagement_rating', 'age_NA', 'rating_NA']], test_combined], axis=1)

In [30]:
# Stacking predictions and features with LGM
# Final model: a depth-2 LightGBM over the original features plus the four
# prediction columns. pred_test_full from this cell is what gets submitted.

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_X_with_stacked_preds.shape[0])
for dev_index, val_index in kf.split(train_X_with_stacked_preds, train_y):
    dev_X, val_X = train_X_with_stacked_preds.iloc[dev_index,:], train_X_with_stacked_preds.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X_with_stacked_preds, dep=2)
    pred_val_full[val_index] = pred_val
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    print(cv_scores)
# Average the test predictions over the 5 folds.
pred_test_full /= 5.
print(metrics.roc_auc_score(train_y, pred_val_full))

Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.826557
[40]	valid_0's auc: 0.82699
[60]	valid_0's auc: 0.827105
[80]	valid_0's auc: 0.827102
[100]	valid_0's auc: 0.827115
[120]	valid_0's auc: 0.827047
[140]	valid_0's auc: 0.826989
[160]	valid_0's auc: 0.826914
Early stopping, best iteration is:
[71]	valid_0's auc: 0.827239
0.8272387212943614
[0.8272387212943614]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.820384
[40]	valid_0's auc: 0.820907
[60]	valid_0's auc: 0.820928
[80]	valid_0's auc: 0.820982
[100]	valid_0's auc: 0.820926
[120]	valid_0's auc: 0.820994
[140]	valid_0's auc: 0.820925
[160]	valid_0's auc: 0.820901
[180]	valid_0's auc: 0.820854
[200]	valid_0's auc: 0.820816
Early stopping, best iteration is:
[111]	valid_0's auc: 0.82102
0.8210195118404748
[0.8272387212943614, 0.8210195118404748]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.813218
[40]	valid_0's auc: 0.8

In [31]:
# Submission frame: test ids paired with the fold-averaged predictions.
result=pd.DataFrame({'id':test_id, 'is_pass':pred_test_full})

In [32]:
# Write the submission without the pandas index column.
result.to_csv('result_final_1.csv', index=False)

In [33]:
# Elapsed wall-clock seconds since start_time was recorded.
print(time.time()-start_time)

2023.9516987800598


In [34]:
## Some feature engineering

# Third character of 'program_id', kept as a string.
# NOTE(review): presumably the ids look like '<type>_<n>', making this the
# program number; only a single character is taken — confirm no id has a
# multi-digit number.

train_copy['program_number'] = train_copy['program_id'].str[2]
test_copy['program_number'] = test_copy['program_id'].str[2]

# It can be seen that programs S & U have 2 types, X & Z have 3 types and T, V & Y have 4 types.
# (The lookup tables below have descriptive names; they were previously bound
# to `dict`, which replaced the builtin for the rest of the notebook.)

types_per_program = {'S':2, 'T':4, 'U':2, 'V':4, 'X':3, 'Y':4, 'Z':3}
train_copy['number_of_program_types'] = train_copy.program_type.map(types_per_program)
test_copy['number_of_program_types'] = test_copy.program_type.map(types_per_program)

# Pairwise interaction features: string concatenation of two categorical columns.
train_copy['program_id_difficulty'] = train_copy.program_id+'_'+train_copy.difficulty_level
test_copy['program_id_difficulty'] = test_copy.program_id+'_'+test_copy.difficulty_level

train_copy['education_difficulty'] = train_copy.education+'_'+train_copy.difficulty_level
test_copy['education_difficulty'] = test_copy.education+'_'+test_copy.difficulty_level

train_copy['program_type_difficulty'] = train_copy.program_type+'_'+train_copy.difficulty_level
test_copy['program_type_difficulty'] = test_copy.program_type+'_'+test_copy.difficulty_level

# Label describing a program type's difficulty levels. The values are only
# used as category labels, so their spelling is kept exactly as it was.
difficulty_levels_per_program = {
        'S':'easy_intermediate_hard',
        'T':'easy_intermediate_hard',
        'U':'easy_intermediate_veryhard',
        'V':'easy_intermediate_hard_veryhard',
        'X':'intermidiate_hard',
        'Y':'easy_intermediate_hard',
        'Z':'easy'
        }
train_copy['program_difficulty_levels'] = train_copy.program_type.map(difficulty_levels_per_program)
test_copy['program_difficulty_levels'] = test_copy.program_type.map(difficulty_levels_per_program)

In [35]:
# Rebuild the feature matrices from the engineered copies and repeat the
# preprocessing of the first pass: missing-value flags, imputation and label
# encoding.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated and is rejected by pandas 2.x.
train_X=train_copy.drop(['id', 'is_pass'], axis=1)
test_X=test_copy.drop(['id'], axis=1)
train_y=train_copy['is_pass']

train_X['age_NA'] = train_X.age.isnull()
test_X['age_NA'] = test_X.age.isnull()


train_X['rating_NA'] = train_X.trainee_engagement_rating.isnull()
test_X['rating_NA'] = test_X.trainee_engagement_rating.isnull()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which may act on a temporary copy and leave the frame unchanged.
train_X['trainee_engagement_rating'] = train_X['trainee_engagement_rating'].fillna(-1)
test_X['trainee_engagement_rating'] = test_X['trainee_engagement_rating'].fillna(-1)

train_X = replace_age(train_X)
test_X = replace_age(test_X)

train_X = encoder(train_X)
test_X = encoder(test_X)

In [36]:
# Columns to mean-encode in the second pass: the original categorical columns
# plus the engineered features added in the feature-engineering cell.
cat_feature = ['program_id', 'program_type', 'program_duration', 'test_id', 'test_type', 'difficulty_level', 
               'trainee_id', 'gender', 'education', 'city_tier',  'total_programs_enrolled', 'is_handicapped', 
               'program_number','number_of_program_types', 'program_id_difficulty','education_difficulty', 
               'program_type_difficulty','program_difficulty_levels']

In [37]:
# XGB and LGM on expanding-mean encoded categoricals (mean_encoder option=5),
# this time including the engineered features. The encoding is recomputed
# inside every fold from the fold's training rows only.
n_splits = 5
results = pd.DataFrame(index=range(2), columns=['Model', 'AUC Score'])
k=0
col=[]
val_pred=pd.DataFrame()
test_pred=pd.DataFrame()
for model_name in ['XGB', 'LGM']:
    kf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X, train_y):
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        # kf.split yields positions, so index the target positionally as well.
        dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]
        # Fold-local frame with the target attached; it has its own name so
        # the raw `train` frame loaded at the top is not overwritten.
        fold_train = pd.concat([dev_X, dev_y], axis=1)
        # mean_encoder adds columns to the frames it receives; pass copies so
        # neither the validation slice nor the shared test_X is modified.
        fold_train, val_X = mean_encoder(fold_train, val_X.copy(), target='is_pass', cols=cat_feature, option=5)
        fold_train, test_X1 = mean_encoder(fold_train, test_X.copy(), target='is_pass', cols=cat_feature, option=5)
        test_X1 = test_X1.drop(cat_feature, axis=1)
        dev_X = fold_train.drop(cat_feature, axis=1)
        dev_X = dev_X.drop(['is_pass'], axis=1)
        dev_y = fold_train['is_pass']
        val_X = val_X.drop(cat_feature, axis=1)

        if model_name=='XGB':
            pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, feature_names=dev_X.columns.tolist(), dep=8, eta=0.1)
        elif model_name=='LGM':
            pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, dep=14, eta=0.01)

        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test
        cv_scores.append(loss)
        print(cv_scores)
    # Average the test predictions over the folds.
    pred_test_full /= float(n_splits)
    col.append(model_name)
    val_pred=pd.concat([val_pred, pd.DataFrame(pred_val_full)], axis=1)
    test_pred=pd.concat([test_pred, pd.DataFrame(pred_test_full)], axis=1)
    val_pred.columns=col
    test_pred.columns=col
    oof_auc = metrics.roc_auc_score(train_y, pred_val_full)
    print(oof_auc)
    # .loc writes into `results` itself; `results['Model'][k] = ...` is chained
    # assignment and may write to a temporary copy.
    results.loc[k, 'Model'] = model_name
    results.loc[k, 'AUC Score'] = oof_auc
    k=k+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


[0]	train-auc:0.769384	test-auc:0.789542
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.831405	test-auc:0.814058
[40]	train-auc:0.853902	test-auc:0.815582
[60]	train-auc:0.870127	test-auc:0.815588
[80]	train-auc:0.880132	test-auc:0.815271
[100]	train-auc:0.886654	test-auc:0.814678
[120]	train-auc:0.893916	test-auc:0.81474
[140]	train-auc:0.901175	test-auc:0.814168
Stopping. Best iteration:
[47]	train-auc:0.860849	test-auc:0.815777

[0.8157771230323079]


KeyboardInterrupt: 

In [None]:
# Out-of-fold AUC per model for the engineered-feature run.
# NOTE(review): the recorded run of the preceding cell ended in a
# KeyboardInterrupt, so this table is only complete after a full re-run.
results

In [None]:
# Alternative submission built from the LGM column of test_pred only;
# writing it to disk is left commented out.
result=pd.DataFrame({'id':test_id, 'is_pass':test_pred.LGM})
#result.to_csv('result1.csv', index=False)

Submission - result_final.csv