In [1]:
# Importing required libraries

import operator
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing, metrics, ensemble, neighbors, linear_model, tree, model_selection
import time

In [2]:
start_time = time.time()

In [3]:
# Reading CSVs

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
test_id=test['id']

In [4]:
train_copy = train.copy()
test_copy = test.copy()

In [5]:
train_X=train.drop(['id', 'is_pass'], 1)
test_X=test.drop(['id'], 1)
train_y=train['is_pass']

In [6]:
train_X['age_NA'] = train_X.age.isnull()
test_X['age_NA'] = test_X.age.isnull()


train_X['rating_NA'] = train_X.trainee_engagement_rating.isnull()
test_X['rating_NA'] = test_X.trainee_engagement_rating.isnull()

In [7]:
# Imputing missing values.

train_X['trainee_engagement_rating'].fillna(-1, inplace=True)
test_X['trainee_engagement_rating'].fillna(-1, inplace=True)

def replace_age(data):
    data1 = data[['age', 'education']].dropna()
    dict1 = (data1.groupby('education')['age'].apply(lambda x : x.mode()[0]))
    data.age = data.age.fillna(data.education.map(dict1))
    return data

train_X = replace_age(train_X)
test_X = replace_age(test_X)

In [8]:
# Label encoding

def encoder(data):
    '''Map the categorical variables to numbers to work with scikit learn'''
    for col in data.columns:
        if data.dtypes[col] == "object":
            
            le = preprocessing.LabelEncoder()
            data[col]=le.fit_transform(data[col])
    return data

train_X = encoder(train_X)
test_X = encoder(test_X)

In [9]:
# XGB

def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i,feat))
    outfile.close()

def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, rounds=500, dep=8, eta=0.1):
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'auc'
    params["eta"] = eta
    params["subsample"] = 1
    params["min_child_weight"] = 1
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep
    params["silent"] = 1
    params["seed"] = 100
    #params["tree_method"] = 'hist'
    #params["max_leaves"] = 16
    #params["grow_policy"] = 'lossguide'
    #params["gamma"] = 1
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
        importance = model.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
    pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        return pred_test_y, loss, pred_test_y2
    else:
        return pred_test_y, loss, pred_test_y2

In [10]:
# LGM

def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, min_leaf=32, rounds=500, dep=4, eta=0.1):
    params = {}
    params["objective"] = "binary"
    params['metric'] = 'auc'
    params["max_depth"] = dep
    params["min_data_in_leaf"] = min_leaf
    params["learning_rate"] = eta
    params["bagging_fraction"] = 1
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 1
    params["bagging_seed"] = 100
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
        return pred_test_y, loss, pred_test_y2
    else:
        return pred_test_y, loss, pred_test_y2

In [11]:
# RF

def runRF(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=2, feat=0.7):
    model = RandomForestClassifier(n_estimators = 500,
                                   criterion = 'gini',
                                   max_depth = depth,
                                   min_samples_split = 2,
                                   min_samples_leaf = leaf,
                                   max_features =  feat,
                                   n_jobs = -1,
                                   #class_weight = 'balanced',
                                   random_state = 100)
    model.fit(train_X, train_y)
    train_preds = model.predict_proba(train_X)[:,1]   
    test_preds = model.predict_proba(test_X)[:,1]
    test_preds2 = model.predict_proba(test_X2)[:,1]
    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth, leaf, feat : ", depth, leaf, feat)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [12]:
# ET

def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=2, feat=0.7):
    model = ensemble.ExtraTreesClassifier(
                                        n_estimators = 700,
                                        criterion = 'gini',
                                        max_depth = depth,
                                        min_samples_split = 2,
                                        min_samples_leaf = leaf,
                                        max_features =  feat,
                                        #min_impurity_split = 0.1,
                                        n_jobs = -1,
                                        random_state = 100)
    model.fit(train_X, train_y)
    train_preds = model.predict_proba(train_X)[:,1]
    test_preds = model.predict_proba(test_X)[:,1]
    test_preds2 = model.predict_proba(test_X2)[:,1]
    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth, leaf, feat : ", depth, leaf, feat)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [13]:
# Catboost

def runCatB(train_X, train_y, test_X, test_y=None, test_X2=None, depth=8, eta=0.15):
    model = CatBoostClassifier(
                                iterations = 5000,
                                learning_rate = eta,
                                depth = depth,
                                od_type='Iter',
                                od_wait=100,
                                #l2_leaf_reg=6,
                                eval_metric = 'AUC', 
                                verbose=False,
                                random_seed=100)
    
    model.fit(train_X, train_y, eval_set=(test_X, test_y))#, plot=True)
    train_preds = model.predict_proba(train_X)[:,1]
    test_preds = model.predict_proba(test_X)[:,1]
    test_preds2 = model.predict_proba(test_X2)[:,1]
    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth : ", depth)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [14]:
def runADA(train_X, train_y, test_X, test_y=None, test_X2=None, depth=16, estimators=100, eta=0.1):
    model = AdaBoostClassifier(
                                DecisionTreeClassifier(max_depth=depth, max_features=0.7, min_samples_leaf=5),
                                n_estimators = estimators,
                                learning_rate = eta,
                                random_state=0)
    
    model.fit(train_X, train_y)
    train_preds = model.predict_proba(train_X)[:,1]
    test_preds = model.predict_proba(test_X)[:,1]
    test_preds2 = model.predict_proba(test_X2)[:,1]
    test_loss = 0
    if test_y is not None:
        train_loss = metrics.roc_auc_score(train_y, train_preds)
        test_loss = metrics.roc_auc_score(test_y, test_preds)
        print("Depth : ", depth)
        print("Train and Test loss : ", train_loss, test_loss)
    return test_preds, test_loss, test_preds2

In [15]:
results = pd.DataFrame(index=range(6), columns=['Model', 'AUC Score'])
k=0
col=[]
val_pred=pd.DataFrame()
test_pred=pd.DataFrame()
for model_name in ['XGB', 'LGM', 'RF', 'ET', 'CatBoost', 'ADA']:
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X, train_y):
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        if model_name=='XGB':
            pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=10000, feature_names=dev_X.columns.tolist(), dep=10, eta=0.15)
        elif model_name=='LGM':
            pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=10000, dep=14, min_leaf=32, eta=0.1)
        elif model_name=='RF':
            pred_val, loss, pred_test = runRF(dev_X, dev_y, val_X, val_y, test_X, depth=30)
        elif model_name=='ET':
            pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, leaf=2, depth=24)
        elif model_name=='CatBoost':
            pred_val, loss, pred_test = runCatB(dev_X, dev_y, val_X, val_y, test_X, depth=8, eta=0.2)
        elif model_name=='ADA':
            pred_val, loss, pred_test = runADA(dev_X, dev_y, val_X, val_y, test_X, estimators=100, depth=16)
            
        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test
        cv_scores.append(loss)
        print(cv_scores)
    pred_test_full /= 5.
    col.append(model_name)
    val_pred=pd.concat([val_pred, pd.DataFrame(pred_val_full)], axis=1)
    test_pred=pd.concat([test_pred, pd.DataFrame(pred_test_full)], axis=1)
    val_pred.columns=col
    test_pred.columns=col
    print(metrics.roc_auc_score(train_y, pred_val_full))
    results['Model'][k]=model_name
    results['AUC Score'][k]=metrics.roc_auc_score(train_y, pred_val_full)
    k=k+1

[0]	train-auc:0.745955	test-auc:0.719408
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.839522	test-auc:0.762001
[40]	train-auc:0.870089	test-auc:0.774768
[60]	train-auc:0.884586	test-auc:0.778976
[80]	train-auc:0.896715	test-auc:0.781767
[100]	train-auc:0.906849	test-auc:0.783931
[120]	train-auc:0.918113	test-auc:0.787618
[140]	train-auc:0.925626	test-auc:0.788424
[160]	train-auc:0.93146	test-auc:0.789844
[180]	train-auc:0.938563	test-auc:0.792219
[200]	train-auc:0.943052	test-auc:0.793097
[220]	train-auc:0.947711	test-auc:0.794468
[240]	train-auc:0.95279	test-auc:0.796418
[260]	train-auc:0.956013	test-auc:0.797137
[280]	train-auc:0.959357	test-auc:0.798094
[300]	train-auc:0.963116	test-auc:0.799107
[320]	train-auc:0.966449	test-auc:0.80038
[340]	train-auc:0.968657	test-auc:0.80035
[360]	train-auc:0.970944	test-auc:0.800663
[380]	train-auc:0.973069	test-auc:0.801287
[400]	tr

[100]	train-auc:0.910347	test-auc:0.782594
[120]	train-auc:0.918946	test-auc:0.784926
[140]	train-auc:0.92688	test-auc:0.787017
[160]	train-auc:0.933654	test-auc:0.788915
[180]	train-auc:0.938531	test-auc:0.78982
[200]	train-auc:0.944022	test-auc:0.790901
[220]	train-auc:0.948647	test-auc:0.79202
[240]	train-auc:0.951821	test-auc:0.792216
[260]	train-auc:0.95523	test-auc:0.793157
[280]	train-auc:0.958961	test-auc:0.793722
[300]	train-auc:0.962931	test-auc:0.794747
[320]	train-auc:0.965585	test-auc:0.79515
[340]	train-auc:0.967656	test-auc:0.794984
[360]	train-auc:0.969216	test-auc:0.795247
[380]	train-auc:0.971429	test-auc:0.7958
[400]	train-auc:0.97296	test-auc:0.796413
[420]	train-auc:0.973871	test-auc:0.796366
[440]	train-auc:0.975692	test-auc:0.796637
[460]	train-auc:0.977336	test-auc:0.796962
[480]	train-auc:0.979096	test-auc:0.797764
[500]	train-auc:0.980669	test-auc:0.798114
[520]	train-auc:0.982163	test-auc:0.79865
[540]	train-auc:0.983284	test-auc:0.798635
[560]	train-auc:0.98

[60]	valid_0's auc: 0.73417
[80]	valid_0's auc: 0.738498
[100]	valid_0's auc: 0.74167
[120]	valid_0's auc: 0.744764
[140]	valid_0's auc: 0.746894
[160]	valid_0's auc: 0.749106
[180]	valid_0's auc: 0.750791
[200]	valid_0's auc: 0.752213
[220]	valid_0's auc: 0.753074
[240]	valid_0's auc: 0.754457
[260]	valid_0's auc: 0.75504
[280]	valid_0's auc: 0.756095
[300]	valid_0's auc: 0.756403
[320]	valid_0's auc: 0.757347
[340]	valid_0's auc: 0.757728
[360]	valid_0's auc: 0.758557
[380]	valid_0's auc: 0.759109
[400]	valid_0's auc: 0.759477
[420]	valid_0's auc: 0.760342
[440]	valid_0's auc: 0.760566
[460]	valid_0's auc: 0.760961
[480]	valid_0's auc: 0.761426
[500]	valid_0's auc: 0.762238
[520]	valid_0's auc: 0.762546
[540]	valid_0's auc: 0.762944
[560]	valid_0's auc: 0.763565
[580]	valid_0's auc: 0.763862
[600]	valid_0's auc: 0.764245
[620]	valid_0's auc: 0.764386
[640]	valid_0's auc: 0.764859
[660]	valid_0's auc: 0.765005
[680]	valid_0's auc: 0.765163
[700]	valid_0's auc: 0.765309
[720]	valid_0's

[1100]	valid_0's auc: 0.780609
[1120]	valid_0's auc: 0.780627
[1140]	valid_0's auc: 0.781129
[1160]	valid_0's auc: 0.781244
[1180]	valid_0's auc: 0.781601
[1200]	valid_0's auc: 0.781792
[1220]	valid_0's auc: 0.782046
[1240]	valid_0's auc: 0.78217
[1260]	valid_0's auc: 0.78236
[1280]	valid_0's auc: 0.782219
[1300]	valid_0's auc: 0.782143
[1320]	valid_0's auc: 0.782299
[1340]	valid_0's auc: 0.782772
[1360]	valid_0's auc: 0.783044
[1380]	valid_0's auc: 0.783339
[1400]	valid_0's auc: 0.78358
[1420]	valid_0's auc: 0.783639
[1440]	valid_0's auc: 0.783828
[1460]	valid_0's auc: 0.784152
[1480]	valid_0's auc: 0.784479
[1500]	valid_0's auc: 0.784984
[1520]	valid_0's auc: 0.785263
[1540]	valid_0's auc: 0.78537
[1560]	valid_0's auc: 0.785196
[1580]	valid_0's auc: 0.785284
[1600]	valid_0's auc: 0.785432
[1620]	valid_0's auc: 0.785431
[1640]	valid_0's auc: 0.785521
[1660]	valid_0's auc: 0.785819
[1680]	valid_0's auc: 0.785787
[1700]	valid_0's auc: 0.785731
[1720]	valid_0's auc: 0.786123
[1740]	valid

In [16]:
results

Unnamed: 0,Model,AUC Score
0,XGB,0.798488
1,LGM,0.788143
2,RF,0.788405
3,ET,0.768185
4,CatBoost,0.780328
5,ADA,0.7763


In [17]:
val_pred.head()

Unnamed: 0,XGB,LGM,RF,ET,CatBoost,ADA
0,0.221323,0.380634,0.292143,0.358111,0.324906,0.106499
1,0.627795,0.491517,0.511588,0.489497,0.544568,0.628839
2,0.987618,0.957244,0.941529,0.979798,0.994488,0.999948
3,0.503662,0.236454,0.462976,0.709427,0.618077,0.046002
4,0.764569,0.772268,0.820976,0.754611,0.729958,0.868965


In [18]:
test_pred.head()

Unnamed: 0,XGB,LGM,RF,ET,CatBoost,ADA
0,0.56877,0.763693,0.536592,0.570319,0.720311,0.529218
1,0.993493,0.981534,0.980025,0.961517,0.983941,0.999343
2,0.548462,0.597565,0.490836,0.470659,0.489991,0.427641
3,0.09761,0.12506,0.122489,0.32889,0.187254,0.020923
4,0.55262,0.40746,0.700347,0.543183,0.479832,0.44807


In [19]:
# Concating predictions with training and test sets respectively

train_X_with_preds = pd.concat([train_X, val_pred], axis=1)
test_X_with_preds = pd.concat([test_X, test_pred], axis=1)

In [20]:
# Stacking only the predictions with LGM

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(val_pred.shape[0])
for dev_index, val_index in kf.split(val_pred, train_y):
    dev_X, val_X = val_pred.iloc[dev_index,:], val_pred.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_pred, rounds=10000, dep=4)
    pred_val_full[val_index] = pred_val
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    print(cv_scores)
pred_test_full /= 5.
print(metrics.roc_auc_score(train_y, pred_val_full))
test_stacked1 = pred_test_full
val_stacked1 = pred_val_full

Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.811057
[40]	valid_0's auc: 0.810941
[60]	valid_0's auc: 0.810986
[80]	valid_0's auc: 0.811
[100]	valid_0's auc: 0.810822
[120]	valid_0's auc: 0.81061
Early stopping, best iteration is:
[22]	valid_0's auc: 0.811195
0.8111952894853958
[0.8111952894853958]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.804937
[40]	valid_0's auc: 0.80506
[60]	valid_0's auc: 0.805062
[80]	valid_0's auc: 0.805003
[100]	valid_0's auc: 0.804837
[120]	valid_0's auc: 0.804771
[140]	valid_0's auc: 0.804498
Early stopping, best iteration is:
[43]	valid_0's auc: 0.805112
0.8051122431099232
[0.8111952894853958, 0.8051122431099232]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.797659
[40]	valid_0's auc: 0.797244
[60]	valid_0's auc: 0.797404
[80]	valid_0's auc: 0.797292
[100]	valid_0's auc: 0.797219
[120]	valid_0's auc: 0.796969
Early stopping, best iteration

In [21]:
# Stacking predictions and features with LGM

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_X_with_preds.shape[0])
for dev_index, val_index in kf.split(train_X_with_preds, train_y):
    dev_X, val_X = train_X_with_preds.iloc[dev_index,:], train_X_with_preds.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X_with_preds, depth=10)
    pred_val_full[val_index] = pred_val
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    print(cv_scores)
pred_test_full /= 5.
print(metrics.roc_auc_score(train_y, pred_val_full))
test_stacked2 = pred_test_full
val_stacked2 = pred_val_full

Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8447214219107289 0.8134362011237855
[0.8134362011237855]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8456165814715176 0.8081303188082127
[0.8134362011237855, 0.8081303188082127]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.847112450862632 0.8006298707406747
[0.8134362011237855, 0.8081303188082127, 0.8006298707406747]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8446040403995172 0.8119122660910592
[0.8134362011237855, 0.8081303188082127, 0.8006298707406747, 0.8119122660910592]
Depth, leaf, feat :  10 2 0.7
Train and Test loss :  0.8462797635066603 0.806176596408171
[0.8134362011237855, 0.8081303188082127, 0.8006298707406747, 0.8119122660910592, 0.806176596408171]
0.8078576416711599


In [22]:
def mean_encoder(train, test, target=None, cols=None, option=1):
    if target is None:
        target = train.columnsp[-1]
    if cols is None:
        cols = train.select_dtypes(include=['object']).columns[:-1].tolist()
    
    # Option 1: Mean of target value
    # Option 2: Weight of evidence
    # Option 3: Count of target variable
    # OPtion 4: Difference between number of positive and negative
    # Option 5: Expanding mean
    if option==1:
        for col in cols:
            mean = train.groupby(col)[target].mean()
            train[col+'_mean_encoded'] = train[col].map(mean)
            test[col+'_mean_encoded'] = test[col].map(mean)
            test[col+'_mean_encoded'].fillna(train[target].mean(), inplace=True)
    elif option==2:
        for col in cols:
            mean = np.log(train.groupby(col)[target].sum()/(train.groupby(col)[target].count()-train.groupby(col)[target].sum()))
            train[col+'_mean_encoded'] = train[col].map(mean)
            test[col+'_mean_encoded'] = test[col].map(mean)
            test[col+'_mean_encoded'].fillna(train[target].mean(), inplace=True)
    elif option==3:
        for col in cols:
            count = train.groupby(col)[target].sum()
            train[col+'_mean_encoded'] = train[col].map(count)
            test[col+'_mean_encoded'] = test[col].map(count)
            test[col+'_mean_encoded'].fillna(train[target].mean(), inplace=True)
    elif option==4:
        for col in cols:
            diff = 2*train.groupby(col)[target].sum() - train.groupby(col)[target].count()
            train[col+'_mean_encoded'] = train[col].map(diff)
            test[col+'_mean_encoded'] = test[col].map(diff)
            test[col+'_mean_encoded'].fillna(train[target].mean(), inplace=True)
    elif option==5:
        for col in cols:
            cumsum=train.groupby(col)[target].cumsum()-train[target]
            cumcnt=train.groupby(col).cumcount()
            train[col+'_mean_encoded'] = cumsum/cumcnt
            test[col+'_mean_encoded'] = test[col].map(train.groupby(col)[target].mean())
            test[col+'_mean_encoded'].fillna(train[target].mean(), inplace=True)
    else:
        for col in cols:    
            mean = train.groupby(col)[target].mean()
            count = train.groupby(col)[target].count()
            global_mean = train[target].mean()
            alpha=5
            regularized_mean = (mean*count + global_mean*alpha)/(count+alpha)

            train[col+'_mean_encoded'] = train[col].map(regularized_mean)
            test[col+'_mean_encoded'] = test[col].map(regularized_mean)
            test[col+'_mean_encoded'].fillna(global_mean, inplace=True)

    return train, test

In [23]:
cat_feature = ['program_id', 'program_type', 'program_duration', 'test_id', 'test_type', 'difficulty_level',
               'trainee_id', 'gender', 'education', 'city_tier',  'total_programs_enrolled', 'is_handicapped']

In [24]:
results = pd.DataFrame(index=range(2), columns=['Model', 'AUC Score'])
k=0
col=[]
val_pred=pd.DataFrame()
test_pred=pd.DataFrame()
for model_name in ['XGB', 'LGM']:
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X, train_y):
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        train = pd.concat([dev_X, dev_y], axis=1)
        train, val_X = mean_encoder(train, val_X, target='is_pass', cols=cat_feature, option=5)
        train, test_X1 = mean_encoder(train, test_X, target='is_pass', cols=cat_feature, option=5)
        test_X1 = test_X1.drop(cat_feature, axis=1)
        dev_X = train.drop(cat_feature, axis=1)
        dev_X = dev_X.drop(['is_pass'], axis=1)
        dev_y = train['is_pass']
        val_X = val_X.drop(cat_feature, axis=1)

        if model_name=='XGB':
            pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, feature_names=dev_X.columns.tolist(), dep=8, eta=0.1)
        elif model_name=='LGM':
            pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, dep=14, eta=0.01)
            
        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test
        cv_scores.append(loss)
        print(cv_scores)
    pred_test_full /= 5.
    col.append(model_name)
    val_pred=pd.concat([val_pred, pd.DataFrame(pred_val_full)], axis=1)
    test_pred=pd.concat([test_pred, pd.DataFrame(pred_test_full)], axis=1)
    val_pred.columns=col
    test_pred.columns=col
    print(metrics.roc_auc_score(train_y, pred_val_full))
    results['Model'][k]=model_name
    results['AUC Score'][k]=metrics.roc_auc_score(train_y, pred_val_full)
    k=k+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


[0]	train-auc:0.742617	test-auc:0.726673
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.826443	test-auc:0.815007
[40]	train-auc:0.850685	test-auc:0.816915
[60]	train-auc:0.866917	test-auc:0.81629
[80]	train-auc:0.877015	test-auc:0.816571
[100]	train-auc:0.884534	test-auc:0.816655
[120]	train-auc:0.891368	test-auc:0.815861
[140]	train-auc:0.897489	test-auc:0.815611
Stopping. Best iteration:
[47]	train-auc:0.856164	test-auc:0.817092

[0.8170922117630782]
[0]	train-auc:0.741411	test-auc:0.7212
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.827196	test-auc:0.809796
[40]	train-auc:0.84867	test-auc:0.809348
[60]	train-auc:0.863176	test-auc:0.808274
[80]	train-auc:0.87458	test-auc:0.807724
[100]	train-auc:0.881272	test-auc:0.806255
[120]	train-auc:0.889116	test-auc:0.80592

[160]	valid_0's auc: 0.814835
[180]	valid_0's auc: 0.815685
[200]	valid_0's auc: 0.816242
[220]	valid_0's auc: 0.816799
[240]	valid_0's auc: 0.817238
[260]	valid_0's auc: 0.817669
[280]	valid_0's auc: 0.817989
[300]	valid_0's auc: 0.818305
[320]	valid_0's auc: 0.818738
[340]	valid_0's auc: 0.819075
[360]	valid_0's auc: 0.819343
[380]	valid_0's auc: 0.819688
[400]	valid_0's auc: 0.819898
[420]	valid_0's auc: 0.820108
[440]	valid_0's auc: 0.820324
[460]	valid_0's auc: 0.820476
[480]	valid_0's auc: 0.820684
[500]	valid_0's auc: 0.820841
[520]	valid_0's auc: 0.820946
[540]	valid_0's auc: 0.821074
[560]	valid_0's auc: 0.821195
[580]	valid_0's auc: 0.82129
[600]	valid_0's auc: 0.821365
[620]	valid_0's auc: 0.821395
[640]	valid_0's auc: 0.821409
[660]	valid_0's auc: 0.821402
[680]	valid_0's auc: 0.821412
[700]	valid_0's auc: 0.821441
[720]	valid_0's auc: 0.821456
[740]	valid_0's auc: 0.821513
[760]	valid_0's auc: 0.821534
[780]	valid_0's auc: 0.821516
[800]	valid_0's auc: 0.821469
[820]	valid

In [25]:
results

Unnamed: 0,Model,AUC Score
0,XGB,0.810573
1,LGM,0.814063


In [26]:
val_pred.head()

Unnamed: 0,XGB,LGM
0,0.190488,0.263792
1,0.582726,0.659094
2,0.911731,0.905701
3,0.355417,0.332484
4,0.718222,0.668355


In [27]:
# combining stacked predictions with mean encoded predictions

val_combined = pd.concat([val_pred, pd.DataFrame(val_stacked1)], axis=1)
val_combined = pd.concat([val_combined, pd.DataFrame(val_stacked2)], axis=1)
test_combined = pd.concat([test_pred, pd.DataFrame(test_stacked1)], axis=1)
test_combined = pd.concat([test_combined, pd.DataFrame(test_stacked2)], axis=1)

In [28]:
val_combined.columns = ['XGB_mean_encoded', 'LGM_mean_encoded', 'Stacked1', 'Stacked2']
test_combined.columns = ['XGB_mean_encoded', 'LGM_mean_encoded', 'Stacked1', 'Stacked2']

In [29]:
train_X_with_stacked_preds = pd.concat([train_X, val_combined], axis=1)
test_X_with_stacked_preds = pd.concat([test_X[['program_id', 'program_type', 'program_duration', 'test_id',
       'test_type', 'difficulty_level', 'trainee_id', 'gender', 'education',
       'city_tier', 'age', 'total_programs_enrolled', 'is_handicapped',
       'trainee_engagement_rating', 'age_NA', 'rating_NA']], test_combined], axis=1)

In [30]:
# Stacking predictions and features with LGM

kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_scores = []
pred_test_full = 0
pred_val_full = np.zeros(train_X_with_stacked_preds.shape[0])
for dev_index, val_index in kf.split(train_X_with_stacked_preds, train_y):
    dev_X, val_X = train_X_with_stacked_preds.iloc[dev_index,:], train_X_with_stacked_preds.iloc[val_index,:]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    
    pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X_with_stacked_preds, dep=2)
    pred_val_full[val_index] = pred_val
    pred_test_full = pred_test_full + pred_test
    cv_scores.append(loss)
    print(cv_scores)
pred_test_full /= 5.
print(metrics.roc_auc_score(train_y, pred_val_full))

Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.827136
[40]	valid_0's auc: 0.827572
[60]	valid_0's auc: 0.827864
[80]	valid_0's auc: 0.827844
[100]	valid_0's auc: 0.827729
[120]	valid_0's auc: 0.827726
[140]	valid_0's auc: 0.827543
[160]	valid_0's auc: 0.827538
Early stopping, best iteration is:
[63]	valid_0's auc: 0.827894
0.8278939164992329
[0.8278939164992329]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.819943
[40]	valid_0's auc: 0.820651
[60]	valid_0's auc: 0.820676
[80]	valid_0's auc: 0.820786
[100]	valid_0's auc: 0.82071
[120]	valid_0's auc: 0.820598
[140]	valid_0's auc: 0.820471
[160]	valid_0's auc: 0.820411
[180]	valid_0's auc: 0.820332
Early stopping, best iteration is:
[87]	valid_0's auc: 0.820794
0.8207937277370169
[0.8278939164992329, 0.8207937277370169]
Training until validation scores don't improve for 100 rounds.
[20]	valid_0's auc: 0.813853
[40]	valid_0's auc: 0.81469
[60]	valid_0's auc: 0.814

In [31]:
result=pd.DataFrame({'id':test_id, 'is_pass':pred_test_full})

In [32]:
result.to_csv('result_final_2.csv', index=False)

In [33]:
print(time.time()-start_time)

1192.2160460948944


In [34]:
## Some feature engineering

# Extracting integer in 'program_id'

train_copy['program_number'] = train_copy['program_id'].str[2]
test_copy['program_number'] = test_copy['program_id'].str[2]

# It can seen that programs S & U have 2 types, X & Z have 3 types and T, V & Y have 4 types.

dict = {'S':2, 'T':4, 'U':2, 'V':4, 'X':3, 'Y':4, 'Z':3}
train_copy['number_of_program_types'] = train_copy.program_type.map(dict)
test_copy['number_of_program_types'] = test_copy.program_type.map(dict)

train_copy['program_id_difficulty'] = train_copy.program_id+'_'+train_copy.difficulty_level
test_copy['program_id_difficulty'] = test_copy.program_id+'_'+test_copy.difficulty_level

train_copy['education_difficulty'] = train_copy.education+'_'+train_copy.difficulty_level
test_copy['education_difficulty'] = test_copy.education+'_'+test_copy.difficulty_level

train_copy['program_type_difficulty'] = train_copy.program_type+'_'+train_copy.difficulty_level
test_copy['program_type_difficulty'] = test_copy.program_type+'_'+test_copy.difficulty_level

dict = {
        'S':'easy_intermediate_hard',
        'T':'easy_intermediate_hard',
        'U':'easy_intermediate_veryhard',
        'V':'easy_intermediate_hard_veryhard',
        'X':'intermidiate_hard',
        'Y':'easy_intermediate_hard',
        'Z':'easy'
        }
train_copy['program_difficulty_levels'] = train_copy.program_type.map(dict)
test_copy['program_difficulty_levels'] = test_copy.program_type.map(dict)

In [35]:
train_X=train_copy.drop(['id', 'is_pass'], 1)
test_X=test_copy.drop(['id'], 1)
train_y=train_copy['is_pass']

train_X['age_NA'] = train_X.age.isnull()
test_X['age_NA'] = test_X.age.isnull()


train_X['rating_NA'] = train_X.trainee_engagement_rating.isnull()
test_X['rating_NA'] = test_X.trainee_engagement_rating.isnull()

train_X['trainee_engagement_rating'].fillna(-1, inplace=True)
test_X['trainee_engagement_rating'].fillna(-1, inplace=True)

train_X = replace_age(train_X)
test_X = replace_age(test_X)

train_X = encoder(train_X)
test_X = encoder(test_X)

In [36]:
cat_feature = ['program_id', 'program_type', 'program_duration', 'test_id', 'test_type', 'difficulty_level', 
               'trainee_id', 'gender', 'education', 'city_tier',  'total_programs_enrolled', 'is_handicapped', 
               'program_number','number_of_program_types', 'program_id_difficulty','education_difficulty', 
               'program_type_difficulty','program_difficulty_levels']

In [37]:
results = pd.DataFrame(index=range(2), columns=['Model', 'AUC Score'])
k=0
col=[]
val_pred=pd.DataFrame()
test_pred=pd.DataFrame()
for model_name in ['XGB', 'LGM']:#, 'LGM', 'RF', 'ET', 'CatBoost', 'ADA']:
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    cv_scores = []
    pred_test_full = 0
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X, train_y):
        dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        train = pd.concat([dev_X, dev_y], axis=1)
        train, val_X = mean_encoder(train, val_X, target='is_pass', cols=cat_feature, option=5)
        train, test_X1 = mean_encoder(train, test_X, target='is_pass', cols=cat_feature, option=5)
        test_X1 = test_X1.drop(cat_feature, axis=1)
        dev_X = train.drop(cat_feature, axis=1)
        dev_X = dev_X.drop(['is_pass'], axis=1)
        dev_y = train['is_pass']
        val_X = val_X.drop(cat_feature, axis=1)

        if model_name=='XGB':
            pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, feature_names=dev_X.columns.tolist(), dep=8, eta=0.1)
        elif model_name=='LGM':
            pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X1, rounds=10000, dep=14, eta=0.01)
            
        pred_val_full[val_index] = pred_val
        pred_test_full = pred_test_full + pred_test
        cv_scores.append(loss)
        print(cv_scores)
    pred_test_full /= 5.
    col.append(model_name)
    val_pred=pd.concat([val_pred, pd.DataFrame(pred_val_full)], axis=1)
    test_pred=pd.concat([test_pred, pd.DataFrame(pred_test_full)], axis=1)
    val_pred.columns=col
    test_pred.columns=col
    print(metrics.roc_auc_score(train_y, pred_val_full))
    results['Model'][k]=model_name
    results['AUC Score'][k]=metrics.roc_auc_score(train_y, pred_val_full)
    k=k+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


[0]	train-auc:0.710231	test-auc:0.687603
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.831167	test-auc:0.814873
[40]	train-auc:0.85182	test-auc:0.816573
[60]	train-auc:0.867298	test-auc:0.816501
[80]	train-auc:0.877264	test-auc:0.815509
[100]	train-auc:0.885128	test-auc:0.814998
[120]	train-auc:0.891999	test-auc:0.814981
[140]	train-auc:0.90135	test-auc:0.814398
Stopping. Best iteration:
[58]	train-auc:0.865988	test-auc:0.816887

[0.8168872502654883]
[0]	train-auc:0.711941	test-auc:0.689648
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 100 rounds.
[20]	train-auc:0.832199	test-auc:0.808281
[40]	train-auc:0.854308	test-auc:0.810032
[60]	train-auc:0.871421	test-auc:0.809005
[80]	train-auc:0.882263	test-auc:0.808563
[100]	train-auc:0.890442	test-auc:0.806903
[120]	train-auc:0.896422	test-auc:0.80

KeyboardInterrupt: 

In [None]:
results

In [None]:
result=pd.DataFrame({'id':test_id, 'is_pass':test_pred.LGM})
#result.to_csv('result1.csv', index=False)

Submission - result_final.csv