In [25]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.cross_validation import train_test_split
import time

In [26]:
# Directories: CSV inputs live in CSV_DIR, the pickled frames one level below it
CSV_DIR = 'Data'
PKL_DIR = CSV_DIR + '/pickle'

# Pickle file names; the leading '/' is kept because they are appended to
# PKL_DIR by plain string concatenation
train_f, test_f = ('/%s_merged2_without17304.pkl' % split for split in ('train', 'test'))

In [27]:
# Columns removed before encoding (identifiers, raw dates, outcome columns)
NON_FEATURE = [
    'activity_id', 'people_id', 'date', 'people_date', 'char_10',
    'outcome', 'outcome_leak', 'outcome_filled',
]

# Label-encoded categorical columns
CATEGORICAL_DATA = (
    ['people_char_1', 'people_char_2', 'people_group_1']
    + list('people_char_%d' % n for n in range(3, 10))
    + ['activity_category']
    + list('char_%d' % n for n in range(1, 10))
)

# Columns that are already binary indicators: people_char_10 .. people_char_37
# plus the two weekend flags
CATEGORICAL_BINARY = (
    list('people_char_%d' % n for n in range(10, 38))
    + ['weekend', 'people_weekend']
)

# Continuous columns
CONT = [
    'people_days', 'days',
    'people_month', 'month',
    'people_quarter', 'quarter',
    'people_week', 'week',
    'people_dayOfMonth', 'dayOfMonth',
    'people_year', 'year',
    'people_char_38',
    'activities_per_group_date',
    'adays_till_activity',
    'diff_date',
    'next_outcome',
    'outcome_filled_nona',
    'pdays_till_activity',
    'people_per_group',
    'people_per_group_date',
    'prev_outcome',
    'worked_for_day',
    'gp_all0', 'gp_all1', 'gp_mixed',
]

In [28]:
def category_to_one_hot(dataset, non_feature, continuous_feature):
    """One-hot encode the categorical columns of ``dataset`` with scikit-learn.

    ``pd.get_dummies`` would do the same and keep readable headers, but it
    uses far more memory, hence the OneHotEncoder route.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding feature and non-feature columns. It is not modified;
        ``drop`` returns a new frame.
    non_feature : list of str
        Column names removed before encoding.
    continuous_feature : list of str
        Column names that must NOT be one-hot encoded. Every remaining column
        that is not listed here is treated as categorical.

    Returns
    -------
    The result of ``OneHotEncoder.fit_transform`` on the remaining columns.

    NOTE(review): a fresh encoder is created and fitted on every call, so two
    datasets encoded by separate calls share a column layout only if the
    encoder derives the same categories from both -- verify for train/test.
    """
    ds = dataset.drop(non_feature, axis=1)
    # The encoder is given column positions, not column names.
    categorical_idx = [
        position
        for position, column in enumerate(ds.columns)
        if column not in continuous_feature
    ]
    print("Done filtering columns...")
    grd_enc = OneHotEncoder(categorical_features=categorical_idx)
    return grd_enc.fit_transform(ds)

In [29]:
# Read all
# Load the merged train and test frames.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# produced by this project.
train, test = (pd.read_pickle(PKL_DIR + name) for name in (train_f, test_f))

In [30]:
# Training labels; passed as `label` when the training DMatrix is built.
v_out=train['outcome']

In [7]:
# test = test.fillna(0)

In [31]:
# Function to one hot encode all values ~ 120 secs
# One-hot encode the training frame and time it (the recorded run took ~38 s).
start = time.time()
arr = category_to_one_hot(
    dataset=train,
    non_feature=NON_FEATURE,
    continuous_feature=CONT + CATEGORICAL_BINARY,
)
end = time.time()
print(end - start)

Done filtering columns...
38.5169229507


In [32]:
# One-hot encode the test frame with the same column selection and time it.
# NOTE(review): this call fits its own encoder, separate from the one used for
# the training frame -- confirm both matrices share a column layout.
start = time.time()
arr_b = category_to_one_hot(
    dataset=test,
    non_feature=NON_FEATURE,
    continuous_feature=CONT + CATEGORICAL_BINARY,
)
end = time.time()
print(end - start)

Done filtering columns...
6.10215497017


In [33]:
# Sanity check: train matrix, test matrix and label vector shapes, one per line.
print('\n'.join(str(obj.shape) for obj in (arr, arr_b, v_out)))

(1398166, 7664)
(333084, 7664)
(1398166,)


In [13]:
# Keep the (rows, columns) shapes of the encoded train and test matrices.
train_arr, test_arr = arr.shape, arr_b.shape

In [34]:
# Wrap the encoded matrices for xgboost: the training matrix carries the
# labels, the test matrix is unlabeled and used only for prediction.
dtrain = xgb.DMatrix(arr,label=v_out)
dtest = xgb.DMatrix(arr_b)

In [35]:
# XGBoost run: max_depth=18, subsample=0.6, colsample_bytree=0.7.
# (A commented-out variant of this cell used subsample=0.7 and added
# reg_alpha=0.001.)
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=18,
    eta=0.02,
    subsample=0.6,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)


[0]	train-auc:0.999856
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999864
[2]	train-auc:0.999931
[3]	train-auc:0.999946
[4]	train-auc:0.999949
[5]	train-auc:0.999956
[6]	train-auc:0.999959
[7]	train-auc:0.99996
[8]	train-auc:0.999961
[9]	train-auc:0.99996
[10]	train-auc:0.999962
[11]	train-auc:0.999961
[12]	train-auc:0.999962
[13]	train-auc:0.999962
[14]	train-auc:0.999962
[15]	train-auc:0.999962
[16]	train-auc:0.999963
[17]	train-auc:0.999963
[18]	train-auc:0.999963
[19]	train-auc:0.999964
[20]	train-auc:0.999965
[21]	train-auc:0.999966
[22]	train-auc:0.99996
[23]	train-auc:0.99996
[24]	train-auc:0.999961
[25]	train-auc:0.999961
[26]	train-auc:0.999961
[27]	train-auc:0.999961
[28]	train-auc:0.999962
[29]	train-auc:0.999963
[30]	train-auc:0.999964
[31]	train-auc:0.999964
Stopping. Best iteration:
[21]	train-auc:0.999966



In [36]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [38]:
# XGBoost run: max_depth=6, subsample=0.5, colsample_bytree=0.3.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=6,
    eta=0.02,
    subsample=0.5,
    colsample_bytree=0.3,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999838
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999307
[2]	train-auc:0.999714
[3]	train-auc:0.999585
[4]	train-auc:0.99976
[5]	train-auc:0.999746
[6]	train-auc:0.999706
[7]	train-auc:0.999804
[8]	train-auc:0.999837
[9]	train-auc:0.999902
[10]	train-auc:0.999879
[11]	train-auc:0.99987
[12]	train-auc:0.999881
[13]	train-auc:0.999885
[14]	train-auc:0.999878
[15]	train-auc:0.999898
[16]	train-auc:0.999892
[17]	train-auc:0.999899
[18]	train-auc:0.999904
[19]	train-auc:0.999906
[20]	train-auc:0.999904
[21]	train-auc:0.999892
[22]	train-auc:0.999883
[23]	train-auc:0.999884
[24]	train-auc:0.999889
[25]	train-auc:0.999876
[26]	train-auc:0.99988
[27]	train-auc:0.999883
[28]	train-auc:0.999887
[29]	train-auc:0.999877
Stopping. Best iteration:
[19]	train-auc:0.999906



In [39]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred2 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [42]:
# XGBoost run: max_depth=5, subsample=0.6, colsample_bytree=0.3.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=5,
    eta=0.02,
    subsample=0.6,
    colsample_bytree=0.3,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999838
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999418
[2]	train-auc:0.999745
[3]	train-auc:0.999615
[4]	train-auc:0.999774
[5]	train-auc:0.999747
[6]	train-auc:0.999705
[7]	train-auc:0.999795
[8]	train-auc:0.999827
[9]	train-auc:0.999899
[10]	train-auc:0.999872
[11]	train-auc:0.999863
[12]	train-auc:0.999875
[13]	train-auc:0.999879
[14]	train-auc:0.99987
[15]	train-auc:0.999893
[16]	train-auc:0.999887
[17]	train-auc:0.999895
[18]	train-auc:0.999899
[19]	train-auc:0.999903
[20]	train-auc:0.9999
[21]	train-auc:0.999889
[22]	train-auc:0.99988
[23]	train-auc:0.999881
[24]	train-auc:0.999886
[25]	train-auc:0.999873
[26]	train-auc:0.999877
[27]	train-auc:0.999879
[28]	train-auc:0.999883
[29]	train-auc:0.999872
Stopping. Best iteration:
[19]	train-auc:0.999903



In [43]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred3 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [48]:
# XGBoost run: max_depth=7, subsample=0.7, colsample_bytree=0.7.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=7,
    eta=0.02,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999853
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.99986
[2]	train-auc:0.999924
[3]	train-auc:0.999931
[4]	train-auc:0.999935
[5]	train-auc:0.999938
[6]	train-auc:0.999942
[7]	train-auc:0.999943
[8]	train-auc:0.999945
[9]	train-auc:0.999944
[10]	train-auc:0.999946
[11]	train-auc:0.999942
[12]	train-auc:0.999944
[13]	train-auc:0.999943
[14]	train-auc:0.999944
[15]	train-auc:0.999944
[16]	train-auc:0.999942
[17]	train-auc:0.999942
[18]	train-auc:0.999943
[19]	train-auc:0.999944
[20]	train-auc:0.999945
Stopping. Best iteration:
[10]	train-auc:0.999946



In [49]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred4 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [93]:
# XGBoost run: max_depth=12, subsample=0.4, colsample_bytree=0.5.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=12,
    eta=0.02,
    subsample=0.4,
    colsample_bytree=0.5,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.99984
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999913
[2]	train-auc:0.999903
[3]	train-auc:0.999856
[4]	train-auc:0.99991
[5]	train-auc:0.999846
[6]	train-auc:0.999869
[7]	train-auc:0.999888
[8]	train-auc:0.999898
[9]	train-auc:0.999932
[10]	train-auc:0.999912
[11]	train-auc:0.999912
[12]	train-auc:0.999918
[13]	train-auc:0.999919
[14]	train-auc:0.999921
[15]	train-auc:0.999922
[16]	train-auc:0.99993
[17]	train-auc:0.999932
[18]	train-auc:0.999934
[19]	train-auc:0.999936
[20]	train-auc:0.999936
[21]	train-auc:0.999937
[22]	train-auc:0.999934
[23]	train-auc:0.999934
[24]	train-auc:0.999936
[25]	train-auc:0.999928
[26]	train-auc:0.99993
[27]	train-auc:0.999933
[28]	train-auc:0.999936
[29]	train-auc:0.999929
[30]	train-auc:0.999931
[31]	train-auc:0.999933
Stopping. Best iteration:
[21]	train-auc:0.999937



In [94]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred5 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [56]:
# XGBoost run: max_depth=8, subsample=0.6, colsample_bytree=0.7.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=8,
    eta=0.02,
    subsample=0.6,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999853
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999861
[2]	train-auc:0.999925
[3]	train-auc:0.999933
[4]	train-auc:0.999937
[5]	train-auc:0.99994
[6]	train-auc:0.999944
[7]	train-auc:0.999946
[8]	train-auc:0.999947
[9]	train-auc:0.999946
[10]	train-auc:0.999948
[11]	train-auc:0.999945
[12]	train-auc:0.999946
[13]	train-auc:0.999946
[14]	train-auc:0.999947
[15]	train-auc:0.999946
[16]	train-auc:0.999945
[17]	train-auc:0.999944
[18]	train-auc:0.999946
[19]	train-auc:0.999947
[20]	train-auc:0.999948
Stopping. Best iteration:
[10]	train-auc:0.999948



In [57]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred6 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [61]:
# XGBoost run: max_depth=9, subsample=0.5, colsample_bytree=0.7.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=9,
    eta=0.02,
    subsample=0.5,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

# The original cell assigned these three names twice with the same values;
# a single assignment leaves the same state.
num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999853
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999861
[2]	train-auc:0.999926
[3]	train-auc:0.999935
[4]	train-auc:0.999938
[5]	train-auc:0.999941
[6]	train-auc:0.999944
[7]	train-auc:0.999946
[8]	train-auc:0.999947
[9]	train-auc:0.999947
[10]	train-auc:0.999949
[11]	train-auc:0.999945
[12]	train-auc:0.999947
[13]	train-auc:0.999946
[14]	train-auc:0.999948
[15]	train-auc:0.999947
[16]	train-auc:0.999947
[17]	train-auc:0.999946
[18]	train-auc:0.999947
[19]	train-auc:0.999948
[20]	train-auc:0.999949
Stopping. Best iteration:
[10]	train-auc:0.999949



In [62]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred7 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [66]:
# XGBoost run: max_depth=10, subsample=0.5, colsample_bytree=0.7.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=10,
    eta=0.02,
    subsample=0.5,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999854
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999861
[2]	train-auc:0.999927
[3]	train-auc:0.999936
[4]	train-auc:0.99994
[5]	train-auc:0.999942
[6]	train-auc:0.999946
[7]	train-auc:0.999947
[8]	train-auc:0.999949
[9]	train-auc:0.999948
[10]	train-auc:0.99995
[11]	train-auc:0.999947
[12]	train-auc:0.999949
[13]	train-auc:0.999948
[14]	train-auc:0.99995
[15]	train-auc:0.999949
[16]	train-auc:0.999948
[17]	train-auc:0.999948
[18]	train-auc:0.999949
[19]	train-auc:0.99995
[20]	train-auc:0.999951
[21]	train-auc:0.999952
[22]	train-auc:0.999948
[23]	train-auc:0.999948
[24]	train-auc:0.999949
[25]	train-auc:0.999948
[26]	train-auc:0.99995
[27]	train-auc:0.999949
[28]	train-auc:0.999951
[29]	train-auc:0.999951
[30]	train-auc:0.999952
[31]	train-auc:0.999953
[32]	train-auc:0.999954
[33]	train-auc:0.999951
[34]	train-auc:0.999952
[35]	train-auc:0.999944
[36]	train-auc:0.999946
[37]	train-auc:0.999947
[38]	train-auc:0.999947
[39]	train-auc:0.99994

In [67]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred8 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [71]:
# XGBoost run: max_depth=4, subsample=0.5, colsample_bytree=0.7.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=4,
    eta=0.02,
    subsample=0.5,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999844
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.99985
[2]	train-auc:0.99991
[3]	train-auc:0.999917
[4]	train-auc:0.999922
[5]	train-auc:0.99993
[6]	train-auc:0.999933
[7]	train-auc:0.999935
[8]	train-auc:0.999937
[9]	train-auc:0.999936
[10]	train-auc:0.999938
[11]	train-auc:0.999933
[12]	train-auc:0.999935
[13]	train-auc:0.999934
[14]	train-auc:0.999935
[15]	train-auc:0.999934
[16]	train-auc:0.999935
[17]	train-auc:0.999934
[18]	train-auc:0.999936
[19]	train-auc:0.999937
[20]	train-auc:0.999938
Stopping. Best iteration:
[10]	train-auc:0.999938



In [72]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred9 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [77]:
# XGBoost run: max_depth=3, subsample=0.5, colsample_bytree=0.7.
# NOTE(review): the watchlist holds only the training matrix, so early
# stopping is driven by train-auc, not by a held-out set.
param = dict(
    booster="gbtree",
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=3,
    eta=0.02,
    subsample=0.5,
    colsample_bytree=0.7,
    min_child_weight=2,
    nthread=4,
    silent=1,
)

num_round = 300
early_stopping_rounds = 10
watchlist = [(dtrain, 'train')]

bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

[0]	train-auc:0.999837
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.999838
[2]	train-auc:0.999905
[3]	train-auc:0.999904
[4]	train-auc:0.999907
[5]	train-auc:0.999923
[6]	train-auc:0.999926
[7]	train-auc:0.999927
[8]	train-auc:0.999927
[9]	train-auc:0.999925
[10]	train-auc:0.999926
[11]	train-auc:0.999919
[12]	train-auc:0.99992
[13]	train-auc:0.999918
[14]	train-auc:0.999919
[15]	train-auc:0.999918
[16]	train-auc:0.999931
[17]	train-auc:0.99993
[18]	train-auc:0.999932
[19]	train-auc:0.999932
[20]	train-auc:0.999933
[21]	train-auc:0.999934
[22]	train-auc:0.999932
[23]	train-auc:0.999931
[24]	train-auc:0.999931
[25]	train-auc:0.99993
[26]	train-auc:0.999931
[27]	train-auc:0.999929
[28]	train-auc:0.999931
[29]	train-auc:0.999931
[30]	train-auc:0.999932
[31]	train-auc:0.999933
Stopping. Best iteration:
[21]	train-auc:0.999934



In [78]:
# xgb.train returns the booster from the last round, not the best one, so cap
# prediction at the best iteration found by early stopping (0 = use all trees
# when the attribute is absent).
ypred10 = bst.predict(dtest, ntree_limit=getattr(bst, 'best_ntree_limit', 0))

In [125]:
# Blend the ten runs with an unweighted, element-wise average.
ypredf = np.asarray([
    ypred, ypred2, ypred3, ypred4, ypred5,
    ypred6, ypred7, ypred8, ypred9, ypred10,
]).mean(axis=0)

In [126]:
# Attach the blended predictions to the test frame.
test['outcome'] = ypredf

# Rows belonging to people_group_1 == 17304 come from a separate CSV and get a
# fixed outcome of 0.
# NOTE(review): the pickle names ("without17304") suggest this group was left
# out of the merged frames -- confirm.
jj_outcome = pd.read_csv(CSV_DIR + '/manipulated_results.csv')
# Select with .loc and take an explicit copy so the assignment below writes to
# an independent frame instead of a chained-indexing slice.
thegroup = jj_outcome.loc[
    jj_outcome['people_group_1'] == 17304, ['activity_id', 'outcome']
].copy()
thegroup['outcome'] = 0
test_n = test[['activity_id', 'outcome']]

In [127]:
# Build the combined frame from `test` under its own name: the original
# reassigned test_n from itself, so re-running the cell appended `thegroup`
# a second time.
submission = pd.concat([thegroup, test[['activity_id', 'outcome']]])
# Index by activity_id, drop the row labelled 'act_0', and write the file.
submission[['outcome','activity_id']].set_index('activity_id').drop('act_0').to_csv("FinalSubmission.csv")
# Keep the original name bound to the combined frame.
test_n = submission