In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit
import operator
from scipy.sparse import csr_matrix

In [2]:
train = pd.read_csv('trainExt.csv')
test = pd.read_csv('testExt.csv')

In [3]:
loader = np.load('Xtrainleak.npz') 
Xtrain = csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape = loader['shape'])
loader = np.load('Xtestleak.npz')
Xtest = csr_matrix(( loader['data'], loader['indices'], loader['indptr']), shape = loader['shape'])

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74645 entries, 0 to 74644
Columns: 328 entries, device_id to model_????metal
dtypes: float64(324), int64(2), object(2)
memory usage: 186.8+ MB


In [5]:
le = LabelEncoder()
le.fit(['F23-', 'F24-26','F27-28','F29-32', 'F33-42', 'F43+', 'M22-', 'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+'])
labels = le.transform(train.group)
train.drop(['device_id', 'gender', 'age', 'group'], axis=1, inplace=True)
test.drop(['device_id'], axis=1, inplace=True)

In [6]:
# create feature map
outfile = open('xgb.fmap', 'w')
for i, feat in enumerate(train.columns):
    outfile.write('{0}\t{1}\tq\n'.format(i, feat))
outfile.close()

In [36]:
params = {
    'objective' : 'multi:softprob',
    'booster' : 'gbtree',
    'eval_metric' : 'mlogloss',
    'num_class' : 12,
    'eta' : 0.1,
    'gamma' : 0,
    'lambda' : 1,
    'alpha' : 0,
    'max_depth' : 6,
    'min_child_weight' : 1,
    'max_delta_step' : 0,
    'subsample' : .8,
    'colsample_bytree' : .8,
    'silent' : 1,
    'nthread' : 4
}
numtrees = 1500

In [37]:
split = StratifiedShuffleSplit(labels, n_iter=1, test_size=0.1, random_state=1)

best_rounds = []
for x, y in split:
    Dtrain = xgb.DMatrix(Xtrain[x], labels[x])
    Dvalid = xgb.DMatrix(Xtrain[y], labels[y])
    
    watchlist = [(Dtrain, 'train'), (Dvalid, 'valid')]
    
    gbm = xgb.train(params, Dtrain, num_boost_round=numtrees, evals=watchlist, early_stopping_rounds=50)
    
    #importance = gbm.get_fscore(fmap='xgb.fmap')
    #importance = sorted(importance.items(), key=operator.itemgetter(1))
    #print(importance)
    best_rounds.append(gbm.best_iteration)

[0]	train-mlogloss:2.46329	valid-mlogloss:2.46785
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:2.44531	valid-mlogloss:2.45357
[2]	train-mlogloss:2.42868	valid-mlogloss:2.44119
[3]	train-mlogloss:2.41366	valid-mlogloss:2.42965
[4]	train-mlogloss:2.40056	valid-mlogloss:2.41943
[5]	train-mlogloss:2.38863	valid-mlogloss:2.41068
[6]	train-mlogloss:2.37686	valid-mlogloss:2.40178
[7]	train-mlogloss:2.36576	valid-mlogloss:2.39405
[8]	train-mlogloss:2.35563	valid-mlogloss:2.38699
[9]	train-mlogloss:2.34623	valid-mlogloss:2.37976
[10]	train-mlogloss:2.33767	valid-mlogloss:2.37353
[11]	train-mlogloss:2.32942	valid-mlogloss:2.36816
[12]	train-mlogloss:2.32153	valid-mlogloss:2.36272
[13]	train-mlogloss:2.31424	valid-mlogloss:2.35826
[14]	train-mlogloss:2.3072	valid-mlogloss:2.35394
[15]	train-mlogloss:2.30036	valid-mlogloss:2.34998
[16]	train-mlogloss:2.29386	valid-mlogloss

In [38]:
print(best_rounds)

[876]


In [39]:
# make prediction

num_rounds = np.round(np.mean(best_rounds)).astype(int)

Dtrain = xgb.DMatrix(Xtrain, labels)
Dtest = xgb.DMatrix(Xtest)

gbm = xgb.train(params, Dtrain, num_boost_round=num_rounds, evals=watchlist)

[0]	train-mlogloss:2.4638	valid-mlogloss:2.46342
[1]	train-mlogloss:2.44546	valid-mlogloss:2.4453
[2]	train-mlogloss:2.42895	valid-mlogloss:2.42929
[3]	train-mlogloss:2.41414	valid-mlogloss:2.41419
[4]	train-mlogloss:2.40087	valid-mlogloss:2.4014
[5]	train-mlogloss:2.38819	valid-mlogloss:2.3887
[6]	train-mlogloss:2.37718	valid-mlogloss:2.37786
[7]	train-mlogloss:2.36665	valid-mlogloss:2.36724
[8]	train-mlogloss:2.35689	valid-mlogloss:2.35732
[9]	train-mlogloss:2.3475	valid-mlogloss:2.34786
[10]	train-mlogloss:2.33873	valid-mlogloss:2.33909
[11]	train-mlogloss:2.33085	valid-mlogloss:2.33091
[12]	train-mlogloss:2.32351	valid-mlogloss:2.32349
[13]	train-mlogloss:2.31622	valid-mlogloss:2.31603
[14]	train-mlogloss:2.30958	valid-mlogloss:2.30955
[15]	train-mlogloss:2.3034	valid-mlogloss:2.30346
[16]	train-mlogloss:2.29716	valid-mlogloss:2.29693
[17]	train-mlogloss:2.29112	valid-mlogloss:2.2909
[18]	train-mlogloss:2.28564	valid-mlogloss:2.28543
[19]	train-mlogloss:2.28061	valid-mlogloss:2.280

In [40]:
xgb_pred = gbm.predict(Dtest)

In [41]:
sample = pd.read_csv('sample_submission.csv')
sample[['F23-', 'F24-26','F27-28','F29-32', 'F33-42', 'F43+', 'M22-', 'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+']] = xgb_pred
sample.to_csv('xgb6.csv', index=False)

In [13]:
sample.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.062702,0.060126,0.035383,0.074468,0.101711,0.06662,0.072807,0.103058,0.065228,0.098629,0.127127,0.132143
1,-1547860181818787117,0.080224,0.054045,0.03852,0.114592,0.055437,0.056219,0.0847,0.149174,0.052291,0.090287,0.116946,0.107564
2,7374582448058474277,0.070762,0.05308,0.041406,0.061269,0.088696,0.065126,0.090786,0.101418,0.055828,0.097369,0.129716,0.144545
3,-6220210354783429585,0.04826,0.046249,0.030653,0.063388,0.080631,0.066353,0.088449,0.153579,0.067187,0.107634,0.13324,0.114378
4,-5893464122623104785,0.048063,0.06542,0.042701,0.065155,0.054629,0.045401,0.084067,0.153677,0.105754,0.09883,0.12958,0.106723
