In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import os
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse as sp
from mylib.scoring import ndcg_at_k, mean_ndcg
from mylib.preprocess import make_sessions_features, make_user_features
import xgboost as xgb

In [2]:
# Load the raw user tables and the session log from the local data/ directory.
# parse_dates=[1,2,3] asks pandas to parse the columns at positions 1-3 as
# datetimes for both user files.
test = pd.read_csv(os.path.join('data', 'test_users.csv'), header=0, parse_dates=[1,2,3])
train = pd.read_csv(os.path.join('data', 'train_users_2.csv'), header=0, parse_dates=[1,2,3])
df_sessions = pd.read_csv("data/sessions.csv", encoding='utf8')
# `ndcg_at_k` is already imported in the notebook's import cell, so the
# duplicate mid-notebook import that used to live here has been dropped.

In [3]:
# Build the user-level feature tables for the train and test users with the
# project helper. `y` is the label vector used later for the stratified split
# and as the xgboost training target.
# NOTE(review): `le` is presumably the fitted LabelEncoder for the target --
# confirm against mylib.preprocess.
(user_features_train,
 user_features_test,
 y,
 le) = make_user_features(train, test)

In [4]:
# Combine the training users' features with the raw session log via the
# project helper; the result is the feature table that is split and fed to
# the model further down.
session_features_train = make_sessions_features(
    user_features_train,
    df_sessions,
)

In [5]:
# Same feature construction as for the training users, applied to the test
# users' feature table and the same session log.
session_features_test = make_sessions_features(
    user_features_test,
    df_sessions,
)

Running Model

In [6]:
# Hold out 33% of the labelled training users for local evaluation.
# The split is stratified on `y` so each class keeps its share in both parts,
# and the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    session_features_train,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [7]:
# Show the distinct class ids in the hold-out labels together with how many
# samples each one has (the cell displays the (values, counts) tuple).
np.unique(y_test, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64),
 array([  178,   471,   350,   742,  1658,   767,   936, 41099,   251,
           72, 20584,  3331], dtype=int64))

In [8]:
params = {
    'objective': 'multi:softprob',
    'num_class': 12,
    'bst:eta' :  0.3,
    'bst:max_depth': 4,
    'eval_metric': 'mlogloss',
    'silent': 0,
    'nthread': 2
}

values, counts = np.unique(y, return_counts=True)
freqs = counts/float(counts.sum())
weights = 1/freqs

for l in [1.0]:
    params['lambda'] = l
    print "Lambda", l
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    dtrain = xgb.DMatrix(X_train.values, label=y_train, weight=weights.take(y_train))
    gbm = xgb.train(params, dtrain, num_boost_round=500, 
                    verbose_eval=True, evals=[(dtest, 'test'), (dtrain, 'train')],
                   early_stopping_rounds=5)
    gbm.predict_proba = gbm.predict
    print mean_ndcg(gbm, xgb.DMatrix(X_test.values), y_test)
    prediction = gbm.predict(xgb.DMatrix(X_test.values)).argmax(axis=1)
    print classification_report(prediction, y_test)

Will train until train error hasn't decreased in 5 rounds.
[0]	test-mlogloss:2.411685	train-mlogloss:2.444891
[1]	test-mlogloss:2.366699	train-mlogloss:2.415701
[2]	test-mlogloss:2.337090	train-mlogloss:2.391770
[3]	test-mlogloss:2.314797	train-mlogloss:2.370515
[4]	test-mlogloss:2.295016	train-mlogloss:2.350060
[5]	test-mlogloss:2.280881	train-mlogloss:2.334300
[6]	test-mlogloss:2.265913	train-mlogloss:2.319496
[7]	test-mlogloss:2.254261	train-mlogloss:2.305061
[8]	test-mlogloss:2.246562	train-mlogloss:2.291126
[9]	test-mlogloss:2.238514	train-mlogloss:2.279339
[10]	test-mlogloss:2.232111	train-mlogloss:2.267637
[11]	test-mlogloss:2.222831	train-mlogloss:2.253698
[12]	test-mlogloss:2.217058	train-mlogloss:2.242778
[13]	test-mlogloss:2.212477	train-mlogloss:2.230071
[14]	test-mlogloss:2.206125	train-mlogloss:2.220570
[15]	test-mlogloss:2.200396	train-mlogloss:2.210973
[16]	test-mlogloss:2.197464	train-mlogloss:2.204733
[17]	test-mlogloss:2.193844	train-mlogloss:2.191894
[18]	test-mlogl

Lambda 1.0
0.646386666602
             precision    recall  f1-score   support

          0       0.11      0.00      0.01      4334
          1       0.09      0.01      0.02      3358
          2       0.11      0.01      0.01      5174
          3       0.11      0.02      0.03      4039
          4       0.09      0.05      0.06      3168
          5       0.06      0.02      0.03      2656
          6       0.07      0.03      0.04      2485
          7       0.57      0.80      0.67     29310
          8       0.07      0.01      0.01      2815
          9       0.01      0.00      0.00      1437
         10       0.18      0.49      0.26      7606
         11       0.11      0.09      0.10      4057

avg / total       0.30      0.40      0.32     70439



[499]	test-mlogloss:1.878269	train-mlogloss:1.334505


In [10]:
print 'Test Set XgbBoost NDCG:', mean_ndcg(gbm, xgb.DMatrix(X_test.values), y_test)
print 'Test Set XgbBoost Classification Report:'
print classification_report(prediction, y_test)

Test Set XgbBoost NDCG: 0.646386666602
Test Set XgbBoost Classification Report:
             precision    recall  f1-score   support

          0       0.11      0.00      0.01      4334
          1       0.09      0.01      0.02      3358
          2       0.11      0.01      0.01      5174
          3       0.11      0.02      0.03      4039
          4       0.09      0.05      0.06      3168
          5       0.06      0.02      0.03      2656
          6       0.07      0.03      0.04      2485
          7       0.57      0.80      0.67     29310
          8       0.07      0.01      0.01      2815
          9       0.01      0.00      0.00      1437
         10       0.18      0.49      0.26      7606
         11       0.11      0.09      0.10      4057

avg / total       0.30      0.40      0.32     70439



Results with max_depth=3, eta=0.5, num_boost_round=300:
```
Lambda 1.0
0.916503864109
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.00      0.00      0.00        34
          2       0.00      0.00      0.00        45
          3       0.01      0.09      0.01        70
          4       0.01      0.08      0.01       133
          5       0.00      0.03      0.00        80
          6       0.01      0.10      0.02        83
          7       0.86      0.70      0.77     50306
          8       0.00      0.00      0.00        39
          9       0.00      0.00      0.00        13
         10       0.47      0.50      0.49     19225
         11       0.02      0.14      0.03       389

avg / total       0.74      0.64      0.68     70439
```