In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import os
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse as sp
from mylib.scoring import ndcg_at_k, mean_ndcg
from mylib.preprocess import make_sessions_features, make_user_features
import xgboost as xgb

In [2]:
test = pd.read_csv(os.path.join('data', 'test_users.csv'), header=0, parse_dates=[1,2,3])
train = pd.read_csv(os.path.join('data', 'train_users_2.csv'), header=0, parse_dates=[1,2,3])
df_sessions = pd.read_csv("data/sessions.csv", encoding='utf8')
from mylib.scoring import ndcg_at_k

In [3]:
user_features_train, user_features_test, y, le = make_user_features(train, test)

In [4]:
session_features_train = make_sessions_features(user_features_train, df_sessions)

In [5]:
session_features_test = make_sessions_features(user_features_test, df_sessions)

Running Model

In [7]:
# X = final[:train.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(session_features_train, 
                                                    y, test_size=0.33, 
                                                    random_state=42, 
                                                    stratify=y)

In [8]:
np.unique(y_test, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([  178,   471,   350,   742,  1658,   767,   936, 41099,   251,
           72, 20584,  3331]))

In [25]:
xgb.DMatrix?

In [32]:
params = {
    'objective': 'multi:softprob',
    'num_class': 12,
    'bst:eta' :  0.3,
    'bst:max_depth': 4,
    'eval_metric': 'mlogloss',
    'silent': 0,
    'nthread': 2
}

values, counts = np.unique(y, return_counts=True)
freqs = counts/float(counts.sum())
weights = 1/freqs

for l in [1.0]:
    params['lambda'] = l
    print "Lambda", l
    dtest = xgb.DMatrix(X_test.values, label=y_test)
    dtrain = xgb.DMatrix(X_train.values, label=y_train, weight=weights.take(y_train))
    gbm = xgb.train(params, dtrain, num_boost_round=5000, 
                    verbose_eval=True, evals=[(dtest, 'test'), (dtrain, 'train')],
                   early_stopping_rounds=10)
    gbm.predict_proba = gbm.predict
    print mean_ndcg(gbm, xgb.DMatrix(X_test.values), y_test)
    prediction = gbm.predict(xgb.DMatrix(X_test.values)).argmax(axis=1)
    print classification_report(prediction, y_test)

Will train until train error hasn't decreased in 10 rounds.
[0]	test-mlogloss:2.411685	train-mlogloss:2.444891
[1]	test-mlogloss:2.366699	train-mlogloss:2.415701
[2]	test-mlogloss:2.337090	train-mlogloss:2.391770
[3]	test-mlogloss:2.314797	train-mlogloss:2.370515
[4]	test-mlogloss:2.295016	train-mlogloss:2.350060
[5]	test-mlogloss:2.280881	train-mlogloss:2.334300
[6]	test-mlogloss:2.265913	train-mlogloss:2.319496
[7]	test-mlogloss:2.254261	train-mlogloss:2.305061
[8]	test-mlogloss:2.246562	train-mlogloss:2.291126
[9]	test-mlogloss:2.238514	train-mlogloss:2.279339
[10]	test-mlogloss:2.232111	train-mlogloss:2.267637
[11]	test-mlogloss:2.222831	train-mlogloss:2.253698
[12]	test-mlogloss:2.217057	train-mlogloss:2.242778
[13]	test-mlogloss:2.212477	train-mlogloss:2.230071
[14]	test-mlogloss:2.206125	train-mlogloss:2.220570
[15]	test-mlogloss:2.200396	train-mlogloss:2.210973
[16]	test-mlogloss:2.197464	train-mlogloss:2.204733
[17]	test-mlogloss:2.193844	train-mlogloss:2.191894
[18]	test-mlog

Lambda 1.0


[735]	test-mlogloss:1.829002	train-mlogloss:1.170495


KeyboardInterrupt: 

In [None]:
# eta 0.5, max_depth=2, lambda=0.1, test-mlogloss:1.038725	train-mlogloss:1.012042

In [None]:
# 0.73 ndcg on balanced dataset for xgboost 

In [None]:
# print mean_ndcg(gbm, xgb.DMatrix(X_test.values), y_test)
print classification_report(prediction, y_test)

# max_depth=3 eta=0.5 num_boost_round= 300
```
Lambda 1.0
0.916503864109
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.00      0.00      0.00        34
          2       0.00      0.00      0.00        45
          3       0.01      0.09      0.01        70
          4       0.01      0.08      0.01       133
          5       0.00      0.03      0.00        80
          6       0.01      0.10      0.02        83
          7       0.86      0.70      0.77     50306
          8       0.00      0.00      0.00        39
          9       0.00      0.00      0.00        13
         10       0.47      0.50      0.49     19225
         11       0.02      0.14      0.03       389

avg / total       0.74      0.64      0.68     70439
```

In [34]:
params = {
    'objective': 'multi:softprob',
    'num_class': 12,
    'bst:eta' :  0.5,
    'bst:max_depth': 4,
    'eval_metric': 'ndcg@5-',
    'silent': 0,
    'nthread': 2,
    'lambda': 1.0
}

values, counts = np.unique(y, return_counts=True)
freqs = counts/float(counts.sum())
weights = 1/freqs

clf = xgb.train(params, xgb.DMatrix(session_features_train.values, label=y, 
                                    weight=weights.take(y)), num_boost_round=5000)
clf.predict_proba = clf.predict

y_pred = clf.predict_proba(xgb.DMatrix(session_features_test.values))

ids = []  #list of ids
cts = []  #list of countries
id_test = test.id.values
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv(os.path.join('data', 'sub6.csv'),index=False)


In [16]:
from sklearn.metrics import f1_score, recall_score, confusion_matrix, classification_report

In [20]:
# X = final[:train.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

n_trees = [20, 40, 80, 160]
df = pd.DataFrame(np.nan, index=n_trees, columns=['train_score', 'test_score', 'ndcg_score'])
for n in n_trees:
    clf = RandomForestClassifier(n_estimators=n, oob_score=True, n_jobs=1, random_state=42, criterion='entropy', max_depth=32)
    clf = clf.fit(X_train, y_train)
    
    # The RF will give you unusually high training error if calculated this way
    # http://stats.stackexchange.com/questions/66543/random-forest-is-overfitting
    
    # Changing to out-of-bag error
    print mean_ndcg(clf, X_test, y_test)
    print classification_report(clf.predict(X_test), y_test)
#     df.loc[[n],'train_score'] = clf.oob_score_
#     df.loc[[n],'test_score'] = recall_score(clf.predict(X_test), y_test, average='weighted')
#     df.loc[[n],'ndcg_score'] = mean_ndcg(clf, X_test, y_test)

0.907452273331
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         9
          1       0.00      0.00      0.00        15
          2       0.00      0.00      0.00        16
          3       0.00      0.00      0.00        36
          4       0.00      0.02      0.00        88
          5       0.00      0.05      0.00        21
          6       0.00      0.04      0.00        28
          7       0.85      0.68      0.76     51714
          8       0.00      0.00      0.00         7
          9       0.00      0.00      0.00         2
         10       0.43      0.49      0.46     18291
         11       0.00      0.07      0.01       212

avg / total       0.74      0.62      0.67     70439

0.911470609321
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         7
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        13
          3 

In [18]:
print classification_report(clf.predict(X_test), y_test)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         0
          2       0.00      0.00      0.00         0
          3       0.00      0.00      0.00         0
          4       0.00      0.00      0.00         0
          5       0.00      0.00      0.00         0
          6       0.00      0.00      0.00         0
          7       0.93      0.65      0.77     58855
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.31      0.55      0.39     11584
         11       0.00      0.00      0.00         0

avg / total       0.83      0.63      0.70     70439



In [12]:
#y_pred = clf.predict_proba(data[train.shape[0]:])  

In [13]:
#ids = []  #list of ids
#cts = []  #list of countries
#for i in range(len(id_test)):
#    idx = id_test[i]
#    ids += [idx] * 5
#    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
#sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
#sub.to_csv(os.path.join('data', 'sub.csv'),index=False)

In [2]:
from sklearn.grid_search import GridSearchCV

In [14]:
X.to_pickle('X.pkl')
np.save('y.npy', y)

In [5]:
X = pd.read_pickle('X.pkl')
y = np.load('y.npy')

In [None]:
# Feature selection, we run random forest and we remove some less-useful features
# to improve performance
clf = RandomForestClassifier(n_estimators=250, oob_score=True, n_jobs=1, criterion='entropy')

# use a full grid over all parameters
param_grid = {
                "max_depth": [16, 32, 64],
#                 "n_estimators": [500]
#               "max_features": ["sqrt", 'log2'],
#               "min_samples_split": [2],
#               "min_samples_leaf": [1, 3, 10],
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]
}

search = GridSearchCV(clf, param_grid, scoring='f1_weighted', verbose=1)

In [None]:
# X = final[:train.shape[0]]
search.fit(X, y)
print search.best_params_
print search.best_score_

In [6]:
search.grid_scores_

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'