In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import os
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse as sp
from mylib.scoring import ndcg_at_k, mean_ndcg
from mylib.preprocess import make_sessions_features, make_user_features

In [5]:
test = pd.read_csv(os.path.join('data', 'test_users.csv'), header=0, parse_dates=[1,2,3])
train = pd.read_csv(os.path.join('data', 'train_users_2.csv'), header=0, parse_dates=[1,2,3])
df_sessions = pd.read_csv("data/sessions.csv", encoding='utf8')
from mylib.scoring import ndcg_at_k

In [8]:
user_features_train, user_features_test, y, le = make_user_features(train, test)
session_features = make_sessions_features(user_features_train, df_sessions)

Running Model

In [10]:
import xgboost as xgb

In [24]:
# X = final[:train.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(session_features, y, test_size=0.33, random_state=42)

In [60]:
params = {
    'objective': 'multi:softprob',
    'num_class': 13,
    'bst:eta' :  0.1,
    'bst:max_depth': 6,
    'eval_metric': 'ndcg@5-',
    'silent': 0,
    'nthread': 2
}

for l in [0.0, 0.01, 0.1, 1, 10, 100]:
    params['lambda'] = l
    print "Lambda", l
    gbm = xgb.train(params, xgb.DMatrix(X_train.values, label=y_train), num_boost_round=10)
    gbm.predict_proba = gbm.predict
    print mean_ndcg(gbm, xgb.DMatrix(X_test.values), y_test)
    prediction = gbm.predict(xgb.DMatrix(X_test.values)).argmax(axis=1)
    print classification_report(prediction, y_test)


Lambda 0.0
0.921660328214
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         2
          2       0.00      0.00      0.00         0
          3       0.00      0.00      0.00         4
          4       0.00      0.50      0.00         2
          5       0.00      0.00      0.00         0
          6       0.00      0.00      0.00         2
          7       0.84      0.71      0.77     48374
          8       0.00      0.00      0.00         2
          9       0.00      0.00      0.00         0
         10       0.53      0.50      0.51     22048
         11       0.00      0.20      0.00         5

avg / total       0.74      0.64      0.69     70439

Lambda 0.01
0.921735136856
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         3
          2       0.00      0.00      0.00

0.921557212228


In [55]:
def precision_at_k(p_pred, y_true, k):
    
    

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         0
          2       0.00      0.00      0.00         0
          3       0.00      0.00      0.00         0
          4       0.00      0.00      0.00         1
          5       0.00      0.00      0.00         0
          6       0.00      0.00      0.00         1
          7       0.84      0.71      0.77     48313
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.53      0.49      0.51     22118
         11       0.00      0.00      0.00         6

avg / total       0.74      0.64      0.69     70439



In [None]:
clf = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=1, random_state=42, criterion='entropy', max_depth=32)
clf = clf.fit(X, y)

y_pred = clf.predict_proba(final.ix[train.shape[0]:])  
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv(os.path.join('data', 'sub4.csv'),index=False)


In [16]:
from sklearn.metrics import f1_score, recall_score, confusion_matrix, classification_report

In [20]:
# X = final[:train.shape[0]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

n_trees = [20, 40, 80, 160]
df = pd.DataFrame(np.nan, index=n_trees, columns=['train_score', 'test_score', 'ndcg_score'])
for n in n_trees:
    clf = RandomForestClassifier(n_estimators=n, oob_score=True, n_jobs=1, random_state=42, criterion='entropy', max_depth=32)
    clf = clf.fit(X_train, y_train)
    
    # The RF will give you unusually high training error if calculated this way
    # http://stats.stackexchange.com/questions/66543/random-forest-is-overfitting
    
    # Changing to out-of-bag error
    print mean_ndcg(clf, X_test, y_test)
    print classification_report(clf.predict(X_test), y_test)
#     df.loc[[n],'train_score'] = clf.oob_score_
#     df.loc[[n],'test_score'] = recall_score(clf.predict(X_test), y_test, average='weighted')
#     df.loc[[n],'ndcg_score'] = mean_ndcg(clf, X_test, y_test)

0.907452273331
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         9
          1       0.00      0.00      0.00        15
          2       0.00      0.00      0.00        16
          3       0.00      0.00      0.00        36
          4       0.00      0.02      0.00        88
          5       0.00      0.05      0.00        21
          6       0.00      0.04      0.00        28
          7       0.85      0.68      0.76     51714
          8       0.00      0.00      0.00         7
          9       0.00      0.00      0.00         2
         10       0.43      0.49      0.46     18291
         11       0.00      0.07      0.01       212

avg / total       0.74      0.62      0.67     70439

0.911470609321
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         7
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        13
          3 

In [18]:
print classification_report(clf.predict(X_test), y_test)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         0
          2       0.00      0.00      0.00         0
          3       0.00      0.00      0.00         0
          4       0.00      0.00      0.00         0
          5       0.00      0.00      0.00         0
          6       0.00      0.00      0.00         0
          7       0.93      0.65      0.77     58855
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.31      0.55      0.39     11584
         11       0.00      0.00      0.00         0

avg / total       0.83      0.63      0.70     70439



In [12]:
#y_pred = clf.predict_proba(data[train.shape[0]:])  

In [13]:
#ids = []  #list of ids
#cts = []  #list of countries
#for i in range(len(id_test)):
#    idx = id_test[i]
#    ids += [idx] * 5
#    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
#sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
#sub.to_csv(os.path.join('data', 'sub.csv'),index=False)

In [2]:
from sklearn.grid_search import GridSearchCV

In [14]:
X.to_pickle('X.pkl')
np.save('y.npy', y)

In [5]:
X = pd.read_pickle('X.pkl')
y = np.load('y.npy')

In [None]:
# Feature selection, we run random forest and we remove some less-useful features
# to improve performance
clf = RandomForestClassifier(n_estimators=250, oob_score=True, n_jobs=1, criterion='entropy')

# use a full grid over all parameters
param_grid = {
                "max_depth": [16, 32, 64],
#                 "n_estimators": [500]
#               "max_features": ["sqrt", 'log2'],
#               "min_samples_split": [2],
#               "min_samples_leaf": [1, 3, 10],
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]
}

search = GridSearchCV(clf, param_grid, scoring='f1_weighted', verbose=1)

In [None]:
# X = final[:train.shape[0]]
search.fit(X, y)
print search.best_params_
print search.best_score_

In [6]:
search.grid_scores_

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'