In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder

try:
    # scikit-learn >= 0.18 ships the splitter here
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only; this module was removed in scikit-learn 0.20
    from sklearn.cross_validation import train_test_split

# Seed NumPy's global RNG so that a fresh Restart & Run All repeats the same
# random draws.
np.random.seed(0)

# Loading data: the first CSV row is the header; the columns at positions
# 1, 2 and 3 are parsed as dates (presumably date_account_created,
# timestamp_first_active and date_first_booking -- confirm against the file).
train_csv = os.path.join('data', 'train_users_2.csv')
data = pd.read_csv(train_csv, header=0, parse_dates=[1, 2, 3])

In [2]:
# Encode the target: country_destination values -> integer class ids.
# `le` holds the fitted mapping between the two.
outcome = data.country_destination
labels = outcome.values
le = LabelEncoder()
y = le.fit_transform(labels)

# Neither column is used as a feature from here on.
data = data.drop(['id', 'date_first_booking'], axis=1)

# Expand each remaining date column into year / month / weekday features and
# then drop the raw column. The vectorised .dt accessor replaces the per-row
# .apply(lambda ...) calls; it needs the columns to be datetime-typed, which
# the parse_dates argument of the loading cell is expected to provide.
for prefix, column in [('dac', 'date_account_created'),
                       ('tfa', 'timestamp_first_active')]:
    stamps = data[column].dt
    data[prefix + '_year'] = stamps.year
    data[prefix + '_month'] = stamps.month
    data[prefix + '_weekday'] = stamps.weekday  # Monday == 0
    data = data.drop([column], axis=1)

In [3]:
# Replace missing ages with the median of the observed ages.
median_age = data['age'].median()
data['age'] = data['age'].fillna(median_age)

In [4]:
# Bucket edges for the age histogram: 0, 15, 20, ..., 80, <largest age>.
# Series.max() skips NaN (the builtin max() does not), and rounding the top
# edge *up* keeps a fractional maximum inside the last bucket instead of
# truncating the edge below it.
# NOTE(review): pd.cut needs strictly increasing edges, so this still assumes
# the largest age is above 80.
upper_edge = int(np.ceil(data.age.max()))
bins = [0] + list(np.arange(15, 85, 5)) + [upper_edge]

In [5]:
# One label per age bucket: an open-ended bucket at each end and 5-year-wide
# buckets ('15-20' ... '75-80') in between -- 15 labels in total.
group_names = (
    ['<15']
    + ['{}-{}'.format(low, low + 5) for low in range(15, 80, 5)]
    + ['>80']
)

In [6]:
# Discretise age into the labelled buckets. pd.cut intervals are right-closed
# by default, so an age equal to an edge falls into the bucket below it.
data['age_bucket'] = pd.cut(data.age, bins=bins, labels=group_names)

In [7]:
# Use the same 'unknown' token for gender that missing values receive below.
data['gender'] = data['gender'].replace({'-unknown-': 'unknown'})

In [8]:
# Fill every remaining missing value with the token 'unknown'.
# age_bucket is left out: it is the labelled result of pd.cut, and 'unknown'
# is not one of its labels.
# .loc with a boolean column mask replaces the .ix indexer, which was
# deprecated in pandas 0.20 and removed in 1.0.
not_age_bucket = data.columns != 'age_bucket'
data.loc[:, not_age_bucket] = data.loc[:, not_age_bucket].fillna('unknown')

In [9]:
# One-hot encode the categorical features. Each source column is replaced by
# indicator columns named '<column>_<value>', appended after the untouched
# columns in the order listed here.
to_be_dummified = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'age_bucket',
]
indicator_frames = [pd.get_dummies(data[col], prefix=col) for col in to_be_dummified]
data = pd.concat([data.drop(to_be_dummified, axis=1)] + indicator_frames, axis=1)

In [10]:
# Sanity check: (rows, columns) after one-hot encoding. The target column
# country_destination is still part of the frame at this point.
data.shape

(213451, 171)

In [11]:
# The target is already encoded in `y`; remove it from the feature matrix.
data = data.drop('country_destination', axis=1)

In [12]:
# Hold out a third of the rows for evaluation; the fixed random_state makes
# the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data, y,
    test_size=0.33,
    random_state=42
)

In [59]:
from mylib.scoring import ndcg_at_k

def mean_ndcg(clf, X, y):
    """Average NDCG@5 of ``clf``'s ranked class predictions on ``(X, y)``.

    For every sample the five most probable classes are ranked by descending
    predicted probability and turned into a 0/1 relevance vector (1 where the
    ranked class equals the true label). Each vector is scored with
    ``ndcg_at_k(row, 5)`` and the scores are averaged.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``. When it also exposes
        ``classes_``, probability columns are translated to class labels
        through it; otherwise column indices are used as labels.
    X : feature matrix, passed to ``clf.predict_proba`` unchanged.
    y : true labels, one per row of ``X`` (array, list or Series), expressed
        in the same label space as ``clf.classes_``.

    Returns
    -------
    float
        Mean of the per-sample scores returned by ``ndcg_at_k``.
    """
    y_true = np.asarray(y)

    # Predict class probabilities and keep the 5 most probable columns,
    # most probable first.
    proba = clf.predict_proba(X)
    top5_columns = np.argsort(-proba, axis=1)[:, :5]

    # Column j of predict_proba belongs to clf.classes_[j], which is not
    # necessarily the label j (e.g. when a class is absent from the training
    # data), so map columns to labels before comparing with y.
    classes = getattr(clf, 'classes_', None)
    if classes is None:
        top5_labels = top5_columns
    else:
        top5_labels = np.asarray(classes)[top5_columns]

    # Transform to relevance scores: 1 at the rank holding the true label.
    relevance = (top5_labels == y_true[:, np.newaxis]).astype('int')

    # Score each sample and average over all samples.
    return np.mean([ndcg_at_k(row, 5) for row in relevance])


In [61]:
# Sweep the forest size and report out-of-bag accuracy, held-out accuracy and
# held-out mean NDCG@5 for each setting.
for n in [10, 20, 40, 80, 160]:
    clf = RandomForestClassifier(n_estimators=n, oob_score=True)
    clf = clf.fit(X_train, y_train)

    # Scoring a random forest on its own training rows gives an
    # unrealistically high accuracy:
    # http://stats.stackexchange.com/questions/66543/random-forest-is-overfitting
    # so the out-of-bag estimate is used as the training-side score instead.
    train_score = clf.oob_score_
    test_score = clf.score(X_test, y_test)
    ndcg_score = mean_ndcg(clf, X_test, y_test)

    # Single-argument print(...) produces the same line on Python 2 and 3.
    print("Estimators: {} Accuracy(oob): {} Accuracy(test): {} NDCG(test): {}".format(n, train_score, test_score, ndcg_score))

Estimators: 10 Accuracy(oob): 0.543821497497 Accuracy(test): 0.576214880961 NDCG(test): 0.842808754265
Estimators: 20 Accuracy(oob): 0.569805331021 Accuracy(test): 0.582134896861 NDCG(test): 0.867941441573
Estimators: 40 Accuracy(oob): 0.582321763209 Accuracy(test): 0.585272363321 NDCG(test): 0.880404718879
Estimators: 80 Accuracy(oob): 0.587733896456 Accuracy(test): 0.590865855563 NDCG(test): 0.888091856654
Estimators: 160 Accuracy(oob): 0.590971387016 Accuracy(test): 0.590695495393 NDCG(test): 0.89246613131
