In [27]:
import pandas as pd
import os
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse as sp
import numpy as np

In [2]:
# Fail fast when an input file is missing. Merely printing a hint would leave
# `df` / `df_sessions` undefined and the following cells would then die with a
# much less helpful NameError.
if not os.path.exists("data/sessions.csv"):
    raise IOError("Make sure to unzip the file data/sessions.csv.zip!")
if not os.path.exists("data/train_users_2.csv"):
    raise IOError("Missing data file: data/train_users_2.csv")

# Users table; its `id` and `country_destination` columns are used later on.
df = pd.read_csv("data/train_users_2.csv", encoding='utf8')
# Sessions table; its `user_id`, `device_type` and `action` columns are used
# to build the per-user features.
df_sessions = pd.read_csv("data/sessions.csv", encoding='utf8')

In [3]:
# A session row without a "user_id" cannot be attributed to any user,
# so keep only the rows where that column is populated.
df_sessions = df_sessions[df_sessions["user_id"].notnull()]

# Making features from the session dataset

- action counts
- device counts
- ignore seconds elapsed for now

In [4]:
# Per-user device usage: for each user_id, how many session rows were
# recorded with each device_type.
device_freq = df_sessions.groupby('user_id')['device_type'].value_counts()

In [5]:
# Per-user action usage: for each user_id, how many session rows were
# recorded with each action.
action_freq = df_sessions.groupby('user_id')['action'].value_counts()

In [6]:
# Ids of all users in the users table (`df`), in the table's row order
users = df['id']

In [7]:
def feature_dict(df):
    """Convert per-user counts into a nested ``{user_id: {value: count}}`` dict.

    Parameters
    ----------
    df : pandas.Series
        Counts (despite the parameter name, typically a Series) indexed by a
        MultiIndex that contains a level named ``'user_id'`` -- e.g. the
        result of ``groupby('user_id').<column>.value_counts()``.

    Returns
    -------
    dict
        One entry per distinct ``user_id``. Each value is that user's counts
        as a plain dict keyed by the remaining index level.
    """
    return {
        # Strip the (now constant) user_id level so the inner dict is keyed
        # only by the counted value.
        user_id: counts.reset_index(level='user_id', drop=True).to_dict()
        for user_id, counts in df.groupby(level='user_id')
    }

In [8]:
# Nested per-user frequency dictionaries, shaped like
# {'<user_id>': {"IPhone": 2, "Windows": 1}}
device_dict = feature_dict(device_freq)
action_dict = feature_dict(action_freq)

In [9]:
# One frequency dict per entry of `users`, in the same order; a user that is
# absent from the session-derived dictionaries gets an empty dict.
action_rows = [action_dict.get(user_id, {}) for user_id in users]
device_rows = [device_dict.get(user_id, {}) for user_id in users]

In [10]:
# Vectorise the per-user device dictionaries. The output is kept sparse
# (the DictVectorizer default, spelled out here) because it is later combined
# with scipy.sparse.
device_transf = DictVectorizer(sparse=True)
tf = device_transf.fit_transform(device_rows)

In [11]:
# Vectorise the per-user action dictionaries. The output is kept sparse
# (the DictVectorizer default, spelled out here) because it is later combined
# with scipy.sparse.
action_transf = DictVectorizer(sparse=True)
tf2 = action_transf.fit_transform(action_rows)

In [12]:
# Put the device columns and the action columns side by side.
# Each row is then one user's frequency vector over both feature groups,
# e.g. [0, 0, 0, 2, 0, 1, ...]
features = sp.hstack((tf, tf2))

In [18]:
# Materialise the session features as a dense DataFrame and write it to disk.
# toarray() yields a regular ndarray (todense() returns the legacy np.matrix).
df_sess_features = pd.DataFrame(features.toarray())
# The feature rows were built by iterating over df.id, so attach the ids by
# position (.values) instead of relying on index alignment.
df_sess_features['id'] = df.id.values

# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
if not os.path.isdir("postprocess"):
    os.makedirs("postprocess")
df_sess_features.to_csv("postprocess/session_features.csv")

In [19]:
from sklearn.preprocessing import LabelBinarizer

# Binarize the destination labels (one-vs-all indicator encoding); the fitted
# transformer is kept so the encoding can be inverted later.
dest_transf = LabelBinarizer()
y_tf = dest_transf.fit_transform(df["country_destination"])

In [30]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
try:
    # Current home of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Old scikit-learn releases only provide the since-removed module.
    from sklearn.cross_validation import train_test_split

# Hold out 33% of the users for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(features, y_tf, test_size=0.33, random_state=42)

In [None]:
# Fit random forests of increasing size and report, for each size, the
# out-of-bag score and the score on the held-out split.
# NOTE(review): y_train comes from the LabelBinarizer indicator matrix, so
# scikit-learn may treat this as a multi-label problem in which score() is the
# strict subset accuracy -- possibly why the OOB and held-out numbers differ
# so much; confirm.
for n in [10, 20, 40, 80, 160]:
    clf = RandomForestClassifier(n_estimators=n, bootstrap=True, oob_score=True, random_state=42)
    clf.fit(X_train, y_train)
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print(n)
    print(clf.oob_score_)
    print(clf.score(X_test, y_test))

10
0.930685886499
0.552676784168