In [15]:
import pandas as pd
import os
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse as sp

In [6]:
if not os.path.exists("data/sessions.csv"):
    print("Make sure to unzip the file data/sessions.csv.zip!")
else:
    df = pd.read_csv("data/train_users_2.csv", encoding='utf8')
    df_sessions = pd.read_csv("data/sessions.csv", encoding='utf8')

In [7]:
# Drop row with nan values from the "user_id" column as they're useless
df_sessions = df_sessions.dropna(subset=["user_id"])

# Making features from the session dataset

- action counts
- device counts
- ignore seconds elapsed for now

In [8]:
# Frequency of devices - by user
device_freq = df_sessions.groupby('user_id').device_type.value_counts()

In [9]:
# Frequency of actions taken - by user
action_freq = df_sessions.groupby('user_id').action.value_counts()

In [10]:
# Total list of users
unique_users = df_sessions.user_id.unique()

In [11]:
def feature_dict(df):
    f_dict = dict(list(df.groupby(level='user_id')))
    res = {}
    for k, v in f_dict.items():
        v.index = v.index.droplevel('user_id')
        res[k] = v.to_dict()
    return res

In [12]:
# Make a dictionary with the frequencies { 'user_id' : {"IPhone": 2, "Windows": 1}}
action_dict = feature_dict(action_freq)
device_dict = feature_dict(device_freq)

In [13]:
# Transform to a list of dictionaries
action_rows = [action_dict.get(k, {}) for k in unique_users]
device_rows = [device_dict.get(k, {}) for k in unique_users]

In [16]:
device_transf = DictVectorizer()
tf = device_transf.fit_transform(device_rows)

In [17]:
action_transf = DictVectorizer()
tf2 = action_transf.fit_transform(action_rows)

In [18]:
# Concatenate the two datasets
# Those are row vectors with the frequencies of both device and actions [0, 0, 0, 2, 0, 1, ...]
features = sp.hstack([tf, tf2])

In [19]:
# TODO: we write the thing to disk?

In [20]:
# Transform output variable
df.index = df.id
dest = [df.country_destination[u] if u in df.index else "NDF" for u in unique_users]

In [21]:
from sklearn.preprocessing import LabelBinarizer
dest_transf = LabelBinarizer()
y_tf = dest_transf.fit_transform(dest)

In [25]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [29]:
model = ExtraTreesClassifier(n_estimators=30,bootstrap=True, oob_score=True)
model.fit(features, y_tf)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [30]:
model.oob_score_

0.96686115601219347

In [37]:
import numpy as np
session_features = pd.DataFrame(np.matrix(features.todense()))

In [40]:
session_features.index = unique_users

In [41]:
session_features.to_csv()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,363,364,365,366,367,368,369,370,371,372
d1mm9tcy42,5,0,0,0,0,0,0,0,0,122,...,0,0,0,0,0,0,0,0,0,0
yo8nz8bqcq,0,0,0,0,0,0,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4grx6yxeby,4,0,0,0,0,0,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
ncf87guaf0,5,0,1,0,0,0,0,0,0,146,...,0,0,0,0,0,0,0,0,0,0
4rvqpxoh3h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c8mfesvkv0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
xwxei6hdk4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5f45ro5uzk,0,0,0,0,0,0,0,0,0,46,...,0,0,0,0,0,0,0,0,0,0
ro2stddszp,0,0,0,0,0,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,0
qtw88d9pbl,0,0,0,0,0,0,364,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
session_features = session_features.reset_index()

In [46]:
pd.merge(df, session_features, right_on='index')

TypeError: object of type 'NoneType' has no len()