# Machine Learning Pipeline & Testing

### load libraries that will be used

In [6]:
%matplotlib inline

import zipfile
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from mpl_toolkits.basemap import Basemap

# make figures better:
# The style sheet is applied first: a style sheet overwrites every rc value it
# defines, so custom overrides have to come after it or they are silently lost.
plt.style.use('ggplot')
font = {'weight':'normal','size':20}
plt.rc('font', **font)
plt.rc('figure', figsize=(9.0, 6.0))
plt.rc('xtick.major', pad=10) # xticks too close to border!

#print(plt.style.available)

### anchor point: fix the random seed so that results are reproducible

In [12]:
# seed NumPy's global random number generator so repeated runs draw the same numbers
np.random.seed(0)

### unzip and load data into memory

In [36]:
def load_zipped_csv(csv_name, data_dir='data'):
    """Load one CSV file that is stored inside a zip archive.

    Parameters
    ----------
    csv_name : str
        Name of the CSV member; the archive is expected at
        ``<data_dir>/<csv_name>.zip``.
    data_dir : str
        Directory that holds the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    # both handles are closed as soon as the frame has been read
    with zipfile.ZipFile('%s/%s.zip' % (data_dir, csv_name)) as archive:
        with archive.open(csv_name) as member:
            return pd.read_csv(member)


# Dataset #1: Countries to visit
df_countries = load_zipped_csv('countries.csv')
print("df_countries => rows: %0.0f; columns: %0.0f" % np.shape(df_countries))

# Dataset #2: Compare demographic distributions within destination countries
df_country_demographics = load_zipped_csv('age_gender_bkts.csv')
print("df_country_demographics => rows: %0.0f; columns: %0.0f" % np.shape(df_country_demographics))

# Dataset #3: User interactions on airbnb website
df_user_sessions = load_zipped_csv('sessions.csv')
print("df_user_sessions => rows: %0.0f; columns: %0.0f" % np.shape(df_user_sessions))

# Dataset #4: Comparing test and training data to what has been provided as user data for 2015
# train
df_train = load_zipped_csv('train_users_2.csv')
print("df_train => rows: %0.0f; columns: %0.0f" % np.shape(df_train))

# test
df_test = load_zipped_csv('test_users.csv')
print("df_test => rows: %0.0f; columns: %0.0f" % np.shape(df_test))

df_countries => rows: 10; columns: 7
df_country_demographics => rows: 420; columns: 5
df_user_sessions => rows: 10567737; columns: 6
df_train => rows: 213451; columns: 16
df_test => rows: 62096; columns: 15


### combine, transform and engineer features

In [56]:
# stack the train users on top of the test users so that every later change
# is applied to both datasets at once
df_users = pd.concat([df_train, df_test], ignore_index=True)
print("df_users => rows: %0.0f; columns: %0.0f" % df_users.shape)

df_users => rows: 275547; columns: 16


In [57]:
### transformations ###

def _week_of_year(dates):
    """Return the ISO week number for every timestamp in the datetime Series ``dates``."""
    try:
        # current pandas: `.dt.week` is gone, isocalendar() is its replacement
        return dates.dt.isocalendar().week.astype('int64')
    except AttributeError:
        # older pandas without isocalendar(): use the legacy accessor
        return dates.dt.week


# incorrectly populated ages
# Each rule works on the result of the previous one. Evaluating the second rule
# on the untouched values would flag every year of birth as "> 100" and throw
# away the correction made by the first rule.
av = df_users.age.values
av = np.where(np.logical_and(av>1900, av<2015), 2015-av, av) # fix those with year of birth as age
av = np.where(np.logical_or(av<14, av>100), np.nan, av) # set all ages deemed unlikely as null
df_users['age'] = av

# handling nulls
# "-unknown-" is treated as missing, then every missing value becomes the placeholder -1
df_users = df_users.replace("-unknown-", np.nan).fillna(-1)


### feature engineering ###

# date_account_created
df_users['date_account_created'] = pd.to_datetime(df_users.date_account_created)
df_users['year_account_created'] = df_users.date_account_created.dt.year
df_users['month_account_created'] = df_users.date_account_created.dt.month
df_users['week_account_created'] = _week_of_year(df_users.date_account_created)
df_users['weekday_account_created'] = df_users.date_account_created.dt.weekday
df_users['day_account_created'] = df_users.date_account_created.dt.day

# timestamp_first_active
# the integer division strips the six trailing time digits, leaving YYYYMMDD
df_users['date_first_active'] = pd.to_datetime((df_users.timestamp_first_active // 1000000), format='%Y%m%d')
df_users['year_first_active'] = df_users.date_first_active.dt.year
df_users['month_first_active'] = df_users.date_first_active.dt.month
df_users['week_first_active'] = _week_of_year(df_users.date_first_active)
df_users['weekday_first_active'] = df_users.date_first_active.dt.weekday
df_users['day_first_active'] = df_users.date_first_active.dt.day

# cleanup
# date_first_booking isn't populated in the test set so this feature can't be used
# and I'm done with the original date fields
drop_list = ['date_account_created','timestamp_first_active','date_first_active','date_first_booking']
df_users = df_users.drop(drop_list, axis=1)

### check impact of changes ###
print("df_users => rows: %0.0f; columns: %0.0f" % np.shape(df_users))

df_users => rows: 275547; columns: 23


### test/train split
After the transformations and feature engineering have been performed on the combined training and test sets, the two data sets are split apart again.

In [58]:
# setup ml structure
labels = df_users['country_destination'].values
features = df_users.drop(['id','country_destination'], axis=1)

# split train and test
# df_users holds the len(df_train) training rows first, followed by the test rows,
# so one pivot index separates the two sets for both features and labels
n_train = len(df_train)
X_train, y_train = features[:n_train], labels[:n_train]
X_test, y_test = features[n_train:], labels[n_train:]

# Random Forest

Characteristics:
* low bias
* high variance
* prone to overfitting

Tuning Parameters:
* number of trees
* number of features to consider at each split
* depth of trees

# Decision Tree Classifier

scikit-learn uses an optimised version of the CART algorithm. CART (Classification and Regression Trees) supports both categorical and numerical target variables (regression) and does not compute rule sets. CART constructs binary trees using the feature and threshold that yield the largest information gain at each node.



In [None]:
# Start again from the raw train/test rows. The feature-engineering cell further up
# removes date_account_created / timestamp_first_active from df_users, so aliasing
# df_users here fails on a top-to-bottom run (and would mutate df_users as well).
dataframe = pd.concat((df_train, df_test), axis=0, ignore_index=True)
# setup ml structure
labels = dataframe['country_destination'].values

# date_account_created
dataframe['date_account_created'] = pd.to_datetime(dataframe.date_account_created)
dataframe['creation_year'] = dataframe.date_account_created.dt.year
dataframe['creation_month'] = dataframe.date_account_created.dt.month
dataframe['creation_day'] = dataframe.date_account_created.dt.day

# timestamp_first_active
# the integer division strips the six trailing time digits, leaving YYYYMMDD
dataframe['date_first_active'] = pd.to_datetime((dataframe.timestamp_first_active // 1000000), format='%Y%m%d')
dataframe['active_year'] = dataframe.date_first_active.dt.year
dataframe['active_month'] = dataframe.date_first_active.dt.month
dataframe['active_day'] = dataframe.date_first_active.dt.day

# cleanup
# date_first_booking isn't populated in the test set so this feature can't be used
features = dataframe.drop(['id','country_destination','date_account_created','timestamp_first_active','date_first_active','date_first_booking'], axis=1)
# "-unknown-" counts as missing; every missing value becomes the placeholder -1
features = features.replace("-unknown-", np.nan).fillna(-1)

In [None]:
# `features` and `labels` hold the len(df_train) training rows first and the test
# rows after them, so the same pivot index splits both
n_train = len(df_train)
X_train, y_train = features[:n_train], labels[:n_train]
X_test, y_test = features[n_train:], labels[n_train:]

In [None]:
# number of rows in the training frame
len(df_train)

In [None]:
# number of rows in the test frame
len(df_test)

In [None]:
# shapes of the training features and the training labels
X_train.shape, y_train.shape

In [None]:
# shapes of the test features and the test labels
X_test.shape,  y_test.shape

The features have to be encoded before `fit()` can be called: `fit()` does not accept string values, so the string-valued columns are converted to numeric indicator columns first.

In [None]:
# encoding
# X_train is the training slice of `features`; turn its string-valued columns
# into indicator columns
encoded_features_train = pd.get_dummies(X_train)
encoded_features_train.head()

In [None]:
# assign
# the training labels are the first len(df_train) entries of `labels`
X_train, y_train = encoded_features_train, labels[:len(df_train)]

In [None]:
# Fit two decision tree classifiers that differ only in their maximum depth
from sklearn.tree import DecisionTreeClassifier

regr_1 = DecisionTreeClassifier(max_depth=2)
regr_2 = DecisionTreeClassifier(max_depth=5)
# train on the encoded training split prepared in the cells above
regr_1.fit(X_train, y_train)
regr_2.fit(X_train, y_train)

In [None]:
# Predict
# The trees were fitted on indicator columns, so the test rows need the same
# encoding. reindex() lines the columns up with the training matrix: categories
# that only occur in the test rows are dropped, categories missing from them are 0.
X_test_encoded = pd.get_dummies(features[len(df_train):]).reindex(columns=X_train.columns, fill_value=0)
y_1 = regr_1.predict(X_test_encoded)
y_2 = regr_2.predict(X_test_encoded)

In [None]:
# Plot the results
# A scatter of the whole feature matrix against the class labels cannot be drawn
# (x and y must have the same size). Instead, compare how often each destination
# occurs in the training labels with how often each tree predicts it.
shares = pd.DataFrame({
    "data": pd.Series(y_train).value_counts(normalize=True),
    "max_depth=2": pd.Series(y_1).value_counts(normalize=True),
    "max_depth=5": pd.Series(y_2).value_counts(normalize=True),
}).fillna(0)[["data", "max_depth=2", "max_depth=5"]]  # a class never predicted has share 0

fig, ax = plt.subplots()
shares.plot.bar(ax=ax, color=["k", "g", "r"])
ax.set_xlabel("country_destination")
ax.set_ylabel("share of users")
ax.set_title("Decision Tree Classification")
ax.legend()
plt.show()

In [None]:
# convert the categorical columns of the complete feature frame into indicator columns
encoded_features = pd.get_dummies(features)

In [None]:
# preview the first rows of the encoded frame
encoded_features.head()

In [None]:
# train/test dataset 70/30 split
# Only the first len(df_train) rows come from the training file and therefore
# carry a destination label, so the hold-out split is restricted to those rows.
n_labelled = len(df_train)
X, y = encoded_features[:n_labelled], labels[:n_labelled]
feature_names, target_name = list(encoded_features), list(y)

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # very old scikit-learn releases only ship the cross_validation module
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Feature Scaling

$x' = \frac{x - x_{min}}{x_{max} - x_{min}}$

# XGBClassifier

In [None]:
# source: https://www.kaggle.com/davidgasquez/airbnb-recruiting-new-user-bookings/ndcg-scorer/code

"""Metrics to compute the model performance."""

import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer


def dcg_score(y_true, y_score, k=5):
    """Compute the discounted cumulative gain (DCG) of one ranking, cut at rank k.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    # positions of the k largest scores, highest score first
    top_positions = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_positions)

    # exponential gain per item, discounted by log2(rank + 1) with ranks starting at 1
    gains = 2 ** relevance - 1
    rank_discounts = np.log2(np.arange(len(relevance)) + 2)
    return np.sum(gains / rank_discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    The one-hot relevance matrix always has one column per column of
    ``predictions``. When every class occurs in ``ground_truth`` the labels are
    mapped to columns by their sorted order; when some classes are absent (for
    example in a small validation fold) the labels are taken to be the integer
    column indices themselves.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Raises
    ------
    ValueError
        If ``ground_truth`` does not contain every class and its labels are not
        valid integer column indices of ``predictions``.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> round(float(ndcg_score(ground_truth, predictions, k=2)), 4)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> round(float(ndcg_score(ground_truth, predictions, k=2)), 4)
    0.6667
    """
    predictions = np.asarray(predictions)
    truth = np.asarray(ground_truth)
    n_classes = predictions.shape[1]

    observed = np.unique(truth)
    if len(observed) == n_classes:
        # every class is present: map labels to columns by sorted order
        codes = np.searchsorted(observed, truth)
    elif truth.dtype.kind in 'iu':
        # some classes are missing: the labels must already be column indices
        codes = truth
    else:
        raise ValueError(
            "ground_truth must contain every class or use integer class indices")
    if codes.min() < 0 or codes.max() >= n_classes:
        raise ValueError(
            "ground_truth labels fall outside the %d prediction columns" % n_classes)

    # one-hot relevance: T[i, j] == 1 exactly when sample i belongs to class j
    T = (codes[:, None] == np.arange(n_classes)).astype(int)

    # Per sample: DCG of the predicted ranking divided by the DCG of the ideal ranking
    scores = [
        float(dcg_score(y_true, y_score, k)) / float(dcg_score(y_true, y_true, k))
        for y_true, y_score in zip(T, predictions)
    ]

    return np.mean(scores)


# NDCG Scorer function
# Wraps ndcg_score with make_scorer, passing needs_proba=True and forwarding k=5.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method` — confirm against the installed version.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

np.random.seed(0)

# Load the raw user tables
# NOTE(review): these paths differ from the zipped files under data/ that are
# read at the top of the notebook — confirm which copy is intended.
df_train = pd.read_csv('../input/train_users.csv')
df_test = pd.read_csv('../input/test_users.csv')

# Separate the target from the training table; keep the test ids for the submission
labels = df_train.pop('country_destination').values
id_test = df_test['id']
piv_train = len(df_train)  # row index where the test rows start in df_all

# One frame with the train rows followed by the test rows, without id and
# date_first_booking, and with every missing value replaced by -1
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .drop(['id', 'date_first_booking'], axis=1)
    .fillna(-1)
)

#####Feature engineering#######
# date_account_created: split the '-' separated date string into integer parts
dac = np.array([
    [int(piece) for piece in stamp.split('-')]
    for stamp in df_all['date_account_created'].astype(str)
])
for position, column in enumerate(['dac_year', 'dac_month', 'dac_day']):
    df_all[column] = dac[:, position]
df_all = df_all.drop('date_account_created', axis=1)

# timestamp_first_active: cut the digit string into six fixed-width integer fields
# (4 digits, then five 2-digit fields); only the first three become features
cuts = [(0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14)]
tfa = np.array([
    [int(stamp[start:stop]) for start, stop in cuts]
    for stamp in df_all['timestamp_first_active'].astype(str)
])
for position, column in enumerate(['tfa_year', 'tfa_month', 'tfa_day']):
    df_all[column] = tfa[:, position]
df_all = df_all.drop('timestamp_first_active', axis=1)

#Age
# ages below 14 or above 100 are replaced by the placeholder -1
av = df_all.age.values
implausible = (av < 14) | (av > 100)
df_all['age'] = np.where(implausible, -1, av)

#One-hot-encoding features
# each listed column is replaced by indicator columns named <column>_<value>,
# appended after the remaining columns in the order of the list
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
df_all = pd.get_dummies(df_all, columns=ohe_feats)

#Splitting train and test
# the first piv_train rows of df_all are the training users, the rest the test users
vals = df_all.values
X, X_test = vals[:piv_train], vals[piv_train:]
le = LabelEncoder()
y = le.fit_transform(labels)

#Classifier
xgb_params = dict(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X, y)
# one row of class probabilities per test user
y_pred = xgb.predict_proba(X_test)

#Taking the 5 classes with highest probabilities
# every test id is repeated five times, once per suggested country
ids = [user_id for user_id in id_test for _ in range(5)]
# per user: order the classes by descending probability, decode them back to
# country labels and keep the first five
cts = [
    country
    for probabilities in y_pred
    for country in le.inverse_transform(np.argsort(probabilities)[::-1])[:5].tolist()
]

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)

### Add more features 
In order to see whether adding session data makes a difference

In [None]:
# sessions
sessions.rename(columns = {'user_id': 'id'}, inplace=True)