# Machine Learning Pipeline & Testing

### load libraries that will be used

In [83]:
# load and transform
import zipfile
from datetime import datetime
import numpy as np
import pandas as pd

# ml
#from sklearn import cross_validation
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
#from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer
from rank_metrics import ndcg_at_k
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

# graphics
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

# make figures better:
font = {'weight':'normal','size':20}
plt.rc('font', **font)
plt.rc('figure', figsize=(9.0, 6.0))
plt.rc('xtick.major', pad=10) # xticks too close to border!
plt.style.use('ggplot')

#print(plt.style.available)

### unzip and load data into memory

In [36]:
# Dataset #1: Countries to visit
zf = zipfile.ZipFile('data/countries.csv.zip')
df_countries = pd.read_csv(zf.open('countries.csv'))
print("df_countries => rows: %0.0f; columns: %0.0f" % np.shape(df_countries))

# Dataset #2: Compare demographic distributions within destination countries
zf = zipfile.ZipFile('data/age_gender_bkts.csv.zip')
df_country_demographics = pd.read_csv(zf.open('age_gender_bkts.csv'))
print("df_country_demographics => rows: %0.0f; columns: %0.0f" % np.shape(df_country_demographics))

# Dataset #3: User interactions on airbnb website
zf = zipfile.ZipFile('data/sessions.csv.zip')
df_user_sessions = pd.read_csv(zf.open('sessions.csv'))
print("df_user_sessions => rows: %0.0f; columns: %0.0f" % np.shape(df_user_sessions ))

# Dataset #4: Comparing test and training data to what has been provided as user data for 2015
# train
zf = zipfile.ZipFile('data/train_users_2.csv.zip')
df_train = pd.read_csv(zf.open('train_users_2.csv'))
print("df_train => rows: %0.0f; columns: %0.0f" % np.shape(df_train))

# test
zf = zipfile.ZipFile('data/test_users.csv.zip')
df_test = pd.read_csv(zf.open('test_users.csv'))
print("df_test => rows: %0.0f; columns: %0.0f" % np.shape(df_test))

df_countries => rows: 10; columns: 7
df_country_demographics => rows: 420; columns: 5
df_user_sessions => rows: 10567737; columns: 6
df_train => rows: 213451; columns: 16
df_test => rows: 62096; columns: 15


### combine, transform and engineer features

In [130]:
# concatenate train- and test users together in order to do all the changes on both datasets
df_users = pd.concat((df_train, df_test), axis=0, ignore_index=True)
print("df_users => rows: %0.0f; columns: %0.0f" % np.shape(df_users))

df_users => rows: 275547; columns: 16


In [131]:
### transformations ###

# incorrectly populated ages
av = df_users.age.values
df_users['age'] = np.where(np.logical_and(av>1900, av<2015), 2015-av, av) # fix those with year of birth as age
df_users['age'] = np.where(np.logical_or(av<14, av>100), np.nan, av) # set all ages deemed unlikely as null

# handling nulls 
df_users.replace("-unknown-", np.nan, inplace=True)
df_users.fillna(-1, inplace=True)


### feature engineering ###

# date_account_created
df_users['date_account_created'] = pd.to_datetime(df_users.date_account_created)
df_users['year_account_created'] = df_users.date_account_created.dt.year
df_users['month_account_created'] = df_users.date_account_created.dt.month
df_users['week_account_created'] = df_users.date_account_created.dt.week
df_users['weekday_account_created'] = df_users.date_account_created.dt.weekday
df_users['day_account_created'] = df_users.date_account_created.dt.day

# timestamp_first_active
df_users['date_first_active'] = pd.to_datetime((df_users.timestamp_first_active // 1000000), format='%Y%m%d')
df_users['year_first_active'] = df_users.date_first_active.dt.year
df_users['month_first_active'] = df_users.date_first_active.dt.month
df_users['week_first_active'] = df_users.date_first_active.dt.week
df_users['weekday_first_active'] = df_users.date_first_active.dt.weekday
df_users['day_first_active'] = df_users.date_first_active.dt.day

# cleanup
# date_first_booking isn't populated in the test set so this feature can't be used 
# and I'm done with the orignal date fields
drop_list = ['date_account_created','timestamp_first_active','date_first_active','date_first_booking']
df_users.drop(drop_list, axis=1, inplace=True)

#One-hot-encoding features
ohe_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_features:
    df_encodings = pd.get_dummies(df_users[f], prefix=f)
    df_users = df_users.drop([f], axis=1)
    df_users = pd.concat((df_users, df_encodings), axis=1)
    
    
### check impact of changes ###
print("df_users => observations: %0.0f; features: %0.0f" % np.shape(df_users))


### setup ml structure ###
le = LabelEncoder()
labels = df_users['country_destination'].values
y = le.fit_transform(labels) 
features = df_users.drop(['id','country_destination'], axis=1)

print("destination encoding:")
print(list(zip(le.classes_,range(0,len(y)))))

df_users => observations: 275547; features: 167
destination encoding:
[(-1, 0), ('AU', 1), ('CA', 2), ('DE', 3), ('ES', 4), ('FR', 5), ('GB', 6), ('IT', 7), ('NDF', 8), ('NL', 9), ('PT', 10), ('US', 11), ('other', 12)]


### Feature Scaling

$x' = x - x_{min} / x_{max} - x_{min}$

### test/train split
after the transfromations and feature engineering has been performed on the combination of the training and the test set, these two data sets are split out once more

In [132]:
# split train and test
cutt_off = df_train.shape[0]
X_train, X_test, y_train, y_test = features[:cutt_off], y[:cutt_off], features[cutt_off:], y[cutt_off:]

### create scorer

In [109]:
# Simulate NDCG scorer used by Kaggle competition
def ndcg_wrapper(y_true,y_pred_proba):
    Y = np.fliplr(y_pred_proba.argsort())
      
    R = []
    NDCG = []
    for i in range(0,y_true.size):
        r = (Y[i,:]==y_true[i]).astype(int)
        R.append(r)
        NDCG.append(ndcg_at_k(r,5,method=1))
    return np.mean(NDCG)

ndcg_scorer = make_scorer(ndcg_wrapper, greater_is_better=True, needs_proba=True)

for i in range(0,5):
    print("Correct Destination in Position %d: NDGG = %.3f" % (i,ndcg_at_k([0]*i+[1],5,1)))

Correct Destination in Position 0: NDGG = 1.000
Correct Destination in Position 1: NDGG = 0.631
Correct Destination in Position 2: NDGG = 0.500
Correct Destination in Position 3: NDGG = 0.431
Correct Destination in Position 4: NDGG = 0.387


# Benchmark

In [133]:
# any of the algorithms built beyond this one should at the very least improve on this attempt 
Dummy = DummyClassifier(strategy='prior').fit(X_train,y_train)
dummy_score = ndcg_scorer(Dummy, X_test, y_test)

print('NDCG score for Dummy Estimator: {0:.4f}'.format(dummy_score))



AttributeError: 'list' object has no attribute 'argsort'

# Logistic Regression

In [None]:
encoded_train_labels = pd.DataFrame(train_labels,columns = ['country_destination'])
encoded_train_labels['new_code'] = np.where((encoded_train_labels.country_destination == 'US'), 1,0)
encoded_train_labels = encoded_train_labels.new_code
encoded_train_labels.head()

In [None]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_set, train_labels, test_size=0.3, random_state=0)

In [None]:
print "training: %i, %i" % (X_train.shape[0],y_train.shape[0])
print "test: %i, %i" % (X_test.shape[0],y_test.shape[0])

In [None]:
clf = LogisticRegressionCV('l2',C=1.0)
clf.fit(X_train, y_train)

In [None]:
clf.predict(X_test)

In [None]:
clf.score(X_test, y_test)

# Random Forest

Characteristics:
* low bais
* high variance
* prone to overfitting

Tuning Parameters:
* number of trees
* number of features to consider at each split
* depth of trees

In [None]:
RF = RandomForestClassifier(n_estimators=200,n_jobs=-1,class_weight='balanced',oob_score=True)
CV_score = cross_val_score(RF,X,y,scoring=ndcg_scorer, cv=5, verbose=2)

print('CV scores = ',CV_score)
print('Mean CV score = ', np.mean(CV_score))

# XGBClassifier

In [None]:
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  



### Add more features 
In order to see whether adding session data makes a difference

In [None]:
# sessions
sessions.rename(columns = {'user_id': 'id'}, inplace=True)