# Test Machine Learning algorithms

In [2]:
# load libraries that will be used
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from mpl_toolkits.basemap import Basemap

# make figures better:
# Apply the style sheet FIRST. A style sheet rewrites rcParams (the ggplot
# sheet sets its own font.size), so calling it after plt.rc() would silently
# undo the overrides below.
plt.style.use('ggplot')

font = {'weight':'normal','size':20}
plt.rc('font', **font)
plt.rc('figure', figsize=(9.0, 6.0))
plt.rc('xtick.major', pad=10) # xticks too close to border!

#print(plt.style.available)

In [3]:
# unzip and load data into memory
# The script runs in the notebook namespace and prints one
# "name => rows; columns" summary line per DataFrame it creates
# (df_train, df_test, df_users, ...); later cells use those names directly.
%run load.py

df_countries => rows: 10; columns: 7
df_country_demographics => rows: 420; columns: 5
df_user_sessions => rows: 10567737; columns: 6
df_train => rows: 213451; columns: 16
df_test => rows: 62096; columns: 15
df_users => rows: 275547; columns: 16


In [4]:
# clean up data, split and format dates
# NOTE(review): transform.py is not visible from this notebook; the summary
# above is the original author's description -- confirm against the script.
%run transform.py

In [5]:
# transform
# Keep the labels aside, derive calendar parts from the two date fields, then
# drop the non-feature columns and encode missing values as -1.
target = df_train['country_destination'].values

# date_account_created -> datetime, then one column per calendar part
df_train['date_account_created'] = pd.to_datetime(df_train['date_account_created'])
for part in ('year', 'month', 'day'):
    df_train['creation_' + part] = getattr(df_train['date_account_created'].dt, part)

# timestamp_first_active: integer division by 10**6 strips the last six
# digits; what remains is parsed with the %Y%m%d format
first_active_day = df_train['timestamp_first_active'] // 1000000
df_train['date_first_active'] = pd.to_datetime(first_active_day, format='%Y%m%d')
for part in ('year', 'month', 'day'):
    df_train['active_' + part] = getattr(df_train['date_first_active'].dt, part)

# Columns excluded from the model inputs: the id, the label itself and the
# raw date/timestamp columns.
non_feature_columns = [
    'id',
    'country_destination',
    'date_account_created',
    'timestamp_first_active',
    'date_first_active',
    'date_first_booking',
]

# "-unknown-" is treated as missing; every missing value is then set to -1.
features = (
    df_train
    .drop(non_feature_columns, axis=1)
    .replace("-unknown-", np.nan)
    .fillna(-1)
)

In [6]:
# One-hot encode the categorical columns. Values filled with -1 in the
# previous cell get their own indicator column (e.g. `gender_-1`).
encoded_features = pd.get_dummies(features)

In [7]:
# Preview the first rows to check the numeric columns and the generated
# indicator columns.
encoded_features.head()

Unnamed: 0,age,signup_flow,creation_year,creation_month,creation_day,active_year,active_month,active_day,gender_-1,gender_FEMALE,...,first_browser_SeaMonkey,first_browser_Silk,first_browser_SiteKiosk,first_browser_SlimBrowser,first_browser_Sogou Explorer,first_browser_Stainless,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser
0,-1,0,2010,6,28,2009,3,19,1,0,...,0,0,0,0,0,0,0,0,0,0
1,38,0,2011,5,25,2009,5,23,0,0,...,0,0,0,0,0,0,0,0,0,0
2,56,3,2010,9,28,2009,6,9,0,1,...,0,0,0,0,0,0,0,0,0,0
3,42,0,2011,12,5,2009,10,31,0,1,...,0,0,0,0,0,0,0,0,0,0
4,41,0,2010,9,14,2009,12,8,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# train/test dataset 70/30 split
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; its replacement is `sklearn.model_selection`. Fall back to the old
# module only on installs that predate the rename.
try:
    from sklearn import model_selection
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation as model_selection
# Keep the historical name bound in case other cells still reference it.
cross_validation = model_selection

X, y = encoded_features, target
feature_names = list(encoded_features)
# Class names must be the DISTINCT labels in sorted order (the order the
# classifier stores them in), not the per-row label vector: export_graphviz
# indexes this list by class position.
target_name = np.unique(target).tolist()

# Fixed random_state so the split is identical on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0)

In [9]:
# Seed NumPy's global random number generator so later code that draws from
# it produces the same numbers on every fresh run.
np.random.seed(0)

## Decision Tree

In [None]:
from sklearn import tree

# Fit on the training portion only, so X_test / y_test stay unseen and can be
# used for an honest evaluation. An explicit random_state makes the tree
# reproducible even when this cell is re-run on its own.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

In [None]:
# Render the fitted decision tree inline as a PNG via Graphviz.
try:
    # Older scikit-learn releases bundle six.
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23.
    from io import StringIO
import pydot
from IPython.display import Image

# NOTE(review): the tree is grown without a depth limit, so the rendered
# graph may be very large; consider passing `max_depth` to export_graphviz.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=feature_names,
                     # The classifier's own sorted class labels, so each node
                     # is annotated with the correct class name.
                     class_names=[str(label) for label in clf.classes_],
                     filled=True, rounded=True,
                     special_characters=True)

# pydot >= 1.2 returns a list of graphs; older versions return a single graph.
graphs = pydot.graph_from_dot_data(dot_data.getvalue())
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
Image(graph.create_png())

## Random Forest

Characteristics:
* low bias
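* high variance (for each individual deep tree; averaging many trees is what reduces it)
* prone to overfitting (again for a single tree; the ensemble is less so)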

Tuning Parameters:
* number of trees
* number of features to consider at each split
* depth of trees