# TRAINING 3 DIFFERENT REGRESSION MODELS: SVR, DECISION TREES AND XGB

In [8]:
import pandas as pd
import numpy as np
from numpy import genfromtxt

from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

from sklearn.externals import joblib
from sklearn.metrics import make_scorer

I'm using the all the six features from the weather features dataset here, but the most important features can be selected through feature selection

In [9]:
dftrain_weather = pd.read_csv("train_weather.csv")
dftrain_weather.head()

Unnamed: 0,avg_temp,snow_fall,snow_depth,min_temp,max_temp,precipitation
0,45.0,0,0,42,48,0.89
1,72.0,0,0,60,84,0.0
2,24.5,0,T,19,30,0.0
3,36.0,0,0,26,46,0.0
4,42.5,0,0,36,49,0.0


In [10]:
df_train = pd.read_csv("train.csv")
y = np.array(df_train.trip_duration)

In [11]:
X = genfromtxt("train_weather.csv", delimiter=',', skip_header = 1)
nans_loc = np.isnan(X)
X[nans_loc] = 0 

## SCORING FUNCTION

Defining RMSLE function as given on Kaggle.

In [17]:
def rmsle(real,predicted):
    sum=0.0
    
    for x in range(len(predicted)):
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

loss  = make_scorer(rmsle, greater_is_better=False)  #Call this during evaluation

In the sklearn implementation, setting the 'greater_is_better' parameter determines whether a higher score is better or a lower score (loss) is better. For rmsle, the lower the score, the better. Setting this parameter to 'False' simply negates the output thus picking the best estimator correctly

## SVR 

In [18]:
regressor = SVR(kernel='linear')
scores = cross_val_score(regressor, X[:5], y[:5], cv=3, scoring = loss) #Running for a 1000 datapoints

  """
  import sys
  


In [19]:
scores

array([        nan, -1.39389201, -0.04469059])

We still need to fit the model to the training set. Cross_val_score does not automatically fit the model to the training set

In [20]:
regressor.fit(X[:5], y[:5])

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [21]:
joblib.dump(regressor, 'SVR_model.pkl') #Save the model

['SVR_model.pkl']

## DECISION TREE REGRESSION

To determine the best depth for the Decision Tree, cross-validation can be applied

In [22]:
parameters = {'max_depth':range(2,5)} #Depth parameter range
regressor = DecisionTreeRegressor()

In [23]:
estimator = GridSearchCV(regressor, parameters, cv=3, scoring = loss)
estimator.fit(X[:5],y[:5])

GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 3, 4]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True,
       scoring=make_scorer(rmsle, greater_is_better=False), verbose=0)

In [24]:
estimator.cv_results_['mean_test_score']

array([-0.57084226, -0.57662686, -0.57042816])

Now we can pick the best estimator from the cross-validation results. Based on the above results, it should be the tree with max_depth = 4 (least negative loss)

In [48]:
best_regressor = estimator.best_estimator_  
joblib.dump(best_regressor, 'DT_regressor.pkl') 

['DT_regressor.pkl']

Just doing a quick sanity check to make sure its the correct tree:

In [49]:
best_regressor

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

The max_depth parameter is 4 as expected

Optionally, cross-validation can be performed again on the best estimator model as well. I've seen this in some of the sklearn examples though I don't think it may be necessary. 

In [32]:
scores = cross_val_score(best_regressor, X[:5], y[:5], cv=3, scoring = loss) #Running for a 1000 datapoints
scores

array([-0.29912331, -1.12001857, -0.01385703])

## XGB

There are several parameters involved. I've set some of these parameters below. By default, xgb uses Decision Trees as the base classifier but it can be modified by setting the 'booster' parameter to 'gblinear' which uses a linear function or 'dart' which is another tree based model. Depending on which of these models is used, the parameters involved vary. I've used the default gbtree for regression.

In [26]:
params = {'max_depth':2, 'learning_rate':0.3, 'silent':True, 'objective':'reg:linear'} #Default booster:'gbtree'

#OR to explore a range of depth parameters
#params = {'max_depth':range(4)} 

max_depth: The maximum tree depth to use (specific to tree based models). It is possible to find the best max_depth parameter using the GridSearch method since the following model instantiation is tied to the Sklearn API but for now it is simply set to 2

learning_rate or eta: Analogous to learning rate and determines by how much the feature weights are modified at each step of boosting

silent: Analogous to verbose setting. When set to True, it doesn't print intermediate messages, otherwise the output is always printed at each step

objective: The objective function to use. Since we have a regression problem, it is set to reg:linear. It can be set to other parameters such as 'binary:logistic' for binary classification problem

In [60]:
#Optionally set number of parallel threads to run xgboost
#params['nthread'] = 4

The 'OR' section below is for max depth parameter selection using GridSearchCV

In [27]:
regressor = xgb.XGBRegressor(**params) 
scores = cross_val_score(regressor, X[:5], y[:5], cv=3, scoring = loss)

#OR
#estimator = GridSearchCV(regressor, params, cv=3) 
#estimator.fit(X[:5],y[:5])

In [28]:
scores.mean()

#OR
#print "Mean Test Score: " + estimator.cv_results_['mean_test_score']

-0.48799756919995785

In [30]:
regressor.fit(X[:5],y[:5])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.3, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [31]:
joblib.dump(regressor, 'xgb_model.pkl') 

#OR
#best_regressor = estimator.best_estimator_  
#joblib.dump(best_regressor, 'xgb_model.pkl') 

['xgb_model.pkl']

## Testing time

In [39]:
df_test = pd.read_csv("test.csv")
trip_id = df_test.id[:5] #Save trip ID

Load test data and run desired model on it

In [4]:
dftest_weather = pd.read_csv("test_weather.csv")
dftest_weather.head()

Unnamed: 0,average temperature,snow fall,snow depth,minimum temperature,maximum temerature,precipitation
0,76.5,0,0,68,85,0
1,76.5,0,0,68,85,0
2,76.5,0,0,68,85,0
3,76.5,0,0,68,85,0
4,76.5,0,0,68,85,0


In [5]:
X = genfromtxt("test_weather.csv", delimiter=',', skip_header = 1)
nans_loc = np.isnan(X)
X[nans_loc] = 0 

### SVR

In [36]:
model = joblib.load('SVR_model.pkl') 
predictions = model.predict(X[:5])
predictions

array([ 455.10030321,  662.90033243,  337.81807834,  429.09933513,
        454.71857699])

### DECISION TREE

In [51]:
model = joblib.load('DT_Regressor.pkl') 
predictions = model.predict(X[:5])
predictions

array([ 455.10030321,  662.90033243,  337.81807834,  429.09933513,
        454.71857699])

### XGB

In [38]:
model = joblib.load('XGB_model.pkl') 
predictions = model.predict(X[:5])
predictions

array([  454.99993896,   662.99890137,  2123.99926758,   429.00183105,
         434.99981689], dtype=float32)

The XGB and Decision Tree models gave the EXACT same answers even though XGB was configured at max_depth = 2 and best Decision Tree that we found was at depth 4 

### SUBMISSION FORMAT

Finally, put the result in the submission format and ta-da!!

In [45]:
frames = [trip_id, pd.DataFrame({'trip_duration':predictions})]
result = pd.concat(frames, axis = 1)
result

Unnamed: 0,id,trip_duration
0,id3004672,454.999939
1,id3505355,662.998901
2,id1217141,2123.999268
3,id2150126,429.001831
4,id1598245,434.999817
