In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import ShuffleSplit
from sklearn.externals import joblib
from sklearn import svm
from sklearn import decomposition
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from time import time
import json
import os
# The usual preamble
%matplotlib inline
import matplotlib.pyplot as plt

# Make the graphs a bit prettier, andigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)

mpl_style had been deprecated and will be removed in a future version.
Use `matplotlib.pyplot.style.use` instead.

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
path = os.path.dirname(os.getcwd())
month = ['03']
isWeekday = True

def weekdayOrWeekend(df,isWeekday):
    df['day_of_week'] = pd.to_datetime(df['tpep_pickup_datetime']).dt.dayofweek
    if isWeekday:
        # remove weekend
        df = df[df['day_of_week'] <= 4]
    else:
        # remove weekday
        df = df[df['day_of_week'] > 4]

for i in month:
    
    if os.path.exists('trainData/train_%s.csv'%i):  
        continue
    
    # processing taxi data
    taxi = pd.read_csv(path+'/PreProcessTaxiData/cleanData_yellow_tripdata_2016-%s.csv'%i)
    taxi = taxi[['tpep_pickup_datetime','trip_distance','duration','total_amount']]
    taxi.index = taxi['tpep_pickup_datetime'].str.slice(8,13)
    taxi.index.names=['Time']
    weekdayOrWeekend(taxi,isWeekday)
    del taxi['day_of_week']
    del taxi['tpep_pickup_datetime']
    taxi = taxi[taxi['duration']>0]
    taxi = taxi[taxi['total_amount']>0]
    
    # processing weather data
    weather = pd.read_csv(path+'/WeatherProcess/weather_processed_again/weather_2016_%s.csv'%i,index_col='Time')
    weather.drop(['Humidity','Barometer'],axis=1,inplace=True)
    weather.index = weather.index.str.slice(0,5)
    
    # concat two dataframes
    result = taxi.join(weather)
    result.index = result.index.str.slice(3,5)
    del taxi
    del weather
    
    result.to_csv('trainData/train_%s.csv'%i,mode='w')

In [3]:
i = '03'
result = pd.read_csv('trainData/train_%s.csv'%i,index_col='Time')

  mask |= (ar1 == a)


In [4]:
result.head()

Unnamed: 0_level_0,trip_distance,duration,total_amount,Temp,Weather,Wind,Visibility
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2.5,475.0,12.35,48,clear,9,10.0
0,2.9,666.0,15.35,48,clear,9,10.0
0,19.98,1866.0,63.8,48,clear,9,10.0
0,0.7,299.0,8.8,48,clear,9,10.0
0,7.18,1445.0,28.0,48,clear,9,10.0


In [5]:
length = len(result)
df = result.iloc[random.sample(range(0, length), int(length*0.0001))]

In [6]:
vecX = [df[['Weather']].iloc[index].to_dict() for index in range(len(df))]
if os.path.exists('models/vec.pkl'):
    vec = joblib.load('models/vec.pkl')
    dummyX = vec.transform(vecX).toarray()
else:
    vec = DictVectorizer()    
    dummyX = vec.fit_transform(vecX).toarray()
    joblib.dump(vec, 'models/vec.pkl')
print(vec.get_feature_names())


['Weather=clear', 'Weather=cloud', 'Weather=cold', 'Weather=fog', 'Weather=light rain', 'Weather=light snow']


In [7]:
vecX = pd.DataFrame(dummyX,columns=vec.get_feature_names())
vecX.index = df.index
df = pd.concat([df,vecX],axis=1)

In [8]:
df.head()

Unnamed: 0_level_0,trip_distance,duration,total_amount,Temp,Weather,Wind,Visibility,Weather=clear,Weather=cloud,Weather=cold,Weather=fog,Weather=light rain,Weather=light snow
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
11,1.92,605.0,9.8,42,clear,6,10.0,1.0,0.0,0.0,0.0,0.0,0.0
21,2.03,608.0,11.3,41,cloud,9,10.0,0.0,1.0,0.0,0.0,0.0,0.0
14,2.74,1401.0,19.56,56,clear,6,10.0,1.0,0.0,0.0,0.0,0.0,0.0
13,1.4,1270.0,15.3,51,clear,7,10.0,1.0,0.0,0.0,0.0,0.0,0.0
23,1.42,419.0,8.3,64,cloud,3,10.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
del df['Weather']

In [10]:
y = np.array(df['total_amount'])
del df['total_amount']
X = np.array(df)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
def gridSearchCV(clf, param_grid,score=make_scorer(r2_score)):
    # scoring = {'evc': make_scorer(explained_variance_score), 'r2_score': make_scorer(r2_score), 'mse': make_scorer(mean_squared_error)}
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=score)
    start = time()
    grid_search.fit(X_train, y_train)
    bestScore = round(grid_search.best_score_, 4)
    parameters = grid_search.best_params_
    print("Best Score: " + str(bestScore) + "\nParameters: " + str(parameters))
    return bestScore, parameters

# return the optimal model
def optFunc(theModel,theParams):
    tempParam = list()
    for key, value in theParams.iteritems():
        if isinstance(value, str):
            tempParam.append(str(key) + "='" + value + "'") 
            continue
        tempParam.append(str(key) + "=" + str(value)) 
    theParams = ",".join(tempParam)
    return eval(theModel + theParams + ")")

# SVR RBF kernel
svr_rbf = svm.SVR(kernel='rbf')
param_grid = {"C": [1, 10, 100, 1000], "gamma": np.logspace(-2, 2, 5)}
svr_rbf_score, parameters = gridSearchCV(svr_rbf, param_grid)
svr_rbf = optFunc('svm.SVR(kernel="rbf", ', parameters)

# Gradient Boosting regression
gbr = GradientBoostingRegressor()
param_grid = {'n_estimators': [500, 700, 900], 'max_depth': [1, 2], 'min_samples_split': [2], 
          'learning_rate': [0.01, 0.1], 'loss': ['lad']}
gbr_score, parameters = gridSearchCV(gbr, param_grid)
gbr = optFunc('GradientBoostingRegressor(', parameters)

# Kernel Ridge Regression
krr = KernelRidge(kernel='rbf')
param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)}
krr_score, parameters = gridSearchCV(krr, param_grid)
krr = optFunc('KernelRidge(kernel="rbf", ', parameters)

# DecisionTreeRegressor
dtr = DecisionTreeRegressor()
param_grid = {'max_depth': [2, 4, 6, 8]}
dtr_score, parameters = gridSearchCV(dtr, param_grid)
dtr = optFunc('DecisionTreeRegressor(', parameters)

# KNeighborsRegressor
knr = KNeighborsRegressor()
param_grid = {'n_neighbors': [2, 4, 6, 8]}
knr_score, parameters = gridSearchCV(knr, param_grid)
knr = optFunc('KNeighborsRegressor(', parameters)

# RandomForestRegressor
rfr = RandomForestRegressor()
param_grid = {'max_depth': [2, 4, 6, 8], 'n_estimators': [20, 50, 150, 500]}
rfr_score, parameters = gridSearchCV(rfr, param_grid)
rfr = optFunc('RandomForestRegressor(', parameters)

# AdaBoostRegressor
adr = AdaBoostRegressor()
param_grid = {'n_estimators': [20, 50, 150, 500]}
adr_score, parameters = gridSearchCV(adr, param_grid)
adr = optFunc('AdaBoostRegressor(', parameters)

# BaggingRegressor
br = BaggingRegressor()
param_grid = {'n_estimators': [20, 50, 150, 500]}
br_score, parameters = gridSearchCV(br, param_grid)
br = optFunc('BaggingRegressor(', parameters)

Best Score: 0.378
Parameters: {'C': 100, 'gamma': 0.01}
Best Score: 0.9391
Parameters: {'min_samples_split': 2, 'loss': 'lad', 'learning_rate': 0.01, 'n_estimators': 900, 'max_depth': 2}
Best Score: -0.7431
Parameters: {'alpha': 0.001, 'gamma': 0.01}
Best Score: 0.9143
Parameters: {'max_depth': 6}
Best Score: 0.7547
Parameters: {'n_neighbors': 8}
Best Score: 0.9404
Parameters: {'n_estimators': 50, 'max_depth': 6}
Best Score: 0.924
Parameters: {'n_estimators': 500}
Best Score: 0.9377
Parameters: {'n_estimators': 500}


In [12]:
# #############################################################################
# select model with minimized mse and save model
models = [(svr_rbf,'SVR rbf kernel'),\
          (gbr,'Gradient Boosting regression'),(krr,'kernel ridge regression'),\
         (dtr,'DecisionTreeRegressor'),(knr,'KNeighborsRegressor'), (rfr,'RandomForestRegressor'),\
         (adr,'AdaBoostRegressor'),(br,'BaggingRegressor')]
scores = [svr_rbf_score, gbr_score, krr_score, dtr_score,\
          knr_score, rfr_score, adr_score, br_score]

index = max(enumerate(scores),key=lambda x: x[1])[0]
model = models[index][0]
modelName = models[index][1]
print 'model: ' + modelName

# #############################################################################
# Visualize learning curves
def plot_learning_curve(model, modelName):
    train_sizes, train_scores, test_scores = learning_curve(estimator=model, X=X_train, y=y_train, \
                                                            train_sizes=np.linspace(0.1, 1.0, 10), cv=5, n_jobs=4)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # plot in a new graph
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training score')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation score')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    plt.grid(True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Score')
    plt.title('Learning Curves (%s)'%modelName)
    plt.legend(loc='best')

# #############################################################################
# plor the result
def plot_result(y_pred, modelName):
    x = range(0, len(y_test))
    plt.scatter(x, y_test, color='darkorange', label='Ground truth')
    plt.scatter(x, y_pred, color='navy')
    plt.plot(x, y_pred, color='navy', label='Predict value')
    plt.title(modelName)
    plt.xlabel('time_by_hour')
    plt.ylabel('pickup_number')
    plt.legend()
    
# #############################################################################
# Predict on test samples
plt.figure()
plt.subplot(1, 2, 1)
plot_learning_curve(model, modelName)

# fit the model on train samples
model.fit(X_train, y_train)

#     # save the model
#     joblib.dump(model, r'models/' + filename[:-4] + '.pkl')
#     dic = {'y_max':y_max, 'y_min':y_min, 'x_max':x_max, 'x_min':x_min}
#     with open(r'models/' + filename[:-4] + '_coefficient.js','w') as f:
#         json.dump(dic, f)

plt.subplot(1, 2, 2)
plot_result(model.predict(X_test)*y_max+y_min, modelName)
plt.show()

for each in models:
    plt.figure()
    plt.subplot(1, 2, 1)
    plot_learning_curve(each[0], each[1])
    each[0].fit(X_train, y_train)
    plt.subplot(1, 2, 2)
    plot_result(each[0].predict(X_test)*y_max+y_min, each[1])
    plt.show()


model: RandomForestRegressor
