In [3]:
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
from unicodedata import normalize
import re, copy, random, time, csv, math
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from uberPrice import getPriceNow
from create_weather_features import init_weather_features, create_weather_features
from create_date_features import create_date_features
from time import strftime, strptime
from sklearn.model_selection import train_test_split

In [4]:
#Step 1, Read the dataset with features, clean any unecessary stuff
cab_data = pd.read_csv("data_with_features.csv")
del cab_data["Unnamed: 0"]
del cab_data["Unnamed: 0.1"]
del cab_data["Unnamed: 0.1.1"]
#Print columns to validate
print(cab_data.columns.values)

['dropoff_latitude' 'dropoff_longitude' 'pickup_latitude'
 'pickup_longitude' 'total_amount' 'tpep_dropoff_datetime'
 'tpep_pickup_datetime' 'start_latlng' 'end_latlng' 'distance'
 'tpep_dropoff_datetime_years' 'tpep_dropoff_datetime_months'
 'tpep_dropoff_datetime_days' 'tpep_dropoff_datetime_hours'
 'tpep_dropoff_datetime_minutes' 'tpep_dropoff_datetime_seconds'
 'tpep_dropoff_datetime_day_of_week' 'tpep_dropoff_datetime_is_holiday'
 'tpep_pickup_datetime_years' 'tpep_pickup_datetime_months'
 'tpep_pickup_datetime_days' 'tpep_pickup_datetime_hours'
 'tpep_pickup_datetime_minutes' 'tpep_pickup_datetime_seconds'
 'tpep_pickup_datetime_day_of_week' 'tpep_pickup_datetime_is_holiday'
 'duration' 'Mean Temperature' 'Max Temperature' 'Min Temperature'
 'Dew Point' 'Average Humidity' 'Precipitation' 'Snow' 'Wind Speed'
 'Visibility']


In [5]:
#Filter out any nonzero price values
price_filter = cab_data["total_amount"] > 0
cab_data = cab_data[price_filter]

print("Data size: ", len(cab_data))

#Filter out any negative distance values
dist_filter = cab_data['distance'] > 0
cab_data = cab_data[dist_filter]

#Add the log total amount (for price verification)
cab_data["log_total_amount"] = np.log(cab_data["total_amount"] + 1)

data_size = len(cab_data)
print("Data size: ", data_size)

Data size:  982936
Data size:  978979


In [6]:
def create_features(df):
    features = []
    feature_names = ['distance', 'duration', 'tpep_dropoff_datetime_years', 'tpep_dropoff_datetime_months',
       'tpep_dropoff_datetime_days', 'tpep_dropoff_datetime_hours',
       'tpep_dropoff_datetime_minutes', 'tpep_dropoff_datetime_seconds',
       'tpep_dropoff_datetime_day_of_week',
       'tpep_dropoff_datetime_is_holiday', 'tpep_pickup_datetime_years',
       'tpep_pickup_datetime_months', 'tpep_pickup_datetime_days',
       'tpep_pickup_datetime_hours', 'tpep_pickup_datetime_minutes',
       'tpep_pickup_datetime_seconds', 'tpep_pickup_datetime_day_of_week',
       'tpep_pickup_datetime_is_holiday', 'Mean Temperature',
       'Max Temperature', 'Min Temperature', 'Dew Point',
       'Average Humidity', 'Precipitation', 'Snow', 'Wind Speed',
       'Visibility']
    for feature_name in feature_names:
        features.append(df[feature_name])
    
    X = np.array(features).T
    y = df["log_total_amount"].values
    return (X,y)

In [7]:
def create_train_test_split(X,y):
    X_, X_test, y_, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)
    X_train, X_cv, y_train, y_cv = train_test_split(X_, y_, test_size = 0.3, random_state=0)
    
    return (X_train, y_train, X_cv, y_cv, X_test, y_test)

In [8]:
(X,y) = create_features(cab_data)
(X_train, y_train, X_cv, y_cv, X_test, y_test) = create_train_test_split(X,y)

Okay great! The data is clean and is ready to be processed!

The next step is to initialize the model and train on our dataset.

In [9]:
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor

# Apply PCA
def applyPCA(X_train, X_test, numComponents):
    
    # Initialize the PCA class and pass in the number of dimensions to which you want to reduce your data to.
    pca = PCA(n_components = numComponents)

    # Fit the training data to the PCA model.
    pca.fit(X_train)

    # Reduce the dimensionality of the training set.
    newX_train = pca.transform(X_train)

    # Reduce the dimensionality of the testing set.
    newX_test = pca.transform(X_test)
    
    return newX_train, newX_test

# Train the PCA and get the right number of 
def PCAWithLinearRegression(X_train, y_train, X_test, y_test):
    numCols = X_train.shape[1]
    errors = []
    print(numCols)
    for i in range(numCols):
        print(i+1)
        pcaX_train, pcaX_test = applyPCA(X_train, X_test, i+1)
        (model, y_pred, error) = train_linear_regression(pcaX_train, y_train, pcaX_test, y_test)
        errors.append(error)
    return errors

# Graphs above determine that 16 is the best number of components
def PCAwithGradientBoostingAndLR(X_train, X_test, y_train, numComponents):
    
    pcaX_train, pcaX_test = applyPCA(X_train, X_test, numComponents)
    clf = GradientBoostingRegressor()
    clf.fit(pcaX_train, y_train)
    modelResults = clf.predict(pcaX_test)
    
    return modelResults

In [38]:
#Step 1: Determine interval for sampling Uber API
time_period = 2*60 #In X hrs * 60 minutes 
sample_rate = 5 #In 1 sample per *sample_rate* minutes
num_samples = time_period // sample_rate
print("We will be making %d calls to uber API." % num_samples)

#Step 2: Get random subset of data to use as test points
sample_data = cab_data.sample(n=num_samples)
sample_data = sample_data[['start_latlng', 'end_latlng', 'distance', 'duration']]
#print(sample_data)

We will be making 24 calls to uber API.


In [39]:
#List of features we are using in our model
feature_names = ['distance', 'duration', 'tpep_dropoff_datetime_years', 'tpep_dropoff_datetime_months',
       'tpep_dropoff_datetime_days', 'tpep_dropoff_datetime_hours',
       'tpep_dropoff_datetime_minutes', 'tpep_dropoff_datetime_seconds',
       'tpep_dropoff_datetime_day_of_week',
       'tpep_dropoff_datetime_is_holiday', 'tpep_pickup_datetime_years',
       'tpep_pickup_datetime_months', 'tpep_pickup_datetime_days',
       'tpep_pickup_datetime_hours', 'tpep_pickup_datetime_minutes',
       'tpep_pickup_datetime_seconds', 'tpep_pickup_datetime_day_of_week',
       'tpep_pickup_datetime_is_holiday', 'Mean Temperature',
       'Max Temperature', 'Min Temperature', 'Dew Point',
       'Average Humidity', 'Precipitation', 'Snow', 'Wind Speed',
       'Visibility']
print("Number of features: ", len(feature_names))

Number of features:  27


In [40]:
def generate_estimates(sample_data):


    data_with_features = []
    uber_estimates = []

    for (idx,row) in sample_data.iterrows():

        ########## CREATE TIME/DATE/WEATHER FEATURES FOR MODEL ESTIMATE ###############
        F = open('price_estimates.csv', 'a')
        
        #Pickup time is now, dropoff is trip duration after pickup
        pickup_time = datetime.now()
        dropoff_time = pickup_time + timedelta(minutes=row['duration'])

        #Format start and end times for feature extraction
        start = datetime(pickup_time.year, pickup_time.month, pickup_time.day)
        start = '%4d-%02d-%02d' % (start.year, start.month, start.day)
        end = datetime(pickup_time.year, pickup_time.month, pickup_time.day + 1)
        end = '%4d-%02d-%02d' % (end.year, end.month, end.day)
        #print(start,end, type(start), type(end))

        start_date = datetime(pickup_time.year, pickup_time.month, pickup_time.day)
        end_date = datetime(pickup_time.year, pickup_time.month, pickup_time.day)
        #print(start_date, end_date)
        weather_date_features = init_weather_features(start_date, end_date)

        #Format pickup/dropoff time
        pickup_time = '%4d-%02d-%02dT%02d:%02d:%f' % (pickup_time.year, pickup_time.month, pickup_time.day, pickup_time.hour, pickup_time.minute, pickup_time.second)
        dropoff_time = '%4d-%02d-%02dT%02d:%02d:%f' % (dropoff_time.year, dropoff_time.month, dropoff_time.day, dropoff_time.hour, dropoff_time.minute, dropoff_time.second)
        #print(pickup_time, dropoff_time)
        curr_row = dict()
        curr_row["tpep_dropoff_datetime"] = str(dropoff_time)[:len(dropoff_time)-3] #strftime('%Y-%m-%dT%H:%M:%S.%f', dropoff_time)
        curr_row["tpep_pickup_datetime"] = str(pickup_time)[:len(pickup_time)-3] #strftime('%Y-%m-%dT%H:%M:%S.%f', pickup_time)

        curr_row['distance'] = row['distance']
        
        #Convert to dataframe and get date  + weather features
        curr_row = pd.DataFrame([curr_row])
        curr_row = create_date_features(curr_row, start, end)
        curr_row = create_weather_features(curr_row, weather_date_features)

        #Write the synthesized row of features to csv file (append)
        curr_row.to_csv("price_features.csv", mode='a',header=False)
        data_with_features.append(curr_row)
        
        #print(curr_row, curr_row['distance'])


        ######### GET UBER ESTIMATE ##############
        start_coord = row['start_latlng'].split(',')
        end_coord = row['end_latlng'].split(',')
        #print(start_coord, end_coord, type(start_coord))
        curr_uber_estimates = getPriceNow(float(start_coord[0]), float(start_coord[1]), float(end_coord[0]), float(end_coord[1]))
        uberX_estimate = curr_uber_estimates[1]

        high_estimate = uberX_estimate['high_estimate']
        low_estimate = uberX_estimate['low_estimate']
        avg_estimate = (high_estimate + low_estimate) / 2

        all_estimates = (high_estimate, low_estimate, avg_estimate)
        
        uber_estimates.append(all_estimates)

        print("uberX estimate: %s" % (avg_estimate))
        F.write(str(all_estimates) + "\n")
        
        ####### COMPARE UBER ESTIMATE WITH MODEL ESTIMATE #########
        F.close()
        time.sleep(sample_rate * 60)
        #time.sleep(1)
    

    
    return (data_with_features, uber_estimates)

In [None]:
features, estimates = generate_estimates(sample_data)
#print(features)
#print(estimates)

2018 5 6
uberX estimate: 9.5
2018 5 6
uberX estimate: 9.5
2018 5 6
uberX estimate: 12.0
2018 5 6
uberX estimate: 9.5
2018 5 6
uberX estimate: 34.0
2018 5 6
uberX estimate: 33.0
2018 5 6
uberX estimate: 44.0


In [26]:
#Check model performance

####### STEP 1: Get model prediction of price (from our synthesized features file) ########
X_test = pd.read_csv('price_features.csv')
del X_test['i']
del X_test['tpep_dropoff_datetime']
del X_test['tpep_pickup_datetime']
#print(list(X_test), len(list(X_test)))
modelResults = PCAwithGradientBoostingAndLR(X_train, X_test, y_train, 16)

In [27]:
print(modelResults)
model_results = [math.exp(r)-1 for r in modelResults]
print(model_results)

[ 1.02160475  1.02160475  2.33850251  0.93060194  0.98890781  1.00718132
  1.00718132  1.00718132  1.00718132  1.89674175  1.00718132  2.19236575
  1.00718132  2.20476499  2.31579407  0.87555673  0.98616196  1.01958056
  1.01958056  1.01958056  1.01958056  1.01958056  1.60907408  1.01958056
  2.20476499  2.20476499  0.90118097  0.93509821  2.24804013  1.01958056
  1.01958056  1.01958056  1.01958056  2.18408074  1.01958056  1.60907408
  2.20476499  2.20476499  0.86326895  0.93114236  1.00130705  1.01958056
  1.01958056  1.01958056  1.01958056  1.01958056  1.01958056  1.01958056
  2.20476499  0.93114236  0.99889631  0.93487737  1.01958056  0.99889631
  1.01958056  1.01958056  1.01958056  1.01958056  1.01958056  2.20476499
  2.20476499  0.90422714  0.87638645  0.93114236  1.01958056  2.22773536
  1.01958056  1.01958056  1.00130705  1.01958056  1.01958056  1.32894759
  2.20476499  0.86326895  0.93060194  0.86326895  1.59080057  0.96315251
  0.99889631  1.01958056  1.01958056  1.01958056  1

In [28]:
####### STEP 2: Get Uber prediction of price (already written in file)
uber_pricing_estimates = []
with open('price_estimates.csv', 'r') as f:
    r = csv.reader(f)
    uber_pricing_estimates = list(r)

uber_pricing_estimates.pop(0)
print(uber_pricing_estimates)

[['(17.0', ' 13.0', ' 15.0)'], ['(23.0', ' 18.0', ' 20.5)'], ['(69.0', ' 55.0', ' 62.0)'], ['(15.0', ' 11.0', ' 13.0)'], ['(14.0', ' 11.0', ' 12.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(19.0', ' 15.0', ' 17.0)'], ['(17.0', ' 13.0', ' 15.0)'], ['(60.0', ' 48.0', ' 54.0)'], ['(11.0', ' 8.0', ' 9.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(13.0', ' 10.0', ' 11.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(61.0', ' 49.0', ' 55.0)'], ['(14.0', ' 10.0', ' 12.0)'], ['(16.0', ' 12.0', ' 14.0)'], ['(12.0', ' 9.0', ' 10.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(20.0', ' 16.0', ' 18.0)'], ['(18.0', ' 14.0', ' 16.0)'], ['(29.0', ' 23.0', ' 26.0)'], ['(11.0', ' 8.0', ' 9.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(13.0', ' 9.0', ' 11.0)'], ['(11.0', ' 8.0', ' 9.5)'], ['(11.0', ' 8.0', ' 9.5)'], ['(66.0', ' 53.0', ' 59.5)'], ['(12.0', ' 9.0', ' 10.5)'], ['(25.0', ' 19.0', ' 22.0)'], ['(12.0', ' 9.0', ' 10.5)'], ['(17.0', ' 13.0', ' 15.0)'], ['(17.0', ' 13.0', ' 15.0)'], ['(11

In [29]:
assert(len(model_results) == len(uber_pricing_estimates))

for i in range(len(model_results)):
    uber_hi, uber_lo, uber_avg = uber_pricing_estimates[i]
    #Weird formatting stuff, ignore for now
    uber_avg = uber_avg[:len(uber_avg)-1]
    uber_hi = uber_hi[1:]

    our_estimate = model_results[i]
    
    print("For trip %d, Model: %f        Uber: %s" % (i, our_estimate, uber_avg))

For trip 0, Model: 1.777649        Uber:  15.0
For trip 1, Model: 1.777649        Uber:  20.5
For trip 2, Model: 9.365702        Uber:  62.0
For trip 3, Model: 1.536035        Uber:  13.0
For trip 4, Model: 1.688297        Uber:  12.5
For trip 5, Model: 1.737873        Uber:  9.5
For trip 6, Model: 1.737873        Uber:  9.5
For trip 7, Model: 1.737873        Uber:  17.0
For trip 8, Model: 1.737873        Uber:  15.0
For trip 9, Model: 5.664146        Uber:  54.0
For trip 10, Model: 1.737873        Uber:  9.5
For trip 11, Model: 7.956377        Uber:  9.5
For trip 12, Model: 1.737873        Uber:  11.5
For trip 13, Model: 8.068120        Uber:  9.5
For trip 14, Model: 9.132966        Uber:  55.0
For trip 15, Model: 1.400211        Uber:  12.0
For trip 16, Model: 1.680925        Uber:  14.0
For trip 17, Model: 1.772032        Uber:  10.5
For trip 18, Model: 1.772032        Uber:  9.5
For trip 19, Model: 1.772032        Uber:  9.5
For trip 20, Model: 1.772032        Uber:  18.0
For trip 