In [22]:
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
from unicodedata import normalize
import re, copy, random, time, csv
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from uberPrice import getPriceNow
from create_weather_features import init_weather_features, create_weather_features
from create_date_features import create_date_features
from time import strftime, strptime

In [2]:
#Step 1: Load the dataset that already contains the engineered features
cab_data = pd.read_csv("data_with_features.csv")
#Drop the three unnecessary "Unnamed" columns (presumably indexes left over from earlier CSV saves - confirm)
cab_data = cab_data.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0.1.1"])
#Show the remaining column names as a sanity check
print(cab_data.columns.values)

['dropoff_latitude' 'dropoff_longitude' 'pickup_latitude'
 'pickup_longitude' 'total_amount' 'tpep_dropoff_datetime'
 'tpep_pickup_datetime' 'start_latlng' 'end_latlng' 'distance'
 'tpep_dropoff_datetime_years' 'tpep_dropoff_datetime_months'
 'tpep_dropoff_datetime_days' 'tpep_dropoff_datetime_hours'
 'tpep_dropoff_datetime_minutes' 'tpep_dropoff_datetime_seconds'
 'tpep_dropoff_datetime_day_of_week' 'tpep_dropoff_datetime_is_holiday'
 'tpep_pickup_datetime_years' 'tpep_pickup_datetime_months'
 'tpep_pickup_datetime_days' 'tpep_pickup_datetime_hours'
 'tpep_pickup_datetime_minutes' 'tpep_pickup_datetime_seconds'
 'tpep_pickup_datetime_day_of_week' 'tpep_pickup_datetime_is_holiday'
 'duration' 'Mean Temperature' 'Max Temperature' 'Min Temperature'
 'Dew Point' 'Average Humidity' 'Precipitation' 'Snow' 'Wind Speed'
 'Visibility']


In [3]:
#Keep only trips whose total fare is strictly positive (drops zero and negative amounts)
price_filter = cab_data["total_amount"].gt(0)
cab_data = cab_data.loc[price_filter]

print("Data size: ", len(cab_data))

#Keep only trips whose distance is strictly positive (drops zero and negative distances)
dist_filter = cab_data["distance"].gt(0)
cab_data = cab_data.loc[dist_filter]

#Remember how many rows survived both filters
data_size = len(cab_data)
print("Data size: ", data_size)

Data size:  982936
Data size:  978979


Okay, great! The data is clean and ready to be processed.

In [16]:
#Step 1: Determine interval for sampling Uber API
time_period = 1*60 #Total sampling window in minutes (X hrs * 60 minutes)
sample_rate = 10 #Minutes per sample, i.e. 1 sample per *sample_rate* minutes
num_samples = time_period // sample_rate
print("We will be making %d calls to uber API." % num_samples)

#Step 2: Get random subset of data to use as test points
#A fixed seed makes the draw repeatable, so the estimates saved later can be matched
#back to the trips that produced them even after a kernel restart
sample_seed = 42
sample_data = cab_data.sample(n=num_samples, random_state=sample_seed)
#Only the trip endpoints, distance and duration are needed downstream
sample_data = sample_data[['start_latlng', 'end_latlng', 'distance', 'duration']]

We will be making 6 calls to uber API.


In [17]:
#Columns fed to the model: trip distance, the date parts of the dropoff and
#pickup timestamps (in that order), then the daily weather measurements
date_parts = ["years", "months", "days", "hours", "minutes", "seconds",
              "day_of_week", "is_holiday"]
weather_columns = ["Mean Temperature", "Max Temperature", "Min Temperature",
                   "Dew Point", "Average Humidity", "Precipitation", "Snow",
                   "Wind Speed", "Visibility"]

feature_names = (
    ["distance"]
    + ["tpep_dropoff_datetime_" + part for part in date_parts]
    + ["tpep_pickup_datetime_" + part for part in date_parts]
    + weather_columns
)
print("Number of features: ", len(feature_names))

Number of features:  26


In [18]:
def _format_trip_timestamp(moment):
    """Render a datetime as 'YYYY-MM-DDTHH:MM:SS.000' (zero-padded, milliseconds fixed at 000)."""
    return moment.strftime('%Y-%m-%dT%H:%M:%S') + '.000'


def compare_estimates(sample_data, wait_seconds=60):
    """Collect model input features and live uberX price estimates for sampled trips.

    Every trip is treated as if it were picked up at the moment it is processed
    (``datetime.now()``) and dropped off ``duration`` minutes later. For each trip
    a one-row dataframe of date and weather features is built and the current
    Uber price estimate between the trip's endpoints is fetched.

    Parameters
    ----------
    sample_data : pandas.DataFrame
        One row per trip. Must provide 'start_latlng' and 'end_latlng' as
        "lat,lng" strings and a numeric 'duration', which is added to the
        pickup time as minutes.
    wait_seconds : float, optional
        Pause between two consecutive trips, in seconds. Defaults to 60, the
        delay this function has always used.

    Returns
    -------
    tuple
        ``(data_with_features, uber_estimates)``: a list with one single-row
        dataframe per trip, and a list of ``(high, low, average)`` uberX
        estimates in the same order. Both lists are empty for empty input.
    """
    data_with_features = []
    uber_estimates = []
    num_trips = len(sample_data)

    for position, (_, row) in enumerate(sample_data.iterrows()):

        ########## CREATE TIME/DATE/WEATHER FEATURES FOR MODEL ESTIMATE ###############

        #Pickup time is now, dropoff is trip duration after pickup
        #float() guards against numpy integer durations, which timedelta rejects
        pickup_time = datetime.now()
        dropoff_time = pickup_time + timedelta(minutes=float(row['duration']))

        #Date window (today -> tomorrow) for date feature extraction.
        #timedelta rolls over month/year ends; building the date with day + 1
        #raised ValueError on the last day of a month.
        start_date = datetime(pickup_time.year, pickup_time.month, pickup_time.day)
        start = start_date.strftime('%Y-%m-%d')
        end = (start_date + timedelta(days=1)).strftime('%Y-%m-%d')

        #Weather features are requested for the pickup day only (start == end)
        #NOTE(review): confirm init_weather_features treats the end date as inclusive
        end_date = datetime(pickup_time.year, pickup_time.month, pickup_time.day)
        weather_date_features = init_weather_features(start_date, end_date)

        #Timestamps for the trip; dropoff first to keep the existing column order
        curr_row = dict()
        curr_row["tpep_dropoff_datetime"] = _format_trip_timestamp(dropoff_time)
        curr_row["tpep_pickup_datetime"] = _format_trip_timestamp(pickup_time)

        #Convert to dataframe and get date  + weather features
        curr_row = pd.DataFrame([curr_row])
        curr_row = create_date_features(curr_row, start, end)
        curr_row = create_weather_features(curr_row, weather_date_features)

        #The single-row dataframe holds the attributes of one trip, to be fed to the model later
        #NOTE(review): the trip's 'distance' is not copied into this row although the model's feature list includes it - confirm
        data_with_features.append(curr_row)

        ######### GET UBER ESTIMATE ##############
        start_coord = row['start_latlng'].split(',')
        end_coord = row['end_latlng'].split(',')
        print(start_coord, end_coord, type(start_coord))
        curr_uber_estimates = getPriceNow(float(start_coord[0]), float(start_coord[1]), float(end_coord[0]), float(end_coord[1]))
        #NOTE(review): assumes the second entry returned by getPriceNow is the uberX product - confirm
        uberX_estimate = curr_uber_estimates[1]

        high_estimate = uberX_estimate['high_estimate']
        low_estimate = uberX_estimate['low_estimate']
        avg_estimate = (high_estimate + low_estimate) / 2

        uber_estimates.append((high_estimate, low_estimate, avg_estimate))

        print("uberX estimate: ", avg_estimate)

        ####### COMPARE UBER ESTIMATE WITH MODEL ESTIMATE #########
        #(not implemented yet - the caller receives both lists and compares them)

        #Space out the API calls; there is nothing to wait for after the last trip
        if position < num_trips - 1:
            time.sleep(wait_seconds)

    return (data_with_features, uber_estimates)

In [19]:
#Run the comparison over the sampled trips, then dump both results for inspection:
#first the per-trip feature frames, then the (high, low, avg) Uber estimates
features, estimates = compare_estimates(sample_data)
for collected in (features, estimates):
    print(collected)

2018 5 5
['40.782475', ' -73.957672'] ['40.776001', ' -73.946869'] <class 'list'>
uberX estimate:  9.5
2018 5 5
['40.781853', ' -73.975639'] ['40.769924', ' -73.863304'] <class 'list'>
uberX estimate:  44.0
2018 5 5
['40.725105', ' -73.999321'] ['40.770691', ' -73.959854'] <class 'list'>
uberX estimate:  25.0
2018 5 5
['40.762566', ' -73.982300'] ['40.748661', ' -74.007050'] <class 'list'>
uberX estimate:  13.5
2018 5 5
['40.774410', ' -73.873047'] ['40.751377', ' -73.976463'] <class 'list'>
uberX estimate:  39.5
2018 5 5
['40.731617', ' -74.004745'] ['40.738029', ' -73.992470'] <class 'list'>
uberX estimate:  12.0
[     tpep_dropoff_datetime     tpep_pickup_datetime  \
0  2018-05-05T15:51:20.000  2018-05-05T15:47:20.000   

   tpep_dropoff_datetime_years  tpep_dropoff_datetime_months  \
0                         2018                             5   

   tpep_dropoff_datetime_days  tpep_dropoff_datetime_hours  \
0                           5                           15   

   tpep_dro

In [31]:
#Write estimates to file, one "(high, low, avg)" tuple per line
#The context manager closes the file even if a write fails
with open('price_estimates.csv', 'w') as estimates_file:
    estimates_file.writelines(str(estimate) + "\n" for estimate in estimates)

#Write features of each trip to file (for later use)
#Concatenate only while `features` is still the list of per-trip frames, so
#re-running this cell does not fail on the already-combined dataframe
if isinstance(features, list):
    features = pd.concat(features)
features.to_csv("test_features.csv")