In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns


In [2]:
data = pd.read_csv("train.csv")

In [3]:
data = data[data["fare_amount"] < 100]
data = data[data["fare_amount"] > 0]

In [None]:
data.shape

In [4]:
#an item this large is too large to work with 
#finding hyper params use 10k
data = data.sample(1000000)
#training for final submission use 1mil

In [None]:
%matplotlib inline
sns.distplot(data["fare_amount"])

In [None]:
#75% of our data exists with fares below $12.50 so it would make sense to do a cutoff at approximately 
#$100 and that should capture everything

#we notice there are a few blips on our distribution chart between $40 and $60
#whether those are legitimate or not is another question, we will look at the other features in our
#data set and start to eliminate records that do not name sense in those cases

In [None]:
#cases to eliminate
#passengers over 8
#we are only dealing with new york city so latitudes and longitudes outside of a small realm is
#not realistic
#latitude more than 42 and less than 35 needs to be removed
#longitude less than -76 or greater than -73 needs to be removed

In [5]:
data = data[(data["pickup_latitude"] <43) & (data["pickup_latitude"] > 39)]
data = data[(data["dropoff_latitude"] <43) & (data["dropoff_latitude"] > 39)]
data = data[(data["pickup_longitude"] < -73) & (data["pickup_longitude"] > - 75)]
data = data[(data["dropoff_longitude"] <-73) & (data["dropoff_longitude"] > -75)]
data = data[(data["passenger_count"] > 0) & (data["passenger_count"] < 7)]

In [None]:
sns.distplot(data["fare_amount"])

In [6]:
from haversine import haversine
def get_distance(row):
    distance = haversine([row["pickup_latitude"], row["pickup_longitude"]],[row["dropoff_latitude"], row["dropoff_longitude"]])
    distance_miles = distance * 3280.84/5280
    return distance_miles

def distance_from_pickup(row, loc_lat, loc_lng):
    distance = haversine([row["pickup_latitude"], row["pickup_longitude"]], [loc_lat, loc_lng])
    distance_miles = distance* 3280.84/5280
    return distance_miles
    
def distance_from_dropoff(row, loc_lat, loc_lng):
    distance = haversine([row["dropoff_latitude"], row["dropoff_longitude"]], [loc_lat, loc_lng])
    distance_miles = distance* 3280.84/5280
    return distance_miles

def check_heading_dt(row):
    if (row["dist_to_wtc_pickup"] > row["dist_to_wtc_dropoff"]) & (row["dist_to_wtc_dropoff"] < 2):
        return 1
    return 0

def heading_to_airport(row):
    if (row["dist_to_lag_pickup"] > row["dist_to_lag_dropoff"]) & (row["dist_to_lag_dropoff"] < 3):
        return 1
    if (row["dist_to_nwk_pickup"] > row["dist_to_nwk_dropoff"]) & (row["dist_to_nwk_dropoff"] <3):
        return 1
    if (row["dist_to_jfk_pickup"] > row["dist_to_jfk_dropoff"]) & (row["dist_to_jfk_dropoff"] < 3):
        return 1
    return 0

In [7]:
central_park = [40.778940,-73.962295]
time_square = [40.758623,-73.985043]
one_world_trade = [40.712613,-74.014262]
laguardia = [40.776288,-73.872115]
newark = [40.693711,-74.179404]
jfk = [40.644195,-73.782446]
data["dist_to_cp_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = central_park[0], loc_lng = central_park[1])
data["dist_to_cp_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = central_park[0], loc_lng = central_park[1])
data["dist_to_ts_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = time_square[0], loc_lng = time_square[1])
data["dist_to_ts_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = time_square[0], loc_lng = time_square[1])
data["dist_to_wtc_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = one_world_trade[0], loc_lng = one_world_trade[1])
data["dist_to_wtc_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = one_world_trade[0], loc_lng = one_world_trade[1])
data["dist_to_lag_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = laguardia[0], loc_lng = laguardia[1]) 
data["dist_to_lag_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = laguardia[0], loc_lng = laguardia[1])
data["dist_to_nwk_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = newark[0], loc_lng = newark[1])
data["dist_to_nwk_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = newark[0], loc_lng = newark[1])
data["dist_to_jfk_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = jfk[0], loc_lng = jfk[1])
data["dist_to_jfk_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = jfk[0], loc_lng = jfk[1])
data["distance_traveled"] = data.apply(get_distance, axis = 1)
data["heading_dt"] = data.apply(check_heading_dt, axis = 1)
data["heading_to_airport"] = data.apply(heading_to_airport, axis= 1)

In [None]:
#distance should be a straight forward measurement for cost of trip as well as teh amount of time the 
#trip took. we have a start time but I don't believe we have a drop off time.

#the next low hanging fruit is obviously pickup time/day of week 
#we would expect rush hour times to have higher fares than late night or weekend

#i wonder if we can plot the lat/lng values for pick up and drop off and see if there are some "hot" 
#areas that would coincide for high prices

In [None]:
import seaborn as sns


def plot_correlation_heatmap(df):
    corr = df.corr()
    
    sns.set(style="white")
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)


    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()
    
    
plot_correlation_heatmap(data)

In [None]:
data.columns

In [None]:
data["pickup_datetime"].head()

In [None]:
test_set = data.head(10)

In [8]:
data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"], infer_datetime_format = True)

In [9]:
from datetime import datetime
data["day_of_week"] = [datetime.weekday(x) for x in data["pickup_datetime"]]
data["hour_24"] = [x.time().hour for x in data["pickup_datetime"]]
data["year"] = [x.year for x in data["pickup_datetime"]]

In [None]:
plt.subplots(figsize = (20,10))
ax = sns.boxplot(x = "year", y = "fare_amount", data = data)
plt.show()

In [None]:
fig,ax= plt.subplots(figsize = (20,10))
data.groupby('year')['fare_amount'].mean().plot(ax = ax)
plt.show()

In [None]:
#there has been an obvious increase in average fare by year

In [None]:
plt.subplots(figsize = (20,10))
ax = sns.boxplot(x = "day_of_week", y = "fare_amount", data = data)
plt.show()

In [None]:
#this isn't particularly helpful because of all the outliers above around $22
fig,ax= plt.subplots(figsize = (20,10))
data.groupby('day_of_week')['fare_amount'].mean().plot(ax = ax)
plt.show()

In [None]:
#there isn't a lot of disparity between days only a range of $.60 on the mean from the lowest average
#fare day to the highest. let's look at time of day

In [None]:
plt.subplots(figsize = (20,10))
ax = sns.boxplot(x = "hour_24", y = "fare_amount", data = data)
plt.show()

In [None]:
#again box plots are hard to distinguish except for the width of the 5am boxplot. 
fig,ax= plt.subplots(figsize = (20,10))
data.groupby('hour_24')['fare_amount'].mean().plot(ax = ax)
plt.show()

In [None]:
#here we see a very large spike around 5 UTC time. this corresponds to roughly midnight on east coast 
#time. this appears to coincide with a fare spike late at night. we see another fare spike during the 
#morning rush hour between 7am and around noon. This appears to be a variable that will be of serious
#use. We also see this is a highly nonlinear relationship. We can attempt to build this into a linear
#model using dummy variables or we can use a decision tree model and not worry about building dummy
#variables

In [None]:
#let's plot our latitude and longitude values to see if high fares are associated with start lat/lng

In [None]:
plt.scatter(data["pickup_longitude"], data["pickup_latitude"], c = data["fare_amount"])

In [None]:

sns.distplot(data["distance_traveled"])
plt.show()

In [None]:
data[data["distance_traveled"] > 100]
#these don't seem right. these people traveled hundreds of miles for less than $25. these should be
#thrown out. let's restrict our latitude and longitudes more at the beginning and see those are removed

In [10]:
data = data[data["distance_traveled"] < 100] 

In [11]:
data["day_of_week"] = pd.Categorical(data["day_of_week"])
data["hour_24"] = pd.Categorical(data["hour_24"])
data["year"] = pd.Categorical(data["year"])

In [12]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

data = create_dummies(data, "day_of_week")
data = create_dummies(data, "hour_24")
data = create_dummies(data, "year")

In [None]:
data.columns

In [None]:
#now that we have all these new columns lets look at our new correlation map to see which features
#are the most important to fare_amount

In [None]:
plot_correlation_heatmap(data)

In [None]:
#here we see a few standouts with latitude and longitude of pickup and dropoff, and distance traveled,
#a few of the hours. we can automate this by just grabbing it by a correlation cutoff

In [13]:
correlations = data.corr()["fare_amount"]

In [14]:
key_corrs = correlations.loc[abs(correlations) > .03]

In [None]:
key_corrs

In [None]:
key_corrs.index

In [15]:
key_cols = [x for x in key_corrs.index if "fare_amount" not in x]
key_cols

['pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'dist_to_cp_pickup',
 'dist_to_cp_dropoff',
 'dist_to_ts_pickup',
 'dist_to_ts_dropoff',
 'dist_to_wtc_pickup',
 'dist_to_wtc_dropoff',
 'dist_to_lag_dropoff',
 'dist_to_nwk_pickup',
 'dist_to_nwk_dropoff',
 'dist_to_jfk_pickup',
 'dist_to_jfk_dropoff',
 'distance_traveled',
 'heading_to_airport',
 'hour_24_5',
 'year_2009',
 'year_2010',
 'year_2011',
 'year_2013',
 'year_2014',
 'year_2015']

In [None]:
#this is probably a good setup to start with, even with hour 5 so low

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

lr = LinearRegression(normalize = True)
scores = cross_val_score(lr, data[key_cols], data['fare_amount'], scoring = "neg_mean_squared_error", cv = 10)
mean_error_lr = scores.mean()
print(mean_error_lr)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

hyperparameters = {"criterion": ["mse"],
                   "max_depth": [5, 10,15],
                   "max_features": ["auto","log2", "sqrt"],
                   "min_samples_leaf": [1, 5],
                   "min_samples_split": [2,5],
                   "n_estimators": [5,10,20]
}

clf = RandomForestRegressor(random_state=1)
grid = GridSearchCV(clf,param_grid=hyperparameters,cv=3)

grid.fit(data[key_cols], data['fare_amount'])



In [None]:
best_params = grid.best_params_
best_score = grid.best_score_

In [None]:
best_score

In [16]:
best_params = {
    'criterion' : 'mse',
    'max_depth': 15,
    'max_features' : 'auto',
    'min_samples_leaf': 5,
    'min_samples_split': 2,
    'n_estimators': 20
}

In [None]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(random_state = 1)
scores = cross_val_score(clf, data[key_cols], data['fare_amount'],scoring = "neg_mean_squared_error", cv= 5)
mean_error_rf = scores.mean()
print(mean_error_rf)

In [17]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(**best_params)
clf.fit(data[key_cols], data['fare_amount'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [18]:
def transform_data(data):
    #data = data[(data["pickup_latitude"] <43) & (data["pickup_latitude"] > 39)]
    #data = data[(data["dropoff_latitude"] <43) & (data["dropoff_latitude"] > 39)]
    #data = data[(data["pickup_longitude"] < -73) & (data["pickup_longitude"] > - 75)]
    #data = data[(data["dropoff_longitude"] <-73) & (data["dropoff_longitude"] > -75)]
    #data = data[(data["passenger_count"] > 0) & (data["passenger_count"] < 8)]
    central_park = [40.778940,-73.962295]
    time_square = [40.758623,-73.985043]
    one_world_trade = [40.712613,-74.014262]
    laguardia = [40.776288,-73.872115]
    newark = [40.693711,-74.179404]
    jfk = [40.644195,-73.782446]
    data["dist_to_cp_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = central_park[0], loc_lng = central_park[1])
    data["dist_to_cp_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = central_park[0], loc_lng = central_park[1])
    data["dist_to_ts_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = time_square[0], loc_lng = time_square[1])
    data["dist_to_ts_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = time_square[0], loc_lng = time_square[1])
    data["dist_to_wtc_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = one_world_trade[0], loc_lng = one_world_trade[1])
    data["dist_to_wtc_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = one_world_trade[0], loc_lng = one_world_trade[1])
    data["dist_to_lag_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = laguardia[0], loc_lng = laguardia[1]) 
    data["dist_to_lag_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = laguardia[0], loc_lng = laguardia[1])
    data["dist_to_nwk_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = newark[0], loc_lng = newark[1])
    data["dist_to_nwk_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = newark[0], loc_lng = newark[1])
    data["dist_to_jfk_pickup"] = data.apply(distance_from_pickup, axis = 1, loc_lat = jfk[0], loc_lng = jfk[1])
    data["dist_to_jfk_dropoff"] = data.apply(distance_from_dropoff, axis = 1, loc_lat = jfk[0], loc_lng = jfk[1])
    data["distance_traveled"] = data.apply(get_distance, axis = 1)
    data["heading_dt"] = data.apply(check_heading_dt, axis = 1)
    data["heading_to_airport"] = data.apply(heading_to_airport, axis= 1)

    data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"], infer_datetime_format = True)
    data["day_of_week"] = [datetime.weekday(x) for x in data["pickup_datetime"]]
    data["hour_24"] = [x.time().hour for x in data["pickup_datetime"]]
    data["year"] = [x.year for x in data["pickup_datetime"]]
    #data = data[data["distance_traveled"] < 100] 
    data = create_dummies(data, "day_of_week")
    data = create_dummies(data, "hour_24")
    data = create_dummies(data, "year")
    
    return data

In [19]:
test_data = pd.read_csv("test.csv")

In [20]:
test_data.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [21]:
test_data = transform_data(test_data)

In [22]:
predictions = clf.predict(test_data[key_cols])

In [23]:
keys = test_data['key']
submission_dict = {
    'key' : keys,
    'fare_amount': predictions.round(2)
}

submission_df = pd.DataFrame(submission_dict)

In [24]:
cols_in_order = ['key','fare_amount']
submission_df = submission_df[cols_in_order]
submission_df.head()
submission_df.to_csv("taxi_submission1.csv",index = False)

In [25]:
submission_df.shape

(9914, 2)

In [None]:
test_data

In [None]:

sns.distplot(test_data["distance_traveled"])
plt.show()