In [1]:
# Data Exploration

# Importing various libraries

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import datetime
import math

In [2]:
# Still can't read 55 million.. capping it at 10 million

df_train = pd.read_csv("train.csv", nrows=1000000)

In [3]:
# Removing null entries

df_train = df_train[df_train['dropoff_latitude'].isnull() == False]
print("Fare amount has Nan: {}".format(df_train['fare_amount'].isnull().values.any()))
print("Pickup datetime amount has Nan: {}".format(df_train['pickup_datetime'].isnull().values.any()))
print("Pickup lat amount has Nan: {}".format(df_train['pickup_latitude'].isnull().values.any()))
print("Pickup long amount has Nan: {}".format(df_train['pickup_longitude'].isnull().values.any()))
print("Dropoff lat amount has Nan: {}".format(df_train['dropoff_latitude'].isnull().values.any()))
print("Dropoff long amount has Nan: {}".format(df_train['dropoff_longitude'].isnull().values.any()))
print("Passenger count amount has Nan: {}".format(df_train['passenger_count'].isnull().values.any()))

Fare amount has Nan: False
Pickup datetime amount has Nan: False
Pickup lat amount has Nan: False
Pickup long amount has Nan: False
Dropoff lat amount has Nan: False
Dropoff long amount has Nan: False
Passenger count amount has Nan: False


In [4]:
# Removing odd passenger counts

df_train = df_train[df_train.passenger_count != 208]
df_train = df_train[df_train.passenger_count != 0]

In [5]:
# Removing negative fares

df_train = df_train[df_train.fare_amount > 0]

In [6]:
# Creating new features date and time

df_clean = df_train

df_clean['date'] = df_train['pickup_datetime'].apply(lambda x : x[:-12])
df_clean['time'] = df_train['pickup_datetime'].apply(lambda x : x[11:-4])

# removing datetime, since no longer needed
df_clean = df_clean.drop(columns=['pickup_datetime'])

df_clean['day'] = df_clean['date'].apply(lambda x : datetime.datetime.strptime(x.strip(), "%Y-%m-%d").weekday())

In [7]:
# Exploring Time feature


df_clean['hour'] = df_clean['time'].apply(lambda x : int(x.split(":")[0]))

In [8]:
# These cut off values were done by trial and error, and observing the bounding box of NYC on Google Maps

df_clean = df_clean[df_clean['pickup_longitude'] > -75]
df_clean = df_clean[df_clean['pickup_longitude'] < -72]
df_clean = df_clean[df_clean['pickup_latitude'] < 41]
df_clean = df_clean[df_clean['pickup_latitude'] > 40]


df_clean = df_clean[df_clean['dropoff_longitude'] > -75]
df_clean = df_clean[df_clean['dropoff_longitude'] < -72]
df_clean = df_clean[df_clean['dropoff_latitude'] < 41]
df_clean = df_clean[df_clean['dropoff_latitude'] > 40]


In [9]:
# Got it from https://www.kaggle.com/pavanraj159/nyc-taxi-fare-time-series-forecasting

R = 6373.0

pickup_lat  = np.radians(df_clean["pickup_latitude"])
pickup_lon  = np.radians(df_clean["pickup_longitude"])
dropoff_lat = np.radians(df_clean["dropoff_latitude"])
dropoff_lon = np.radians(df_clean["dropoff_longitude"])

dist_lon = dropoff_lon - pickup_lon
dist_lat = dropoff_lat - pickup_lat

#Formula
a = (np.sin(dist_lat/2))**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * (np.sin(dist_lon/2))**2 
c = 2 * np.arctan2( np.sqrt(a), np.sqrt(1-a) ) 
d = R * c #(where R is the radius of the Earth)

df_clean['distance'] = d

In [10]:
df_clean = df_clean[df_clean['distance'] > 1]
df_clean = df_clean[df_clean['distance'] <40]

In [11]:
features = ['day', 'hour', 'distance', 'passenger_count']
x = df_clean[features]
y = df_clean['fare_amount']

x.shape,y.shape

((813922, 4), (813922,))

In [12]:
# create training and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [13]:
# testing the model

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(hidden_layer_sizes=(5,5))

scaler = StandardScaler()
# Fit only to the training data
scaler.fit(x_train)

mlp.fit(x_train, y_train)

y_test_pred = mlp.predict(x_test)

In [14]:
# evaluating the model

from sklearn.metrics import r2_score

r2_score(y_test, y_test_pred)

0.8144731333988284

In [15]:
# Not too shabby! Lets generate a Kaggle submission and see how it performs

df_test = pd.read_csv('test.csv')

df_test = df_test[df_test['dropoff_latitude'].isnull() == False]
print("Pickup datetime amount has Nan: {}".format(df_test['pickup_datetime'].isnull().values.any()))
print("Pickup lat amount has Nan: {}".format(df_test['pickup_latitude'].isnull().values.any()))
print("Pickup long amount has Nan: {}".format(df_test['pickup_longitude'].isnull().values.any()))
print("Dropoff lat amount has Nan: {}".format(df_test['dropoff_latitude'].isnull().values.any()))
print("Dropoff long amount has Nan: {}".format(df_test['dropoff_longitude'].isnull().values.any()))
print("Passenger count amount has Nan: {}".format(df_test['passenger_count'].isnull().values.any()))

Pickup datetime amount has Nan: False
Pickup lat amount has Nan: False
Pickup long amount has Nan: False
Dropoff lat amount has Nan: False
Dropoff long amount has Nan: False
Passenger count amount has Nan: False


In [16]:
df_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [17]:
# Creating new features date and time

df_clean = df_test

df_clean['date'] = df_clean['pickup_datetime'].apply(lambda x : x[:-12])
df_clean['time'] = df_clean['pickup_datetime'].apply(lambda x : x[11:-4])



# removing datetime, since no longer needed
df_clean = df_clean.drop(columns=['pickup_datetime'])

df_clean['day'] = df_clean['date'].apply(lambda x : datetime.datetime.strptime(x.strip(), "%Y-%m-%d").weekday())

In [18]:
any(df_clean['time'].isnull())

False

In [19]:
# Exploring Time feature

df_clean['hour'] = df_clean['time'].apply(lambda x : int(x.split(":")[0]))

In [20]:
# Got it from https://www.kaggle.com/pavanraj159/nyc-taxi-fare-time-series-forecasting

R = 6373.0

pickup_lat  = np.radians(df_clean["pickup_latitude"])
pickup_lon  = np.radians(df_clean["pickup_longitude"])
dropoff_lat = np.radians(df_clean["dropoff_latitude"])
dropoff_lon = np.radians(df_clean["dropoff_longitude"])

dist_lon = dropoff_lon - pickup_lon
dist_lat = dropoff_lat - pickup_lat

#Formula
a = (np.sin(dist_lat/2))**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * (np.sin(dist_lon/2))**2 
c = 2 * np.arctan2( np.sqrt(a), np.sqrt(1-a) ) 
d = R * c #(where R is the radius of the Earth)

df_clean['distance'] = d

In [21]:
features = ['day', 'hour', 'distance', 'passenger_count']
x = df_clean[features].values

x.shape

(9914, 4)

In [22]:
pred = mlp.predict(x)

In [23]:
df_pred = pd.DataFrame(pred, columns=["fare_amount"])
df_pred.head()

Unnamed: 0,fare_amount
0,9.523135
1,9.761744
2,5.247986
3,8.139517
4,15.85371


In [24]:
len(df_pred)

9914

In [25]:
df_key = df_clean['key']

In [26]:
result = pd.concat([df_key,df_pred], axis=1, sort=False)
result.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.523135
1,2015-01-27 13:08:24.0000003,9.761744
2,2011-10-08 11:53:44.0000002,5.247986
3,2012-12-01 21:12:12.0000002,8.139517
4,2012-12-01 21:12:12.0000003,15.85371


In [27]:
result.to_csv("my_submission.csv", index=False)