In [1]:
# Data Exploration

# Importing various libraries

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import datetime
import math
import xgboost as xgb

In [2]:
# Reading all 55 million rows still fails, so the load is capped at the
# first 10 million rows of the file.
df_train = pd.read_csv("train.csv", nrows=10 * 10**6)

Data Cleaning
---
---


In [3]:
def CleanData(df_train):
    """Drop rows that cannot be valid rides and return the survivors.

    A row is kept only when all of the following hold:

    * ``dropoff_latitude`` is not null,
    * ``1 <= passenger_count < 8``,
    * ``fare_amount > 0`` (zero fares are removed as well as negative ones),
    * pickup and dropoff both satisfy ``-75 < longitude < -72`` and
      ``40 < latitude < 41``. These cut-offs were chosen by trial and error
      while looking at the bounding box of NYC on Google Maps.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``fare_amount``, ``passenger_count`` and the four
        ``pickup_*`` / ``dropoff_*`` coordinate columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding the kept rows with their original
        index labels. The input frame is not modified.
    """
    passengers = df_train['passenger_count']

    keep = (
        df_train['dropoff_latitude'].notnull()
        & (passengers >= 1)
        & (passengers < 8)
        & (df_train['fare_amount'] > 0)
    )

    # The same bounding box applies to both ends of the trip.
    for end in ('pickup', 'dropoff'):
        lon = df_train[end + '_longitude']
        lat = df_train[end + '_latitude']
        keep = keep & (lon > -75) & (lon < -72) & (lat > 40) & (lat < 41)

    # Explicit copy: callers add columns to the result, and assigning into a
    # bare filtered slice triggers pandas' SettingWithCopyWarning.
    return df_train[keep].copy()
    
# Replace the raw frame with its cleaned version.
df_train = CleanData(df_train=df_train)

Feature Creation
---
---

In [4]:
def HourGroup(hour):
    """Bucket an hour of the day into one of three groups.

    Returns 1 for hours 0 through 7, 2 for hours 18 through 22 and 3 for
    every other value.
    """
    # 0:00 - 7:00
    if 0 <= hour <= 7:
        return 1

    # 18:00 - 22:00
    if 18 <= hour <= 22:
        return 2

    # other hours
    return 3

In [5]:
def FeatureCreation(df_train):
    """Derive calendar/time features from the ``pickup_datetime`` strings.

    Adds these columns to ``df_train`` in place and returns the same frame:

    * ``date``  - ``pickup_datetime`` with its last 12 characters removed
    * ``time``  - characters 11 up to (not including) the last 4
    * ``month`` - integer between the first and second ``-`` of ``date``
    * ``hour``  - integer before the first ``:`` of ``time``
    * ``hour_group`` - ``HourGroup(hour)``
    * ``day``   - weekday of ``date`` (Monday is 0)

    NOTE(review): the fixed slice offsets presumably target strings shaped
    like ``"YYYY-MM-DD HH:MM:SS UTC"``; confirm against the input files.

    The work is done with vectorised string and datetime operations rather
    than one Python call per row, which matters on frames of millions of rows.
    """
    pickup = df_train['pickup_datetime']

    df_train['date'] = pickup.str[:-12]
    df_train['time'] = pickup.str[11:-4]

    # Explicit int64 keeps the dtype identical on every platform.
    df_train['month'] = df_train['date'].str.split("-").str[1].astype('int64')
    df_train['hour'] = df_train['time'].str.split(":").str[0].astype('int64')

    df_train['hour_group'] = df_train['hour'].map(HourGroup)

    # Parse the whole column once; the slice may carry surrounding whitespace,
    # so strip it before applying the strict format.
    parsed_dates = pd.to_datetime(df_train['date'].str.strip(), format="%Y-%m-%d")
    df_train['day'] = parsed_dates.dt.weekday.astype('int64')

    return df_train

# Attach the date/time derived columns to the training frame.
df_train = FeatureCreation(df_train=df_train)

In [28]:
def DistanceCal(df_train, test=1):
    """Add a haversine ``distance`` column between pickup and dropoff.

    The column is written onto ``df_train`` itself. When ``test`` is truthy
    (the default) only rows with ``1 < distance < 40`` are returned; when it
    is falsy the frame is returned unfiltered.

    The distance is ``6373.0`` times the central angle in radians, so it is
    presumably in kilometres (6373 is close to Earth's radius in km).
    """
    # Formula taken from
    # https://www.kaggle.com/pavanraj159/nyc-taxi-fare-time-series-forecasting
    # Calculating distance between coordinates
    earth_radius = 6373.0

    lat_from = np.radians(df_train["pickup_latitude"])
    lon_from = np.radians(df_train["pickup_longitude"])
    lat_to = np.radians(df_train["dropoff_latitude"])
    lon_to = np.radians(df_train["dropoff_longitude"])

    half_dlat = (lat_to - lat_from) / 2
    half_dlon = (lon_to - lon_from) / 2

    # Haversine: a is the squared half-chord, the arctan2 form gives the angle.
    a = (np.sin(half_dlat))**2 + np.cos(lat_from) * np.cos(lat_to) * (np.sin(half_dlon))**2
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    df_train['distance'] = earth_radius * central_angle

    if not test:
        return df_train

    # Keep only trips strictly between 1 and 40 distance units.
    in_range = (df_train['distance'] > 1) & (df_train['distance'] < 40)
    return df_train[in_range]

# Add the distance column; the default test=1 also drops out-of-range trips.
df_train = DistanceCal(df_train=df_train)

Training the model
---
---

In [37]:
# Model inputs are the engineered columns; the target is the fare.
features = ['day', 'hour', 'distance', 'passenger_count', 'month', 'hour_group']
x, y = df_train[features], df_train['fare_amount']

(x.shape, y.shape)

((8133662, 5), (8133662,))

In [38]:
# create training and test sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows. The seed is fixed so the split, and every score
# computed from it, is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [39]:
# testing the model

# Booster settings handed to xgb.train.
params = dict(
    # Parameters that were tuned (values are the result of tuning with CV).
    max_depth=8,
    eta=.03,
    subsample=1,
    colsample_bytree=0.8,
    # Other parameters
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

def XGBmodel(x_train, x_test, y_train, y_test, params):
    """Train an xgboost booster and return it.

    Both splits are wrapped in ``xgb.DMatrix``. Training runs for at most
    5000 boosting rounds and is evaluated on the ``x_test`` / ``y_test``
    matrix (reported under the name ``'test'``), with
    ``early_stopping_rounds=10`` applied to that evaluation set.
    """
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)
    watchlist = [(dtest, 'test')]

    return xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        early_stopping_rounds=10,
        evals=watchlist,
    )

# Fit the booster on the training split, monitoring the held-out split.
model = XGBmodel(
    x_train=X_train,
    x_test=X_test,
    y_train=y_train,
    y_test=y_test,
    params=params,
)

[0]	test-rmse:14.8981
Will train until test-rmse hasn't improved in 10 rounds.
[1]	test-rmse:14.4874
[2]	test-rmse:14.089
[3]	test-rmse:13.7035
[4]	test-rmse:13.3308
[5]	test-rmse:12.9702
[6]	test-rmse:12.6224
[7]	test-rmse:12.4043
[8]	test-rmse:12.0753
[9]	test-rmse:11.7574
[10]	test-rmse:11.4503
[11]	test-rmse:11.1525
[12]	test-rmse:10.8649
[13]	test-rmse:10.5871
[14]	test-rmse:10.319
[15]	test-rmse:10.0601
[16]	test-rmse:9.8103
[17]	test-rmse:9.56928
[18]	test-rmse:9.42206
[19]	test-rmse:9.19491
[20]	test-rmse:8.97663
[21]	test-rmse:8.76633
[22]	test-rmse:8.56297
[23]	test-rmse:8.44167
[24]	test-rmse:8.25024
[25]	test-rmse:8.1388
[26]	test-rmse:7.95884
[27]	test-rmse:7.78632
[28]	test-rmse:7.68799
[29]	test-rmse:7.5943
[30]	test-rmse:7.50509
[31]	test-rmse:7.34939
[32]	test-rmse:7.19987
[33]	test-rmse:7.05631
[34]	test-rmse:6.9191
[35]	test-rmse:6.78681
[36]	test-rmse:6.65987
[37]	test-rmse:6.53824
[38]	test-rmse:6.42175
[39]	test-rmse:6.36077
[40]	test-rmse:6.25227
[41]	test-rmse:6

Working on test set
---
---

In [40]:
# Load the rows that need a fare prediction.
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [41]:
# Same feature pipeline as for training, but test=0 so no rows are dropped
# by the distance filter.
df_test = FeatureCreation(df_train=df_test)
df_test = DistanceCal(df_train=df_test, test=0)

In [42]:
# Rebuild the model inputs from the raw values, labelled with the feature names.
features = ['day', 'hour', 'distance', 'passenger_count', 'month', 'hour_group']
X = df_test[features].values

X_pred = pd.DataFrame(X, columns=features)

In [43]:
# Predict from test set, capping the trees used at model.best_ntree_limit
test_matrix = xgb.DMatrix(X_pred)
prediction = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

Writing Submission
---
---

In [44]:
# Pair each test key with its predicted fare; the two pieces are joined
# side by side on their index.
df_key = df_test['key']
df_pred = pd.DataFrame(prediction, columns=["fare_amount"])

submission_parts = [df_key, df_pred]
result = pd.concat(submission_parts, axis=1, sort=False)

In [45]:
# Write the submission file without the index column.
result.to_csv(path_or_buf="my_submission.csv", index=False)

In [46]:
# Preview the first rows of the submission frame.
result.head(n=5)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,9.108109
1,2015-01-27 13:08:24.0000003,9.343334
2,2011-10-08 11:53:44.0000002,5.539584
3,2012-12-01 21:12:12.0000002,8.466614
4,2012-12-01 21:12:12.0000003,15.965824


In [47]:
# Preview the first keys taken from the test file.
df_key.head(n=5)


0    2015-01-27 13:08:24.0000002
1    2015-01-27 13:08:24.0000003
2    2011-10-08 11:53:44.0000002
3    2012-12-01 21:12:12.0000002
4    2012-12-01 21:12:12.0000003
Name: key, dtype: object

In [48]:
# Preview the first predicted fares.
df_pred.head(n=5)

Unnamed: 0,fare_amount
0,9.108109
1,9.343334
2,5.539584
3,8.466614
4,15.965824
