### Preliminary analysis script: test out features, combine the different datasets into one master dataset, and then establish a baseline score with logistic regression and SVR

In [9]:
import math
import Geohash
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import datetime
%matplotlib inline

Reading in the train and test datasets

In [40]:
# Load the training set from disk and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was written with its index; consider index_col=0 if that column is not needed.
dfTrain = pd.read_csv(filepath_or_buffer='trainSet.csv')
dfTrain.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [41]:
# Load the test set from disk and preview its first five rows.
# NOTE(review): as with the training file, the 'Unnamed: 0' column in the preview
# suggests a saved index; consider index_col=0 if that column is not needed.
dfTest = pd.read_csv(filepath_or_buffer='testSet.csv')
dfTest.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


Extracting basic features from the dataset

In [43]:
# Calculating distance from lat/lon using the haversine formula.
# 6378137 m is the WGS-84 equatorial radius. The mean Earth radius (~6371009 m)
# is the more common choice for haversine, but this value is kept so previously
# computed distances stay unchanged.
EARTH_RADIUS_METERS = 6378137
def distMoved(currentRowLat, currentRowLon, lagRowLat, lagRowLon):
    """Return the great-circle distance in metres between two lat/lon points.

    Parameters
    ----------
    currentRowLat, currentRowLon : number or numeric string, degrees
        Latitude and longitude of the first point.
    lagRowLat, lagRowLon : number or numeric string, degrees
        Latitude and longitude of the second point.

    Returns
    -------
    float or int
        Haversine distance scaled by EARTH_RADIUS_METERS, or 0 when any
        coordinate is missing (None or NaN).

    Raises
    ------
    ValueError / TypeError
        If a coordinate cannot be converted with float().
    """
    coordinates = (currentRowLat, currentRowLon, lagRowLat, lagRowLon)
    if any(value is None for value in coordinates):
        return 0

    currentRowLat, currentRowLon, lagRowLat, lagRowLon = (
        float(value) for value in coordinates
    )

    # pandas represents missing numeric values as NaN rather than None, so the
    # None guard alone would let NaN flow through and yield a NaN distance.
    if any(math.isnan(value) for value in
           (currentRowLat, currentRowLon, lagRowLat, lagRowLon)):
        return 0

    currentRowLat = math.radians(currentRowLat)
    currentRowLon = math.radians(currentRowLon)
    lagRowLat = math.radians(lagRowLat)
    lagRowLon = math.radians(lagRowLon)

    # haversine formula
    dlon = currentRowLon - lagRowLon
    dlat = currentRowLat - lagRowLat

    a = (math.sin(dlat/2)**2 + math.cos(currentRowLat) *
        math.cos(lagRowLat) * math.sin(dlon/2)**2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # near-antipodal points, which would make math.asin raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))
    return (EARTH_RADIUS_METERS * c)

In [44]:
# Haversine distance between each trip's dropoff point and its pickup point,
# computed row by row with distMoved.
dfTrain['distance(meters)'] = dfTrain.apply(
    lambda trip: distMoved(
        trip['dropoff_latitude'],
        trip['dropoff_longitude'],
        trip['pickup_latitude'],
        trip['pickup_longitude'],
    ),
    axis=1,
)

In [45]:
# Preview the first five rows to confirm the new distance column was added.
dfTrain.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance(meters)
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1500.199471
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1807.529756
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6392.251289
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1487.162526
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1189.919955


In [46]:
# Extracting the month and day from the date time. Also extracting time of day (such as morning, afternoon, 
# evening and night) from the hours

def timeOfDay(hourOfDay):
    """Bucket an hour of the day into a coarse part-of-day label.

    7-11 -> 'MORNING', 12-16 -> 'AFTERNOON', 17-21 -> 'EVENING';
    every other value falls through to 'NIGHT'.
    """
    if 7 <= hourOfDay < 12:
        return 'MORNING'
    if 12 <= hourOfDay < 17:
        return 'AFTERNOON'
    if 17 <= hourOfDay < 22:
        return 'EVENING'
    return 'NIGHT'

In [59]:
def dayOfWeek(day):
    """Translate a weekday number into its upper-case English name.

    0 maps to 'MONDAY' through 5 to 'SATURDAY'; any other value
    (including 6) yields 'SUNDAY'.
    """
    namedDays = ('MONDAY', 'TUESDAY', 'WEDNESDAY',
                 'THURSDAY', 'FRIDAY', 'SATURDAY')
    for position, label in enumerate(namedDays):
        if day == position:
            return label
    return 'SUNDAY'

In [62]:
# Derive calendar and time-of-day features from the pickup timestamp.
# The raw string looks like 'YYYY-MM-DD HH:MM:SS': characters 0-9 are the date
# and character 10 is the separating space, so the time part starts at index 11.
# Slicing from 10 would leave a leading blank in every 'time' value.
dfTrain['date'] = dfTrain.pickup_datetime.str[:10]
dfTrain['time'] = dfTrain.pickup_datetime.str[11:]
dfTrain['timestamp'] = pd.to_datetime(dfTrain['pickup_datetime'])
dfTrain['year'] = dfTrain.timestamp.dt.year
dfTrain['month'] = dfTrain.timestamp.dt.month
# dt.weekday numbers days with Monday=0 ... Sunday=6, which is what dayOfWeek expects.
dfTrain['dayNum'] = dfTrain.timestamp.dt.weekday
dfTrain['day'] = dfTrain.dayNum.apply(dayOfWeek)
dfTrain['hour'] = dfTrain.timestamp.dt.hour
dfTrain['timeOfDay'] = dfTrain.hour.apply(timeOfDay)
dfTrain.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,distance(meters),date,time,timestamp,year,month,day,dayNum,hour,timeOfDay
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,1500.199471,2016-03-14,17:24:55,2016-03-14 17:24:55,2016,3,MONDAY,0,17,EVENING
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,...,1807.529756,2016-06-12,00:43:35,2016-06-12 00:43:35,2016,6,SUNDAY,6,0,NIGHT
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,6392.251289,2016-01-19,11:35:24,2016-01-19 11:35:24,2016,1,TUESDAY,1,11,MORNING
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,...,1487.162526,2016-04-06,19:32:31,2016-04-06 19:32:31,2016,4,WEDNESDAY,2,19,EVENING
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,1189.919955,2016-03-26,13:30:55,2016-03-26 13:30:55,2016,3,SATURDAY,5,13,AFTERNOON
