In [83]:
# Set up imports, the data directory and the loader for the taxi dataset

# Common imports
import os

import numpy as np
import pandas as pd

# Directory (relative to the notebook's working directory) holding the taxi CSV files.
TAXI_PATH = os.path.join("data", "taxi")

def load_taxi_data(taxi_path=TAXI_PATH, filename="train.csv"):
    """Read one CSV file from the taxi data directory into a DataFrame.

    Parameters
    ----------
    taxi_path : str, optional
        Directory that contains the CSV files. Defaults to ``TAXI_PATH``.
    filename : str, optional
        Name of the file inside ``taxi_path``. Defaults to ``"train.csv"``,
        so existing ``load_taxi_data()`` calls behave exactly as before;
        pass e.g. ``filename="test.csv"`` to load another file with the
        same helper instead of repeating the path logic.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_csv`` for the selected file.
    """
    csv_path = os.path.join(taxi_path, filename)
    return pd.read_csv(csv_path)

import warnings
warnings.filterwarnings(action="ignore", message="^RuntimeWarning")

In [84]:
# Load both splits from TAXI_PATH: test.csv is read directly, while the
# training frame comes from load_taxi_data() with its default arguments.
csv_path = os.path.join(TAXI_PATH, "test.csv")
test = pd.read_csv(csv_path)
train = load_taxi_data()

In [85]:
import mpu

def calculateDistances(data, distances):
    """Append the haversine distance of every trip in ``data`` to ``distances``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude``.
    distances : list
        Extended in place with one value per row, in row order.

    Returns
    -------
    list
        The same ``distances`` object that was passed in, so the call can
        also be used as an expression. Existing callers that ignore the
        return value are unaffected.

    Raises
    ------
    KeyError
        If one of the four coordinate columns is missing.
    """
    # Zipping the four columns avoids DataFrame.iterrows(), which builds a
    # full Series object for every row and is very slow on large frames.
    coordinates = zip(
        data['pickup_latitude'],
        data['pickup_longitude'],
        data['dropoff_latitude'],
        data['dropoff_longitude'],
    )
    for lat1, lon1, lat2, lon2 in coordinates:
        origin = (float(lat1), float(lon1))
        destination = (float(lat2), float(lon2))
        distances.append(mpu.haversine_distance(origin, destination))
    return distances
        

# Training split: gather one haversine distance per row, convert the list to
# an ndarray and store it as a new column.
distances = []
calculateDistances(train, distances)
distances = np.array(distances)
train['haversine_distance'] = distances

# Test split: same computation; the plain list is assigned directly.
distances_test = []
calculateDistances(test, distances_test)
test['haversine_distance'] = distances_test

In [86]:
# Preview the first rows of train, which now carries the haversine_distance column.
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,haversine_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.498521
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.805507
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.385098
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.485498
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.188588


In [87]:
from math import *
def manhattan_distance(lat1,lon1,lat2,lon2):
    """Return the sum of two axis-aligned haversine legs between two points.

    The first leg keeps the latitude fixed at ``lat1`` and moves from
    ``lon1`` to ``lon2``; the second keeps the longitude fixed at ``lon1``
    and moves from ``lat1`` to ``lat2``. Both legs start at the first point
    and are measured with ``mpu.haversine_distance``, so the result is in
    whatever unit that function returns (presumably kilometres - confirm
    against the mpu documentation).
    """
    start = (lat1, lon1)
    along_longitude = mpu.haversine_distance(start, (lat1, lon2))
    along_latitude = mpu.haversine_distance(start, (lat2, lon1))
    return along_longitude + along_latitude

def euclidean_distance(x,y):
    """Return the straight-line distance between two coordinate sequences.

    ``x`` and ``y`` are paired element by element (extra trailing elements
    of the longer one are ignored by ``zip``). No unit conversion is done,
    so the result is expressed in the same unit as the inputs.
    """
    squared_gaps = [pow(first - second, 2) for first, second in zip(x, y)]
    total = sum(squared_gaps)
    return sqrt(total)

# Add the same four derived columns to each split. The frames are modified in
# place; `frame` is just an alias for train and then test.
for frame in (train, test):
    # Straight-line distance on the raw (latitude, longitude) pairs.
    frame['euclidean_distance'] = frame.apply(
        lambda trip: euclidean_distance(
            (trip['pickup_latitude'], trip['pickup_longitude']),
            (trip['dropoff_latitude'], trip['dropoff_longitude'])
        ),
        axis=1
    )
    frame['log_euclidean_distance'] = np.log1p(frame['euclidean_distance'])

    # Sum of the two axis-aligned haversine legs.
    frame['manhattan_distance'] = frame.apply(
        lambda trip: manhattan_distance(
            trip['pickup_latitude'],
            trip['pickup_longitude'],
            trip['dropoff_latitude'],
            trip['dropoff_longitude']
        ),
        axis=1
    )
    frame['log_manhattan_distance'] = np.log1p(frame['manhattan_distance'])

In [88]:
# Keep only the trip id and the distance features of the training split,
# in the column order used for the exported file.
train_distance_columns = [
    'haversine_distance',
    'id',
    'euclidean_distance',
    'log_euclidean_distance',
    'manhattan_distance',
    'log_manhattan_distance',
]
dataframe = train[train_distance_columns].copy()
dataframe.head()

Unnamed: 0,haversine_distance,id,euclidean_distance,log_euclidean_distance,manhattan_distance,log_manhattan_distance
0,1.498521,id2875421,0.01768,0.017525,1.735433,1.00629
1,1.805507,id2377394,0.020456,0.020249,2.430506,1.232708
2,6.385098,id3858529,0.059934,0.058206,8.203575,2.219592
3,1.485498,id3504673,0.013438,0.013349,1.661331,0.978826
4,1.188588,id2181028,0.01069,0.010633,1.199457,0.78821


In [89]:
# Keep only the trip id and the distance features of the test split,
# in the column order used for the exported file.
test_distance_columns = [
    'haversine_distance',
    'id',
    'euclidean_distance',
    'log_euclidean_distance',
    'manhattan_distance',
    'log_manhattan_distance',
]
df = test[test_distance_columns].copy()
df.head()

Unnamed: 0,haversine_distance,id,euclidean_distance,log_euclidean_distance,manhattan_distance,log_manhattan_distance
0,2.746426,id3004672,0.024735,0.024434,2.913304,1.364382
1,2.759239,id3505355,0.024979,0.024672,3.104805,1.412158
2,1.306155,id1217141,0.013861,0.013766,1.84634,1.046034
3,5.269088,id2150126,0.051363,0.050087,7.163347,2.099654
4,0.960842,id1598245,0.010343,0.010289,1.354164,0.856186


In [90]:
# Write the training-split distance features to train_distance.csv under TAXI_PATH.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
csv_path = os.path.join(TAXI_PATH, "train_distance.csv")
dataframe.to_csv(csv_path)
# Show the first rows of the frame that was just saved.
dataframe.head()

Unnamed: 0,haversine_distance,id,euclidean_distance,log_euclidean_distance,manhattan_distance,log_manhattan_distance
0,1.498521,id2875421,0.01768,0.017525,1.735433,1.00629
1,1.805507,id2377394,0.020456,0.020249,2.430506,1.232708
2,6.385098,id3858529,0.059934,0.058206,8.203575,2.219592
3,1.485498,id3504673,0.013438,0.013349,1.661331,0.978826
4,1.188588,id2181028,0.01069,0.010633,1.199457,0.78821


In [91]:
# Write the test-split distance features to test_distance.csv under TAXI_PATH.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
csv_path = os.path.join(TAXI_PATH, "test_distance.csv")
df.to_csv(csv_path)
# Show the first rows of the frame that was just saved.
df.head()

Unnamed: 0,haversine_distance,id,euclidean_distance,log_euclidean_distance,manhattan_distance,log_manhattan_distance
0,2.746426,id3004672,0.024735,0.024434,2.913304,1.364382
1,2.759239,id3505355,0.024979,0.024672,3.104805,1.412158
2,1.306155,id1217141,0.013861,0.013766,1.84634,1.046034
3,5.269088,id2150126,0.051363,0.050087,7.163347,2.099654
4,0.960842,id1598245,0.010343,0.010289,1.354164,0.856186


In [92]:
# Round-trip check: reload the exported training features and count missing values.
csv_path = os.path.join(TAXI_PATH, "train_distance.csv")
# index_col=0 turns the first CSV column (the saved index) back into the index.
tester = pd.read_csv(csv_path, index_col=0)
# Total number of missing cells over all columns; a cell displays only its
# last expression, so this count is the single output of the check.
tester.isnull().sum().sum()

  mask |= (ar1 == a)


0

In [93]:
# Round-trip check: reload the exported test features and count missing values.
csv_path = os.path.join(TAXI_PATH, "test_distance.csv")
# index_col=0 turns the first CSV column (the saved index) back into the index.
tester1 = pd.read_csv(csv_path, index_col=0)
# Total number of missing cells over all columns; a cell displays only its
# last expression, so this count is the single output of the check.
tester1.isnull().sum().sum()

0

In [94]:
#df.to_csv(file_name, sep='\t', encoding='utf-8')