In [1]:
from datetime import datetime, timedelta

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import holidays

In [2]:
# Read the training split and the public test split from the local archive/ directory.
trainset, testset = (
    pd.read_csv(csv_path)
    for csv_path in ("archive/train.csv", "archive/test_public.csv")
)

In [3]:
def polyline_to_trip_duration(polyline):
  """Turn a serialized POLYLINE string into a trip-length value.

  The number of "[" characters minus two is clamped at zero and scaled by 15.
  NOTE(review): presumably one "[" opens the outer list and each point adds
  one more, with 15 seconds between consecutive points - confirm against the
  dataset description.
  """
  segment_count = polyline.count("[") - 2
  if segment_count < 0:
    # Strings with fewer than two "[" characters map to a zero-length trip.
    segment_count = 0
  return segment_count * 15

# Derive the LEN column (trip length computed from each POLYLINE string) for the training split.
polyline_strings = trainset["POLYLINE"]
trainset["LEN"] = polyline_strings.apply(polyline_to_trip_duration)

In [4]:
from datetime import datetime
# Holiday calendar built by the `holidays` package for country code PT;
# parse_time checks dates against it with the `in` operator.
Portugal_holidays = holidays.PT()

def parse_time(x, holiday_calendar=None):
  """Expand a row's Unix timestamp into calendar and holiday features.

  Args:
    x: Mapping-like row exposing a "TIMESTAMP" entry that
      ``datetime.fromtimestamp`` can interpret.
    holiday_calendar: Optional container of dates supporting ``in``.
      Defaults to the module-level ``Portugal_holidays``.

  Returns:
    Tuple ``(year, month, day, hour, minute, weekday, is_holiday,
    day_before_holiday)``. ``weekday`` follows ``datetime.weekday()``
    (Monday is 0); the last two entries are 0/1 flags.
  """
  if holiday_calendar is None:
    holiday_calendar = Portugal_holidays

  # NOTE(review): fromtimestamp() converts to the machine's local timezone, so
  # the extracted features depend on where the notebook runs - confirm that
  # this is intended.
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  today = dt.date()
  # Advance with timedelta so that month and year boundaries roll over.
  # Building datetime(year, month, day + 1) raises ValueError on the last day
  # of a month, which previously left the flag at 0 for dates such as
  # 31 December even though the following day is a holiday.
  tomorrow = today + timedelta(days=1)

  is_holiday = today in holiday_calendar
  day_before_holiday = tomorrow in holiday_calendar
  return (
    dt.year,
    dt.month,
    dt.day,
    dt.hour,
    dt.minute,
    dt.weekday(),
    1 if is_holiday else 0,
    1 if day_before_holiday else 0,
  )

# Attach the eight values returned by parse_time as new columns on both splits.
time_feature_cols = ["YR", "MON", "DAY", "HR", "MIN", "WK", "HOLI", "BHOLI"]
for frame in (trainset, testset):
  expanded = frame[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
  frame[time_feature_cols] = expanded

In [5]:
# Remove training rows whose MISSING_DATA flag is True.
missing_mask = trainset["MISSING_DATA"] == True
trainset.drop(index=trainset.loc[missing_mask].index, inplace=True)

# Additional row filters, currently disabled:
# trainset.drop(trainset[trainset["YR"] == 2013].index, inplace=True)
# trainset.drop(trainset[trainset["LEN"] == 0].index, inplace=True)

# Columns removed from both splits.
bad_columns = ["DAY_TYPE", "MISSING_DATA", "YR"]
for frame in (trainset, testset):
    frame.drop(columns=bad_columns, inplace=True)

In [6]:
# Preview the first training rows after the row filter and column drops.
trainset.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE,LEN,MON,DAY,HR,MIN,WK,HOLI,BHOLI
0,1372636858620000589,C,,,20000589,1372636858,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,6,30,17,0,6,0,0
1,1372637303620000596,B,,7.0,20000596,1372637303,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,6,30,17,8,6,0,0
2,1372636951620000320,C,,,20000320,1372636951,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,6,30,17,2,6,0,0
3,1372636854620000520,C,,,20000520,1372636854,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,6,30,17,0,6,0,0
4,1372637091620000337,C,,,20000337,1372637091,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,6,30,17,4,6,0,0


In [7]:
# Preview the first test rows after the column drops.
testset.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,MON,DAY,HR,MIN,WK,HOLI,BHOLI
0,T1,B,,15.0,20000542,1408039037,8,14,10,57,3,0,1
1,T2,B,,57.0,20000108,1408038611,8,14,10,50,3,0,1
2,T3,B,,15.0,20000370,1408038568,8,14,10,49,3,0,1
3,T4,B,,53.0,20000492,1408039090,8,14,10,58,3,0,1
4,T5,B,,18.0,20000621,1408039177,8,14,10,59,3,0,1


In [8]:
onehot_cols = ["CALL_TYPE", "MON", "DAY", "HR", "WK"]

# Full set of levels for every one-hot column. Declaring them up front makes
# get_dummies emit one indicator per level even when a split never contains
# that value, so no padding rows have to be appended and trimmed again, and
# train and test are guaranteed to share the same indicator columns.
onehot_levels = {
    # Every call type observed in either split.
    "CALL_TYPE": sorted(
        set(trainset["CALL_TYPE"].dropna()) | set(testset["CALL_TYPE"].dropna())
    ),
    "MON": list(range(1, 13)),  # months 1..12
    "DAY": list(range(1, 32)),  # days of month 1..31
    "HR": list(range(24)),      # hours 0..23
    "WK": list(range(7)),       # datetime.weekday() values 0..6
}
onehot_dtypes = {
    col: pd.CategoricalDtype(categories=onehot_levels[col]) for col in onehot_cols
}

trainset = pd.get_dummies(trainset.astype(onehot_dtypes), columns=onehot_cols)
testset = pd.get_dummies(testset.astype(onehot_dtypes), columns=onehot_cols)

# Sanity check: every expected indicator column exists in both splits.
expected_dummy_cols = [
    "{}_{}".format(col, level) for col in onehot_cols for level in onehot_levels[col]
]
assert all(
    name in trainset.columns and name in testset.columns
    for name in expected_dummy_cols
), "one-hot columns differ between train and test"

In [9]:
# Re-base TIMESTAMP in both splits on the earliest training timestamp.
start_timestamp = trainset["TIMESTAMP"].min()
for frame in (trainset, testset):
    frame["TIMESTAMP"] = frame["TIMESTAMP"] - start_timestamp

In [10]:
# Re-base TAXI_ID in both splits on the smallest training taxi id.
start_taxi_id = trainset["TAXI_ID"].min()
for frame in (trainset, testset):
    frame["TAXI_ID"] = frame["TAXI_ID"] - start_taxi_id

In [11]:
# Replace every remaining missing value with 0 in both splits, in place.
for frame in (trainset, testset):
    frame.fillna(0, inplace=True)

In [12]:
# Preview the training frame after one-hot encoding, re-basing and NaN filling.
trainset.head()

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,POLYLINE,LEN,MIN,HOLI,BHOLI,...,HR_21,HR_22,HR_23,WK_0,WK_1,WK_2,WK_3,WK_4,WK_5,WK_6
0,1372636858620000589,0.0,0.0,588,5,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1372637303620000596,0.0,7.0,595,450,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270,8,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1372636951620000320,0.0,0.0,319,98,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960,2,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1372636854620000520,0.0,0.0,519,1,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1372637091620000337,0.0,0.0,336,238,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420,4,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Preview the test frame after one-hot encoding, re-basing and NaN filling.
testset.head()

Unnamed: 0,TRIP_ID,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,MIN,HOLI,BHOLI,CALL_TYPE_A,CALL_TYPE_B,...,HR_21,HR_22,HR_23,WK_0,WK_1,WK_2,WK_3,WK_4,WK_5,WK_6
0,T1,0.0,15.0,541,35402184,57,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
1,T2,0.0,57.0,107,35401758,50,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
2,T3,0.0,15.0,369,35401715,49,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
3,T4,0.0,53.0,491,35402237,58,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
4,T5,0.0,18.0,620,35402324,59,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Write both processed splits to processed_taxi/ (the index is written as well).
for frame, out_path in (
    (trainset, "processed_taxi/train.csv"),
    (testset, "processed_taxi/test.csv"),
):
    frame.to_csv(out_path)

In [15]:
# Print the smallest and largest re-based TAXI_ID, training split first, then test.
for frame in (trainset, testset):
    taxi_ids = frame["TAXI_ID"]
    print(taxi_ids.min(), taxi_ids.max())

0 980
3 903
