[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/13Fiai89Jo5lsOLRRYlkKMVongEHt6W-A?usp=sharing)

# Preprocess data

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive
from matplotlib.dates import DateFormatter

# Mount Google Drive; the output files below are written under this mount point.
drive.mount('/content/gdrive')

# Month as a zero-padded string; it is spliced into the names of the files
# that are downloaded and saved.
input_month = "06"
# Common path prefix of every file this notebook writes.
file_prefix = "/content/gdrive/Shareddrives/Education/5709_Capstone/taxi_"

Mounted at /content/drive


## Define the cleaning functions

In [None]:
def request_write(input_str, input_month):
    """Download one monthly 2019 trip-data CSV into the current working directory.

    Args:
        input_str: Dataset prefix used in the remote file name, e.g. "green",
            "yellow" or "fhvhv".
        input_month: Month as a zero-padded string, e.g. "06".

    Returns:
        None. The file is saved as
        ``<input_str>_tripdata_2019-<input_month>.csv``, i.e. the last path
        component of the URL, which is the name ``wget`` used as well.

    Raises:
        urllib.error.URLError: If the download fails (``HTTPError`` for a
            non-success HTTP status), so a missing file stops the cell instead
            of being discovered later by ``pd.read_csv``.
    """
    # Function-scope import keeps this definition self-contained.
    import urllib.request

    file_name = input_str + "_tripdata_2019-" + input_month + ".csv"
    url = "https://nyc-tlc.s3.amazonaws.com/trip+data/" + file_name
    # Pure-Python download: no variable is interpolated into a shell command,
    # and an existing file is overwritten on re-run rather than a second copy
    # being saved under a ".1" suffix while readers keep using the old one.
    urllib.request.urlretrieve(url, file_name)

# Columns kept in the cleaned output, in the order they are written to disk.
# Calendar features (day name, hour, minute) are not materialised here; they
# can be derived from PU_datetime when needed.
final_col_set = ["PULocationID", "DOLocationID", "PU_datetime", "DO_datetime", "taxi"]


def time_series_process(input_pd, month, year=2019):
    """Clean a trip frame in place so it only holds trips of one calendar month.

    For the pickup column and then the drop-off column, the values are parsed
    as datetimes, floored to the 10-minute slot, and every row whose floored
    timestamp is not in ``year``/``month`` is dropped (the raw files can
    contain rows from other months). Rows whose timestamp is missing are
    dropped too, because their year/month never compare equal to the target.
    Finally every column not listed in ``final_col_set`` is removed.

    Args:
        input_pd: DataFrame with ``PU_datetime`` and ``DO_datetime`` columns.
            It is modified in place; callers keep using the same object.
        month: Target month, as an int or a string such as "06".
        year: Target year; defaults to 2019, the year this notebook processes.

    Returns:
        None.
    """
    wanted_year, wanted_month = int(year), int(month)
    for column in ("PU_datetime", "DO_datetime"):
        input_pd[column] = pd.to_datetime(input_pd[column]).dt.floor("10min")
        stamps = input_pd[column].dt
        outside_period = (stamps.year != wanted_year) | (stamps.month != wanted_month)
        # Select the labels straight from the index instead of building a
        # filtered copy of the whole frame just to read its index.
        input_pd.drop(input_pd.index[outside_period.to_numpy()], inplace=True)
    extra_columns = [col for col in input_pd.columns if col not in final_col_set]
    input_pd.drop(columns=extra_columns, inplace=True)

## Preprocess and save each dataset

Download

In [None]:
# Fetch the raw monthly CSV of each dataset that the cells below process.
for dataset in ("green", "yellow", "fhvhv"):
    request_write(dataset, input_month)

Green

In [None]:
%%time
# Load the green-taxi file and rename its timestamp columns to the shared names.
green_source = "/content/green_tripdata_2019-" + input_month + ".csv"
green_renames = {'lpep_pickup_datetime': 'PU_datetime', "lpep_dropoff_datetime": "DO_datetime"}
taxi_green = pd.read_csv(green_source).rename(columns=green_renames)
taxi_green["taxi"] = "green"
time_series_process(taxi_green, input_month)  # cleans taxi_green in place
taxi_green = taxi_green[final_col_set]  # fix the column order
# Written with the default mode and a header, so this starts the combined
# file that the later cells append to.
taxi_green.to_csv(file_prefix + input_month + "_all.csv", index=False)
# Release the frame before the next dataset is loaded.
del taxi_green

CPU times: user 6.91 s, sys: 546 ms, total: 7.46 s
Wall time: 31.7 s


Yellow

In [None]:
%%time
# Load the yellow-taxi file and rename its timestamp columns to the shared names.
yellow_source = "/content/yellow_tripdata_2019-" + input_month + ".csv"
yellow_renames = {'tpep_pickup_datetime': 'PU_datetime', "tpep_dropoff_datetime": "DO_datetime"}
taxi_yellow = pd.read_csv(yellow_source).rename(columns=yellow_renames)
taxi_yellow["taxi"] = "yellow"
time_series_process(taxi_yellow, input_month)  # cleans taxi_yellow in place
taxi_yellow = taxi_yellow[final_col_set]  # fix the column order
# Appended without a header: the rows are added to the existing combined file.
taxi_yellow.to_csv(file_prefix + input_month + "_all.csv", mode='a', header=False, index=False)
# Release the frame before the next dataset is loaded.
del taxi_yellow

CPU times: user 54.1 s, sys: 6.04 s, total: 1min
Wall time: 1min 4s


FHVHV (too large to load at once, so it is read in chunks)

In [None]:
%%time
chunksize = 1e6  # number of rows per chunk requested from pd.read_csv
fhvhv_renames = {'pickup_datetime': 'PU_datetime', "dropoff_datetime": "DO_datetime", "hvfhs_license_num": "taxi"}
# Licence codes that are not keys of this mapping are turned into NaN by Series.map.
brand_by_licence = {"HV0003": "uber", "HV0004": "via", "HV0005": "lyft"}

taxi_fhvhv_chunk = pd.read_csv("/content/fhvhv_tripdata_2019-" + input_month + ".csv", chunksize=chunksize)
chunk_list = []
for chunk in taxi_fhvhv_chunk:
    chunk.rename(columns=fhvhv_renames, inplace=True)
    chunk["taxi"] = chunk["taxi"].map(brand_by_licence, na_action="ignore")
    time_series_process(chunk, input_month)  # cleans the chunk in place
    chunk_list.append(chunk)
# Drop each large object as soon as it is no longer needed.
del taxi_fhvhv_chunk
tmp = pd.concat(chunk_list)
del chunk_list
tmp = tmp[final_col_set]  # fix the column order
# Appended without a header: the rows are added to the existing combined file.
tmp.to_csv(file_prefix + input_month + "_all.csv", mode='a', header=False, index=False)
del tmp

CPU times: user 2min 27s, sys: 6.04 s, total: 2min 33s
Wall time: 2min 48s


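The resulting file is huge.

It looks like this (note: this sample also shows `weekNum`, `hour_of_day` and `min_cat`, which the current `final_col_set` does not keep):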

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>PULocationID</th>
      <th>DOLocationID</th>
      <th>PU_datetime</th>
      <th>DO_datetime</th>
      <th>taxi</th>
      <th>weekNum</th>
      <th>hour_of_day</th>
      <th>min_cat</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>74</td>
      <td>263</td>
      <td>2019-06-01 00:20:00</td>
      <td>2019-06-01 00:33:52</td>
      <td>green</td>
      <td>Saturday</td>
      <td>0</td>
      <td>20</td>
    </tr>
    <tr>
      <th>1</th>
      <td>75</td>
      <td>74</td>
      <td>2019-06-01 00:30:00</td>
      <td>2019-06-01 00:46:38</td>
      <td>green</td>
      <td>Saturday</td>
      <td>0</td>
      <td>30</td>
    </tr>
    <tr>
      <th>2</th>
      <td>75</td>
      <td>74</td>
      <td>2019-06-01 00:50:00</td>
      <td>2019-06-01 01:00:29</td>
      <td>green</td>
      <td>Saturday</td>
      <td>0</td>
      <td>50</td>
    </tr>
    <tr>
      <th>3</th>
      <td>255</td>
      <td>37</td>
      <td>2019-06-01 00:50:00</td>
      <td>2019-06-01 01:10:07</td>
      <td>green</td>
      <td>Saturday</td>
      <td>0</td>
      <td>50</td>
    </tr>
    <tr>
      <th>4</th>
      <td>41</td>
      <td>116</td>
      <td>2019-06-01 00:00:00</td>
      <td>2019-06-01 00:15:45</td>
      <td>green</td>
      <td>Saturday</td>
      <td>0</td>
      <td>0</td>
    </tr>
  </tbody>
</table>




```
html = taxi_all.head().to_html()
print(html)
```



## Aggregation and save

[Pandas groupby](https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/)

In [None]:
%%time

def write_to_aggregate_file(req_datetime, req_LocationID, file_suffix):
    """Count trips per (time, location, taxi) group and save the counts as CSV.

    Reads the notebook-level ``taxi_all`` frame, so that variable must be set
    before the call. The output is written to ``file_prefix + file_suffix``.

    Args:
        req_datetime: Name of the datetime column to group by.
        req_LocationID: Name of the location-ID column to group by.
        file_suffix: Text appended to ``file_prefix`` to form the output path.
    """
    group_keys = [req_datetime, req_LocationID, "taxi"]
    per_group = taxi_all.groupby(group_keys).agg(count=("taxi", "count"))
    # agg() leaves the group keys in the index; reset_index() turns them back
    # into ordinary columns next to the count column.
    taxi_agg_demand = per_group.reset_index()
    taxi_agg_demand.to_csv(file_prefix + file_suffix, index=False)

# One entry per aggregate: (datetime column, location column, file-name tail).
aggregate_specs = [
    ('PU_datetime', 'PULocationID', "_agg_demand.csv"),
    ('DO_datetime', 'DOLocationID', "_agg_supply.csv"),
]

# NOTE(review): the loop variable reuses the name `input_month`, so the value
# assigned in the configuration cell is overwritten and is "09" once this cell
# has run. Each month's "_all.csv" must already exist under `file_prefix`.
for input_month in ["07", "08", "09"]:
    taxi_all = pd.read_csv(file_prefix + input_month + "_all.csv")

    # Clean the combined file again and write it back over itself.
    time_series_process(taxi_all, input_month)
    taxi_all = taxi_all[final_col_set]  # fix the column order
    taxi_all.to_csv(file_prefix + input_month + "_all.csv", index=False)

    for req_datetime, req_LocationID, suffix_tail in aggregate_specs:
        file_suffix = input_month + suffix_tail
        write_to_aggregate_file(req_datetime, req_LocationID, file_suffix)

CPU times: user 11min 39s, sys: 31.7 s, total: 12min 11s
Wall time: 13min 11s


The aggregated tables look like this (supply, grouped by drop-off, first; then demand, grouped by pickup):
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>DOLocationID</th>
      <th>DO_datetime</th>
      <th>taxi</th>
      <th>count</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>2019-06-01 00:50:00</td>
      <td>lyft</td>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>2019-06-01 01:00:00</td>
      <td>lyft</td>
      <td>1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>1</td>
      <td>2019-06-01 01:20:00</td>
      <td>uber</td>
      <td>1</td>
    </tr>
    <tr>
      <th>3</th>
      <td>1</td>
      <td>2019-06-01 01:40:00</td>
      <td>lyft</td>
      <td>2</td>
    </tr>
    <tr>
      <th>4</th>
      <td>1</td>
      <td>2019-06-01 01:40:00</td>
      <td>uber</td>
      <td>2</td>
    </tr>
  </tbody>
</table>

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>PULocationID</th>
      <th>PU_datetime</th>
      <th>taxi</th>
      <th>count</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>2019-06-01 00:00:00</td>
      <td>via</td>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>2019-06-01 00:20:00</td>
      <td>via</td>
      <td>2</td>
    </tr>
    <tr>
      <th>2</th>
      <td>1</td>
      <td>2019-06-01 01:00:00</td>
      <td>via</td>
      <td>1</td>
    </tr>
    <tr>
      <th>3</th>
      <td>1</td>
      <td>2019-06-01 01:10:00</td>
      <td>via</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>1</td>
      <td>2019-06-01 02:00:00</td>
      <td>via</td>
      <td>1</td>
    </tr>
  </tbody>
</table>