In [1]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
all_trips = pd.read_csv("uber_tripdata_2021_w_ids.csv", compression="gzip")

In [3]:
print("Number of rows in dataframe:", len(all_trips)) # should be 121,645,919

Number of rows in dataframe: 121645919


In [4]:
all_trips.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone
0,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights
1,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose
2,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East
3,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West
4,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont


In [5]:
# separate pickup_datetime (and dropoff_datetime) into date, month, day of week, and hour

temp_1 = pd.DatetimeIndex(all_trips["pickup_datetime"])
all_trips["pickup_date"] = temp_1.date
all_trips["pickup_month"] = temp_1.month
all_trips["pickup_day"] = temp_1.weekday
all_trips["pickup_hour"] = temp_1.hour

# temp_2 = pd.DatetimeIndex(all_trips["dropoff_datetime"])
# all_trips["dropoff_date"] = temp_2.date
# all_trips["dropoff_month"] = temp_2.month
# all_trips["dropoff_day"] = temp_2.weekday
# all_trips["dropoff_hour"] = temp_2.hour

In [6]:
all_trips.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_day,pickup_hour
0,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,4,0
1,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,4,0
2,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,4,0
3,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,4,0
4,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,4,0


In [7]:
# drop pickup_datetime and dropoff_datetime since we already have other time indicators
all_trips.drop(columns=["pickup_datetime", "dropoff_datetime"], inplace=True)

In [8]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_day,pickup_hour
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,4,0
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,4,0
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,4,0
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,4,0
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,4,0


In [None]:
all_trips.dtypes

In [9]:
all_trips["pickup_date"] = all_trips["pickup_date"].astype("string")

In [10]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_day,pickup_hour
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,4,0
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,4,0
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,4,0
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,4,0
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,4,0


In [None]:
all_trips.dtypes

In [11]:
# label day as weekday (Mon, Tue, Wed, Thu, Fri) or weekend (Sat, Sun)
# referred to https://stackoverflow.com/a/57410089
all_trips.loc[all_trips["pickup_day"].isin([0,1,2,3,4]), "PU_type_of_day"] = "weekday"
all_trips.loc[all_trips["pickup_day"].isin([5,6]), "PU_type_of_day"] = "weekend"
# all_trips.loc[all_trips["dropoff_day"].isin([0,1,2,3,4]), "DO_type_of_day"] = "weekday"
# all_trips.loc[all_trips["dropoff_day"].isin([5,6]), "DO_type_of_day"] = "weekend"

In [12]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_day,pickup_hour,PU_type_of_day
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,4,0,weekday
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,4,0,weekday
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,4,0,weekday
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,4,0,weekday
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,4,0,weekday


In [17]:
# drop pickup_day (and dropoff_day) columns since we only want to compare weekday vs weekend
all_trips.drop(columns=["pickup_day"], inplace=True)

MemoryError: Unable to allocate 4.53 GiB for an array with shape (5, 121645919) and data type object

In [22]:
all_trips.head()

Unnamed: 0,PULocationID,DOLocationID,pickup_borough,pickup_zone,dropoff_borough,dropoff_zone,pickup_date,pickup_month,pickup_day,pickup_hour,PU_type_of_day
0,230,166,Manhattan,Times Sq/Theatre District,Manhattan,Morningside Heights,2021-01-01,1,4,0,weekday
1,152,167,Manhattan,Manhattanville,Bronx,Morrisania/Melrose,2021-01-01,1,4,0,weekday
2,233,142,Manhattan,UN/Turtle Bay South,Manhattan,Lincoln Square East,2021-01-01,1,4,0,weekday
3,142,143,Manhattan,Lincoln Square East,Manhattan,Lincoln Square West,2021-01-01,1,4,0,weekday
4,143,78,Manhattan,Lincoln Square West,Bronx,East Tremont,2021-01-01,1,4,0,weekday


In [23]:
compression_opts = dict(method="gzip")
all_trips.to_csv("./uber_tripdata_2021_labeled_days_with_dates.csv", index=False, compression=compression_opts)