In [19]:
import os
import pandas as pd
import datetime
import dropoff_finder

In [49]:
# Run this cell to combine ALL cleaned data into ONE parquet file, written to the folder all_cleaned_data.

def get_cleaned_df(folder_path="cleaned_data/", output_dir="all_cleaned_data"):
    """
    Combine every parquet file found in a folder into a single parquet file.

    input:
        folder_path: folder holding the individual cleaned parquet files
        output_dir: folder the combined file is written to (created if missing)

    returns:
        None. Writes <output_dir>/all_cleaned_data.parquet. If folder_path
        contains no parquet files, an empty DataFrame is written instead.
    """
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort them so the row order
    # of the combined file is the same on every run and every machine
    parquet_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

    # Load each file, then stack them into one DataFrame with a fresh 0..n-1 index
    dataframes = [pd.read_parquet(os.path.join(folder_path, file)) for file in parquet_files]
    df = pd.concat(dataframes, ignore_index=True) if dataframes else pd.DataFrame()

    df.to_parquet(os.path.join(output_dir, "all_cleaned_data.parquet"), index=False)

# Build (or rebuild) all_cleaned_data/all_cleaned_data.parquet from the parquet files in cleaned_data/
get_cleaned_df()

In [11]:
# Load the combined cleaned-data parquet file into a DataFrame
df = pd.read_parquet("all_cleaned_data/all_cleaned_data.parquet")

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,trip_duration,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,total_amount,tip_amount,tolls_amount,congestion_surcharge
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,19.800000,1.0,1.72,1.0,186,79,2,17.70,22.70,0.00,0.00,2.5
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,6.600000,1.0,1.80,1.0,140,236,1,10.00,18.75,3.75,0.00,2.5
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,17.916667,1.0,4.70,1.0,236,79,1,23.30,31.30,3.00,0.00,2.5
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,8.300000,1.0,1.40,1.0,79,211,1,10.00,17.00,2.00,0.00,2.5
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,6.100000,1.0,0.80,1.0,211,148,1,7.90,16.10,3.20,0.00,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39135369,1,2024-12-31 23:21:36,2024-12-31 23:32:10,10.566667,1.0,1.60,0.0,188,61,0,13.28,14.78,0.00,0.00,0.0
39135370,1,2024-12-31 23:14:53,2024-12-31 23:35:13,20.333333,1.0,2.90,0.0,145,164,0,21.94,32.88,0.00,6.94,0.0
39135371,2,2024-12-31 23:03:16,2024-12-31 23:28:35,25.316667,1.0,3.94,0.0,181,255,0,4.46,11.15,5.19,0.00,0.0
39135372,1,2024-12-31 23:15:33,2024-12-31 23:36:29,20.933333,1.0,4.20,0.0,165,61,0,27.07,28.57,0.00,0.00,0.0


In [37]:
def DOPU_given_timerange(df, zone, start, end, isDropoff=True):
    """
    Average number of dropoffs (or pickups) per day in a zone within a time-of-day window.

    input:
        df: dataframe to extract the counts from; must contain the columns
            DOLocationID / dropoff_datetime (when isDropoff is True) or
            PULocationID / pickup_datetime (when isDropoff is False)
        zone: integer zone id
        start: start of the window (datetime.time), inclusive
        end: end of the window (datetime.time), inclusive
        isDropoff: bool. If True, count dropoffs using DOLocationID and
            dropoff_datetime; if False, count pickups using PULocationID and
            pickup_datetime

    returns:
        integer - mean number of dropoffs/pickups per day in the zone during the
        time range, truncated toward zero. The mean is taken only over days that
        have at least one matching trip. Returns 0 when nothing matches (which is
        always the case when start is later than end).
    """
    # Use the same event (dropoff or pickup) for the zone filter, the time
    # filter and the per-day grouping, so pickups are counted on their pickup
    # date rather than on the date the trip ended
    if isDropoff:
        zone_col, time_col = "DOLocationID", "dropoff_datetime"
    else:
        zone_col, time_col = "PULocationID", "pickup_datetime"

    # timestamps of every dropoff/pickup that happened in the requested zone
    timestamps = df.loc[df[zone_col] == zone, time_col]

    # remove the date so that only the time of day is compared with the window
    time_of_day = timestamps.dt.time
    in_range = timestamps[(time_of_day >= start) & (time_of_day <= end)]

    # count matching dropoffs/pickups for each calendar day
    events_per_day = in_range.groupby(in_range.dt.date).size()

    # return mean dropoffs/pickups per day
    return int(events_per_day.mean()) if not events_per_day.empty else 0



In [48]:
# Usage: average number of pickups per day (isDropoff=False) in zone 10
# with a pickup time between 00:00:00 and 23:00:00 (both inclusive)
start = datetime.time(0,0,0)
end = datetime.time(23,0,0)
n = DOPU_given_timerange(df, 10, start, end, False)
n

35