In [2]:
import os
from datetime import datetime
import pandas as pd
import glob


import pyarrow as pa
import pyarrow.parquet as pq


In [None]:
date_range = pd.date_range(start='2021-01', end='2024-04', freq='ME')
print("Date range:", date_range)

In [None]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

In [None]:
# The raw TLC parquet files are expected in ../Datasets/taxi_parquets,
# resolved against the notebook's current working directory.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
print(f"Data directory: {data_dir}")

# Sanity check: list the directory so file presence and naming can be verified.
if os.path.exists(data_dir):
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")
else:
    print(f"Directory {data_dir} does not exist")

# Accumulator for the matching parquet paths (filled by the discovery loop).
all_files = []


In [None]:
# Start from an empty list so re-running this cell does not append duplicates.
all_files = []

for date in date_range:
    search_pattern = os.path.join(data_dir, f"yellow_{date.strftime('%Y_%m')}*.parquet")
    print(f"Searching for files with pattern: {search_pattern}")
    # The trailing wildcard also matches the "<name>_cleaned.parquet" files this
    # notebook writes into the same directory; keep only the raw monthly inputs.
    files = [
        path
        for path in sorted(glob.glob(search_pattern))
        if not path.endswith("_cleaned.parquet")
    ]
    if files:
        print(f"Files found for pattern {search_pattern}: {files}")
    all_files.extend(files)  # Add the found files to the list

print("All files found:", all_files)

In [None]:
print("Number of files found:", len(all_files))

In [None]:
print("File path for yellow_2021_01:", r"/data-analytics/Datasets/taxi_parquets/yellow_2021_01.parquet")

In [None]:
def renaming_yellow_to_standard(df):
    """
    Rename the yellow-taxi specific columns of ``df`` to the standard names used
    by the rest of the cleaning steps.

    The rename is applied in place, and the same DataFrame object is returned.

    Parameters:
        df (pd.DataFrame): Frame whose columns should be renamed.

    Returns:
        pd.DataFrame: ``df`` with ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` renamed. A non-DataFrame input is
        returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: The input is not a DataFrame in renaming_yellow_to_standard")
        # Hand the input back (as convert_float_to_int does) instead of implicitly
        # returning None, which would make the caller lose its object.
        return df

    df.rename(columns={
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone'
    }, inplace=True)
    return df


In [None]:
def convert_float_to_int(df):
    """
    Cast ``RatecodeID``, ``passenger_count``, ``pickup_zone`` and ``dropoff_zone``
    to ``int32``, replacing missing values with 0 first.

    Only columns that are present are touched. The columns are reassigned on
    ``df`` itself, and that same object is returned. A non-DataFrame input is
    returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: The input is not a DataFrame in convert_float_to_int")
        return df

    print("Converting float columns to int...")
    for column in ("RatecodeID", "passenger_count", "pickup_zone", "dropoff_zone"):
        if column in df.columns:
            df[column] = df[column].fillna(0).astype("int32")
    return df

In [None]:
print("Current Working Directory:", cwd)

# Directory holding the zone lookup table, resolved against the working directory.
taxi_zone_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")
print("Taxi Zone CSV Directory:", taxi_zone_dir)

# taxi_zone_dir is already absolute (it starts from os.getcwd()), so joining it
# with cwd again would have no effect on the resulting path.
taxi_zone_path = os.path.join(taxi_zone_dir, "taxi_zone_lookup.csv")

taxi_zone = pd.read_csv(
    taxi_zone_path,
    sep=",",
    keep_default_na=True,
    skipinitialspace=True,
    encoding="Windows-1252",
)

def valid_zones_1(df):
    """Print the number and the list of distinct ``LocationID`` values whose ``Borough`` is "Manhattan"."""
    is_manhattan = df["Borough"] == "Manhattan"
    zone_ids = df.loc[is_manhattan, "LocationID"].unique()

    print(f"Number of Unique Zones: {len(zone_ids)}")
    print("List of Unique Zones:", zone_ids)

valid_zones_1(taxi_zone)
    

In [None]:
# Define a function to get unique zones for Manhattan
def get_manhattan_zones(df):
    """Return the set of distinct ``LocationID`` values whose ``Borough`` is "Manhattan"."""
    in_manhattan = df["Borough"].eq("Manhattan")
    return set(df.loc[in_manhattan, "LocationID"].unique())

# Get the unique Manhattan zones from the taxi_zone DataFrame
manhattan_zones = get_manhattan_zones(taxi_zone)

In [None]:
# Define the function to check pickup and dropoff zones
def check_zones(df, manhattan_zones):
    """
    Print how many rows of ``df`` have neither ``pickup_zone`` nor ``dropoff_zone``
    in ``manhattan_zones``, followed by the first few such rows when any exist.
    """
    pickup_ok = df["pickup_zone"].isin(manhattan_zones)
    dropoff_ok = df["dropoff_zone"].isin(manhattan_zones)
    # De Morgan: "not pickup and not dropoff" is "not (pickup or dropoff)".
    invalid_zones = df[~(pickup_ok | dropoff_ok)]

    print(f"Invalid zones count: {invalid_zones.shape[0]}")

    if invalid_zones.empty:
        return

    print("Examples of rows with invalid zones:")
    print(invalid_zones.head())

In [None]:
def drop_yellow_invalid_rows(df, manhattan_zones):
    """
    Remove duplicate and invalid yellow-taxi trips and return the surviving rows.

    The ``airport_fee`` / ``Airport_fee`` column is dropped when present. A row is
    kept only when all of the following hold:

    * ``passenger_count`` is greater than 0 and less than 6
    * ``fare_amount`` and ``total_amount`` are greater than 0
    * ``extra``, ``mta_tax``, ``tip_amount``, ``tolls_amount``,
      ``improvement_surcharge`` and ``congestion_surcharge`` are not negative
    * ``pickup_datetime`` differs from ``dropoff_datetime``
    * ``trip_distance`` is greater than 0
    * ``RatecodeID`` is one of 1-6
    * ``payment_type`` is not 4
    * ``pickup_zone`` or ``dropoff_zone`` is in ``manhattan_zones``

    A non-DataFrame input is returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: Input is not a DataFrame")
        return df

    trips = df.drop_duplicates()

    # The fee column is checked under both spellings.
    for fee_column in ("airport_fee", "Airport_fee"):
        if fee_column in trips.columns:
            trips = trips.drop(fee_column, axis=1)

    # All conditions are row-wise, so one combined mask selects the same rows
    # as applying the filters one after another.
    keep = trips["passenger_count"].gt(0) & trips["passenger_count"].lt(6)
    keep &= trips["fare_amount"].gt(0) & trips["total_amount"].gt(0)
    for charge in ("extra", "mta_tax", "tip_amount", "tolls_amount",
                   "improvement_surcharge", "congestion_surcharge"):
        keep &= trips[charge].ge(0)
    keep &= trips["pickup_datetime"].ne(trips["dropoff_datetime"])
    keep &= trips["trip_distance"].gt(0)
    keep &= trips["RatecodeID"].isin([1, 2, 3, 4, 5, 6])
    keep &= trips["payment_type"].ne(4)
    keep &= trips["pickup_zone"].isin(manhattan_zones) | trips["dropoff_zone"].isin(manhattan_zones)

    return trips[keep]

In [None]:
def drop_yellow_columns(df):
    """
    Drop the vendor, fare and payment columns listed below from ``df`` in place
    and return the same DataFrame object. Columns that are absent are skipped.
    """
    unwanted = [
        "VendorID", "trip_distance", "RatecodeID", "store_and_fwd_flag", "payment_type",
        "fare_amount", "extra", "mta_tax", "improvement_surcharge", "tip_amount",
        "tolls_amount", "total_amount", "congestion_surcharge",
    ]

    # errors="ignore" makes drop() skip labels that do not exist in the frame.
    df.drop(columns=unwanted, errors="ignore", inplace=True)
    return df

In [None]:
def drop_missing_values(df):
    """
    Drop every row that contains at least one missing value and print how many
    rows were removed.

    Parameters:
        df (pd.DataFrame): The DataFrame to clean. Any other input is returned
            unchanged and nothing is printed.

    Returns:
        pd.DataFrame: A new DataFrame without the incomplete rows. Only the frame
        is returned; the number of dropped rows is printed, not returned.
    """
    if not isinstance(df, pd.DataFrame):
        return df

    cleaned = df.dropna()
    print(f"Number of rows dropped: {len(df) - len(cleaned)}")
    return cleaned

In [None]:
def clean_yellow_parquet_files(file_paths, manhattan_zones):
    """
    Clean each yellow-taxi parquet file, save a ``<name>_cleaned.parquet`` copy
    next to it, and return all cleaned trips concatenated into one DataFrame.

    Each file is passed, in order, through renaming_yellow_to_standard,
    convert_float_to_int, drop_yellow_invalid_rows, drop_yellow_columns and
    drop_missing_values.

    Parameters:
        file_paths: Iterable of parquet file paths (strings). Paths that already
            end in ``_cleaned.parquet`` are skipped.
        manhattan_zones: Zone IDs forwarded to drop_yellow_invalid_rows.

    Returns:
        pd.DataFrame: Concatenation of the cleaned frames with a fresh index.

    Raises:
        ValueError: If ``file_paths`` yields no file to clean.
    """
    cleaned_suffix = "_cleaned.parquet"
    cleaned_dfs = []

    for file_path in file_paths:
        # Outputs of an earlier run live beside the raw files and match the same
        # glob pattern; they no longer have the raw columns, so skip them.
        if file_path.endswith(cleaned_suffix):
            print(f"Skipping already cleaned file: {file_path}")
            continue

        print(f"Processing file: {file_path}")

        df = pd.read_parquet(file_path)
        print("DF Shape OLD", df.shape)

        df = renaming_yellow_to_standard(df)
        df = convert_float_to_int(df)
        df = drop_yellow_invalid_rows(df, manhattan_zones)
        df = drop_yellow_columns(df)
        df = drop_missing_values(df)

        print("DF Shape NEW", df.shape)

        cleaned_dfs.append(df)

        # splitext only splits off the final extension, so a ".parquet" elsewhere
        # in the path is left alone and the output never overwrites the input.
        stem, extension = os.path.splitext(file_path)
        cleaned_file_path = f"{stem}_cleaned{extension}"
        df.to_parquet(cleaned_file_path)
        print(f"Saved cleaned file: {cleaned_file_path}")

    if not cleaned_dfs:
        # pd.concat([]) raises the same exception type, but without context.
        raise ValueError("No yellow parquet files to clean: file_paths contained no raw files")

    return pd.concat(cleaned_dfs, ignore_index=True)

In [None]:
def get_parquet_files(data_dir, date_range, prefix="yellow"):
    """
    Collect the raw monthly parquet files of one taxi type.

    Parameters:
        data_dir (str): Directory that contains the parquet files.
        date_range: Iterable of timestamps; each one contributes the pattern
            ``<prefix>_<YYYY>_<MM>*.parquet``.
        prefix (str): File name prefix of the taxi type. Defaults to "yellow".

    Returns:
        list[str]: Matching paths in ``date_range`` order, sorted within each
        month. Files ending in ``_cleaned.parquet`` are excluded.
    """
    all_files = []

    for date in date_range:
        search_pattern = os.path.join(data_dir, f"{prefix}_{date.strftime('%Y_%m')}*.parquet")
        # glob returns matches in arbitrary order; sort for a reproducible list.
        matches = sorted(glob.glob(search_pattern))
        # The wildcard also matches the cleaned outputs written by this notebook
        # into the same directory; those must not be cleaned a second time.
        all_files.extend(path for path in matches if not path.endswith("_cleaned.parquet"))

    return all_files

# Usage example: locate the raw yellow files relative to the notebook's working
# directory rather than through a machine-specific absolute path, so the
# notebook runs from any checkout.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
# Month-start timestamps from 2021-01-01 up to the 2024-03-31 end bound.
date_range = pd.date_range(start="2021-01-01", end="2024-03-31", freq="MS")

file_paths = get_parquet_files(data_dir, date_range)

In [None]:
final_df = clean_yellow_parquet_files(file_paths, manhattan_zones)

# Save the concatenated frame into data_dir; the path is built from data_dir
# instead of repeating a machine-specific absolute path.
yellow_final_cleaned_path = os.path.join(data_dir, "yellow_final_cleaned.parquet")
final_df.to_parquet(yellow_final_cleaned_path)
print("Saved final concatenated DataFrame: yellow_final_cleaned.parquet")

In [None]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
print("Data Directory:", data_dir)

# Define the file paths relative to the data directory
yellow_final_cleaned_path = os.path.join(data_dir, "yellow_final_cleaned.parquet")

# Print the constructed file paths to verify
print("yellow_final_cleaned:", yellow_final_cleaned_path)


In [None]:
# Read the parquet files using the relative file paths
yellow_final_cleaned = pd.read_parquet(yellow_final_cleaned_path, engine='pyarrow')

In [None]:
yellow_final_cleaned.shape

In [None]:
yellow_final_cleaned.info()

In [None]:
yellow_final_cleaned.head(10)

In [None]:
# yellow_final_cleaned = calculate_zone_busy_in_chunks(yellow_final_cleaned)
# print(yellow_final_cleaned)

In [None]:
# data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")


# # Define the file path
# file_path = os.path.join(data_dir, "yellow_final_cleaned.csv")

# # Save the DataFrame to CSV
# yellow_final_cleaned.to_csv(file_path, index=False)

# print("DataFrame saved to:", file_path)

In [None]:
date_range = pd.date_range(start='2021-01', end='2024-04', freq='ME')
print("Date range:", date_range)

In [None]:
# Point data_dir at ../Datasets/taxi_parquets, resolved against the working directory.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
print(f"Data directory: {data_dir}")

# Sanity check: list the directory so file presence and naming can be verified.
if os.path.exists(data_dir):
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")
else:
    print(f"Directory {data_dir} does not exist")

# Accumulator for the matching parquet paths (filled by the discovery loop).
all_files = []


In [None]:
# Start from an empty list so re-running this cell does not append duplicates.
all_files = []

for date in date_range:
    search_pattern = os.path.join(data_dir, f"green_{date.strftime('%Y_%m')}*.parquet")
    print(f"Searching for files with pattern: {search_pattern}")
    # The trailing wildcard also matches the "<name>_cleaned.parquet" files this
    # notebook writes into the same directory; keep only the raw monthly inputs.
    files = [
        path
        for path in sorted(glob.glob(search_pattern))
        if not path.endswith("_cleaned.parquet")
    ]
    if files:
        print(f"Files found for pattern {search_pattern}: {files}")
    all_files.extend(files)  # Add the found files to the list

print("All files found:", all_files)
print("Number of files found:", len(all_files))

In [None]:
def renaming_green_to_standard(df):
    """
    Rename the green-taxi specific columns of ``df`` to the standard names used
    by the rest of the cleaning steps.

    The rename is applied in place, and the same DataFrame object is returned.

    Parameters:
        df (pd.DataFrame): Frame whose columns should be renamed.

    Returns:
        pd.DataFrame: ``df`` with ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` renamed. A non-DataFrame input is
        returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: The list contains non-DataFrame elements renaming_green_to_standard")
        # Hand the input back (as convert_float_to_int does) instead of implicitly
        # returning None, which would make the caller lose its object.
        return df

    df.rename(columns={
        'lpep_pickup_datetime': 'pickup_datetime',
        'lpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone'
    }, inplace=True)
    return df

In [None]:
def drop_green_invalid_rows(df, manhattan_zones):
    """
    Remove duplicate and invalid green-taxi trips and return the surviving rows.

    The ``ehail_fee`` column is dropped when present. A row is kept only when all
    of the following hold:

    * ``passenger_count`` is greater than 0 and less than 6
    * ``fare_amount`` and ``total_amount`` are greater than 0
    * ``extra``, ``mta_tax``, ``tip_amount``, ``tolls_amount``,
      ``improvement_surcharge`` and ``congestion_surcharge`` are not negative
    * ``pickup_datetime`` differs from ``dropoff_datetime``
    * ``trip_distance`` is greater than 0
    * ``RatecodeID`` is one of 1-6
    * ``payment_type`` is not 4
    * ``pickup_zone`` or ``dropoff_zone`` is in ``manhattan_zones``

    A non-DataFrame input is returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: Input is not a DataFrame drop_green_invalid_rows")
        return df

    trips = df.drop_duplicates()

    if "ehail_fee" in trips.columns:
        trips = trips.drop(columns=["ehail_fee"])

    # All conditions are row-wise, so one combined mask selects the same rows
    # as applying the filters one after another.
    keep = trips["passenger_count"].gt(0) & trips["passenger_count"].lt(6)
    keep &= trips["fare_amount"].gt(0) & trips["total_amount"].gt(0)
    for charge in ("extra", "mta_tax", "tip_amount", "tolls_amount",
                   "improvement_surcharge", "congestion_surcharge"):
        keep &= trips[charge].ge(0)
    keep &= trips["pickup_datetime"].ne(trips["dropoff_datetime"])
    keep &= trips["trip_distance"].gt(0)
    keep &= trips["RatecodeID"].isin([1, 2, 3, 4, 5, 6])
    keep &= trips["payment_type"].ne(4)
    keep &= trips["pickup_zone"].isin(manhattan_zones) | trips["dropoff_zone"].isin(manhattan_zones)

    return trips[keep]

In [None]:
def drop_green_columns(df):
    """
    Drop the vendor, fare, payment and trip-type columns listed below from ``df``
    in place and return the same DataFrame object. Absent columns are skipped.
    """
    unwanted = [
        "VendorID", "trip_distance", "RatecodeID", "store_and_fwd_flag", "payment_type",
        "fare_amount", "extra", "mta_tax", "improvement_surcharge", "tip_amount",
        "tolls_amount", "total_amount", "congestion_surcharge", "trip_type",
    ]

    # errors="ignore" makes drop() skip labels that do not exist in the frame.
    df.drop(columns=unwanted, errors="ignore", inplace=True)
    return df

In [None]:
def clean_green_parquet_files(file_paths, manhattan_zones):
    """
    Clean each green-taxi parquet file, save a ``<name>_cleaned.parquet`` copy
    next to it, and return all cleaned trips concatenated into one DataFrame.

    Each file is passed, in order, through renaming_green_to_standard,
    convert_float_to_int, drop_green_invalid_rows, drop_green_columns and
    drop_missing_values.

    Parameters:
        file_paths: Iterable of parquet file paths (strings). Paths that already
            end in ``_cleaned.parquet`` are skipped.
        manhattan_zones: Zone IDs forwarded to drop_green_invalid_rows.

    Returns:
        pd.DataFrame: Concatenation of the cleaned frames with a fresh index.

    Raises:
        ValueError: If ``file_paths`` yields no file to clean.
    """
    cleaned_suffix = "_cleaned.parquet"
    cleaned_dfs = []

    for file_path in file_paths:
        # Outputs of an earlier run live beside the raw files and match the same
        # glob pattern; they no longer have the raw columns, so skip them.
        if file_path.endswith(cleaned_suffix):
            print(f"Skipping already cleaned file: {file_path}")
            continue

        print(f"Processing file: {file_path}")

        df = pd.read_parquet(file_path)
        print("DF Shape OLD", df.shape)

        df = renaming_green_to_standard(df)
        df = convert_float_to_int(df)
        df = drop_green_invalid_rows(df, manhattan_zones)
        df = drop_green_columns(df)
        df = drop_missing_values(df)

        print("DF Shape NEW", df.shape)

        cleaned_dfs.append(df)

        # splitext only splits off the final extension, so a ".parquet" elsewhere
        # in the path is left alone and the output never overwrites the input.
        stem, extension = os.path.splitext(file_path)
        cleaned_file_path = f"{stem}_cleaned{extension}"
        df.to_parquet(cleaned_file_path)
        print(f"Saved cleaned file: {cleaned_file_path}")

    if not cleaned_dfs:
        # pd.concat([]) raises the same exception type, but without context.
        raise ValueError("No green parquet files to clean: file_paths contained no raw files")

    return pd.concat(cleaned_dfs, ignore_index=True)

In [None]:
def get_parquet_files(data_dir, date_range, prefix="green"):
    """
    Collect the raw monthly parquet files of one taxi type.

    Parameters:
        data_dir (str): Directory that contains the parquet files.
        date_range: Iterable of timestamps; each one contributes the pattern
            ``<prefix>_<YYYY>_<MM>*.parquet``.
        prefix (str): File name prefix of the taxi type. Defaults to "green".

    Returns:
        list[str]: Matching paths in ``date_range`` order, sorted within each
        month. Files ending in ``_cleaned.parquet`` are excluded.
    """
    all_files = []

    for date in date_range:
        search_pattern = os.path.join(data_dir, f"{prefix}_{date.strftime('%Y_%m')}*.parquet")
        # glob returns matches in arbitrary order; sort for a reproducible list.
        matches = sorted(glob.glob(search_pattern))
        # The wildcard also matches the cleaned outputs written by this notebook
        # into the same directory; those must not be cleaned a second time.
        all_files.extend(path for path in matches if not path.endswith("_cleaned.parquet"))

    return all_files

# Locate the raw green files relative to the notebook's working directory
# rather than through a machine-specific absolute path.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
# Month-start timestamps from 2021-01-01 up to the 2024-03-31 end bound.
date_range = pd.date_range(start="2021-01-01", end="2024-03-31", freq="MS")

file_paths = get_parquet_files(data_dir, date_range)

In [None]:
final_green_df = clean_green_parquet_files(file_paths, manhattan_zones)

# Save the concatenated frame into data_dir; the path is built from data_dir
# instead of repeating a machine-specific absolute path.
green_final_cleaned_path = os.path.join(data_dir, "green_final_cleaned.parquet")
final_green_df.to_parquet(green_final_cleaned_path)
print("Saved final concatenated DataFrame: green_final_cleaned.parquet")

In [None]:
""" 
Begin by loading 1 parquet file as pandas dataframe from each of the 4 TLC genres.
Error catching across OSes implemented: cwd, data directory, paths etc.
"""

cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
print("Data Directory:", data_dir)

# Define the file paths relative to the data directory
green_final_cleaned_path = os.path.join(data_dir, "green_final_cleaned.parquet")

# Print the constructed file paths to verify
print("green_final_cleaned:", green_final_cleaned_path)


In [None]:
# Read the parquet files using the relative file paths
green_final_cleaned = pd.read_parquet(green_final_cleaned_path, engine='pyarrow')

In [None]:
green_final_cleaned.shape

In [None]:
green_final_cleaned.info()

In [None]:
green_final_cleaned.head(10)

In [None]:
# green_final_cleaned = calculate_zone_busy_in_chunks(green_final_cleaned)
# print(green_final_cleaned)

In [None]:
# data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")


# # Define the file path
# file_path = os.path.join(data_dir, "green_final_cleaned.csv")

# # Save the DataFrame to CSV
# green_final_cleaned.to_csv(file_path, index=False)

# print("DataFrame saved to:", file_path)

In [5]:
# Point data_dir at ../Datasets/taxi_parquets, resolved against the working directory.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
print(f"Data directory: {data_dir}")

# Sanity check: list the directory contents and how many entries it holds.
if os.path.exists(data_dir):
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")
    print(len(all_files_in_dir))
else:
    print(f"Directory {data_dir} does not exist")
all_files = []


Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets: ['fhvhv_2021_01.parquet', 'fhvhv_2021_02.parquet', 'fhvhv_2021_03.parquet', 'fhvhv_2021_04.parquet', 'fhvhv_2021_05.parquet', 'fhvhv_2021_06.parquet', 'fhvhv_2021_07.parquet', 'fhvhv_2021_08.parquet', 'fhvhv_2021_09.parquet', 'fhvhv_2021_10.parquet', 'fhvhv_2021_11.parquet', 'fhvhv_2021_12.parquet', 'fhvhv_2022_01.parquet', 'fhvhv_2022_02.parquet', 'fhvhv_2022_03.parquet', 'fhvhv_2022_04.parquet', 'fhvhv_2022_05.parquet', 'fhvhv_2022_06.parquet', 'fhvhv_2022_07.parquet', 'fhvhv_2022_08.parquet', 'fhvhv_2022_09.parquet', 'fhvhv_2022_10.parquet', 'fhvhv_2022_11.parquet', 'fhvhv_2022_12.parquet', 'fhvhv_2023_01.parquet', 'fhvhv_2023_02.parquet', 'fhvhv_2023_03.parquet', 'fhvhv_2023_04.parquet', 'fhvhv_2023_05.parquet', 'fhvhv_2023_06.p

In [6]:
# Paths of the two concatenated, cleaned outputs inside data_dir.
yellow_final_cleaned_path, green_final_cleaned_path = (
    os.path.join(data_dir, file_name)
    for file_name in ("yellow_final_cleaned.parquet", "green_final_cleaned.parquet")
)

# Load both frames with the pyarrow engine.
yellow_final_cleaned = pd.read_parquet(yellow_final_cleaned_path, engine='pyarrow')
green_final_cleaned = pd.read_parquet(green_final_cleaned_path, engine='pyarrow')

In [14]:
cutoff_date = pd.Timestamp('2021-01-01')

# Rows stamped before the cutoff, per taxi type and per timestamp column.
yellow_before_cutoff_pickup = yellow_final_cleaned.loc[yellow_final_cleaned['pickup_datetime'].lt(cutoff_date)]
yellow_before_cutoff_dropoff = yellow_final_cleaned.loc[yellow_final_cleaned['dropoff_datetime'].lt(cutoff_date)]
green_before_cutoff_pickup = green_final_cleaned.loc[green_final_cleaned['pickup_datetime'].lt(cutoff_date)]
green_before_cutoff_dropoff = green_final_cleaned.loc[green_final_cleaned['dropoff_datetime'].lt(cutoff_date)]

# Each heading is followed by its frame on the next line (sep="\n").
print("Yellow records with pickup_datetime before 2021-01-01:", yellow_before_cutoff_pickup, sep="\n")
print("\nYellow records with dropoff_datetime before 2021-01-01:", yellow_before_cutoff_dropoff, sep="\n")
print("\nGreen records with pickup_datetime before 2021-01-01:", green_before_cutoff_pickup, sep="\n")
print("\nGreen records with dropoff_datetime before 2021-01-01:", green_before_cutoff_dropoff, sep="\n")

Yellow records with pickup_datetime before 2021-01-01:
              pickup_datetime    dropoff_datetime  passenger_count  \
410       2020-12-31 23:59:06 2021-01-01 00:02:51                2   
476       2020-12-31 21:40:20 2020-12-31 22:16:09                1   
491       2020-12-31 23:57:17 2021-01-01 00:17:40                1   
568       2020-12-31 18:41:27 2021-01-01 17:52:03                1   
798       2020-12-31 23:55:40 2021-01-01 00:24:51                1   
...                       ...                 ...              ...   
94442717  2002-12-31 22:59:39 2002-12-31 23:05:41                1   
95496065  2009-01-01 23:30:39 2009-01-02 00:01:39                1   
97101103  2008-12-31 22:52:49 2008-12-31 23:04:09                1   
97753088  2009-01-01 00:02:13 2009-01-01 00:48:28                1   
100988757 2002-12-31 22:17:10 2002-12-31 22:42:24                1   

           pickup_zone  dropoff_zone  
410                237           236  
476                249    

In [7]:
# Renaming columns in green_final_cleaned to match yellow_final_cleaned order
green_final_cleaned.rename(columns={
    'pickup_datetime': 'pickup_datetime',
    'dropoff_datetime': 'dropoff_datetime',
    'pickup_zone': 'pickup_zone',
    'dropoff_zone': 'dropoff_zone',
    'passenger_count': 'passenger_count'
}, inplace=True)


In [8]:
# Combine the DataFrames
combined_df = pd.concat([yellow_final_cleaned, green_final_cleaned], ignore_index=True)

# Check the combined DataFrame
print(combined_df.info())
print(combined_df.head(10))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103683400 entries, 0 to 103683399
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   pickup_datetime   datetime64[us]
 1   dropoff_datetime  datetime64[us]
 2   passenger_count   int32         
 3   pickup_zone       int32         
 4   dropoff_zone      int32         
dtypes: datetime64[us](2), int32(3)
memory usage: 2.7 GB
None
      pickup_datetime    dropoff_datetime  passenger_count  pickup_zone  \
0 2021-01-01 00:30:10 2021-01-01 00:36:12                1          142   
1 2021-01-01 00:51:20 2021-01-01 00:52:19                1          238   
2 2021-01-01 00:31:49 2021-01-01 00:48:21                1           68   
3 2021-01-01 00:16:29 2021-01-01 00:24:30                1          224   
4 2021-01-01 00:12:29 2021-01-01 00:30:34                1           90   
5 2021-01-01 00:26:12 2021-01-01 00:39:46                2          263   
6 2021-01-01 00:15:52 2021-01-0

In [9]:
def calculate_zone_busy_in_chunks(df, chunk_size=10**6):
    """
    Aggregate trips into passenger totals per zone and hour, processing ``df``
    in slices of ``chunk_size`` rows.

    Every trip contributes its ``passenger_count`` twice: once to its pickup zone
    at the pickup time and once to its dropoff zone at the dropoff time. Times
    are rounded to the nearest hour before grouping.

    Parameters:
        df (pd.DataFrame): Needs ``pickup_datetime``, ``dropoff_datetime``,
            ``passenger_count``, ``pickup_zone`` and ``dropoff_zone``.
        chunk_size (int): Number of rows handled per iteration.

    Returns:
        pd.DataFrame: One row per (hour, zone) with the columns
        ``datetime_formatted`` ('%Y-%m-%d-%H'), ``hour``, ``day_of_week``,
        ``week`` (ISO week), ``month`` (0-11), ``day_of_month``, ``year_month``,
        ``zone`` and the summed ``passenger_count``.
    """
    group_keys = ['datetime_formatted', 'hour', 'day_of_week', 'week', 'month',
                  'day_of_month', 'year_month', 'zone']

    def _sum_passengers(events):
        # Total passengers per combination of the grouping keys.
        return events.groupby(group_keys, as_index=False)['passenger_count'].sum()

    partial_totals = []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]

        # Stack pickups and dropoffs as generic (datetime, zone) events.
        pickups = chunk[['pickup_datetime', 'passenger_count', 'pickup_zone']].rename(
            columns={'pickup_datetime': 'datetime', 'pickup_zone': 'zone'})
        dropoffs = chunk[['dropoff_datetime', 'passenger_count', 'dropoff_zone']].rename(
            columns={'dropoff_datetime': 'datetime', 'dropoff_zone': 'zone'})
        events = pd.concat([pickups, dropoffs])

        # All time components are derived from the hour-rounded timestamp.
        stamp = events['datetime'].dt.round('h')
        events = events.assign(
            datetime=stamp,
            datetime_formatted=stamp.dt.strftime('%Y-%m-%d-%H'),
            hour=stamp.dt.hour,
            day_of_week=stamp.dt.dayofweek,
            week=stamp.dt.isocalendar().week,
            month=stamp.dt.month - 1,  # 0-11 for Jan-Dec
            day_of_month=stamp.dt.day,
            year_month=stamp.dt.to_period('M').astype(str),
        )

        partial_totals.append(_sum_passengers(events))

    # The same (hour, zone) key can occur in several chunks, so sum once more.
    return _sum_passengers(pd.concat(partial_totals, ignore_index=True))

In [10]:
# 15 minute runtime (author's note).
# NOTE: this rebinds combined_df from the trip-level frame to the aggregated
# per-zone/per-hour frame. Re-running this cell without first re-running the
# cell that builds combined_df with pd.concat fails, because the aggregated
# frame no longer has the pickup/dropoff columns.
combined_df = calculate_zone_busy_in_chunks(combined_df)
print(combined_df)

        datetime_formatted  hour  day_of_week  week  month  day_of_month  \
0            2001-01-01-00     0            0     1      0             1   
1            2001-01-01-00     0            0     1      0             1   
2            2001-01-01-00     0            0     1      0             1   
3            2001-01-01-00     0            0     1      0             1   
4            2001-01-01-00     0            0     1      0             1   
...                    ...   ...          ...   ...    ...           ...   
3344022      2028-12-07-05     5            3    49     11             7   
3344023      2029-05-05-09     9            5    18      4             5   
3344024      2029-05-05-12    12            5    18      4             5   
3344025      2098-09-11-02     2            3    37      8            11   
3344026      2098-09-11-03     3            3    37      8            11   

        year_month  zone  passenger_count  
0          2001-01    43                1  

In [11]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3344027 entries, 0 to 3344026
Data columns (total 9 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   datetime_formatted  object
 1   hour                int32 
 2   day_of_week         int32 
 3   week                UInt32
 4   month               int32 
 5   day_of_month        int32 
 6   year_month          object
 7   zone                int32 
 8   passenger_count     int32 
dtypes: UInt32(1), int32(6), object(2)
memory usage: 143.5+ MB


In [12]:
combined_df.head(10)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
0,2001-01-01-00,0,0,1,0,1,2001-01,43,1
1,2001-01-01-00,0,0,1,0,1,2001-01,48,1
2,2001-01-01-00,0,0,1,0,1,2001-01,132,2
3,2001-01-01-00,0,0,1,0,1,2001-01,161,2
4,2001-01-01-00,0,0,1,0,1,2001-01,239,2
5,2001-01-01-01,1,0,1,0,1,2001-01,97,1
6,2001-01-01-01,1,0,1,0,1,2001-01,143,1
7,2001-01-01-01,1,0,1,0,1,2001-01,230,2
8,2001-01-01-01,1,0,1,0,1,2001-01,237,1
9,2001-01-01-02,2,0,1,0,1,2001-01,231,2


In [15]:
combined_df.tail(10)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
3344017,2024-04-01-20,20,0,14,3,1,2024-04,264,1
3344018,2024-04-01-22,22,0,14,3,1,2024-04,161,3
3344019,2024-04-01-22,22,0,14,3,1,2024-04,249,1
3344020,2024-04-02-09,9,1,14,3,2,2024-04,132,1
3344021,2028-12-07-05,5,3,49,11,7,2028-12,132,3
3344022,2028-12-07-05,5,3,49,11,7,2028-12,170,3
3344023,2029-05-05-09,9,5,18,4,5,2029-05,231,1
3344024,2029-05-05-12,12,5,18,4,5,2029-05,249,1
3344025,2098-09-11-02,2,3,37,8,11,2098-09,163,5
3344026,2098-09-11-03,3,3,37,8,11,2098-09,231,5


In [17]:
# Parse the '%Y-%m-%d-%H' hour labels back into real timestamps.
combined_df['datetime_formatted'] = pd.to_datetime(combined_df['datetime_formatted'], format='%Y-%m-%d-%H')

start_date = pd.Timestamp('2021-01-01')
end_date = pd.Timestamp('2024-04-30 23:59:59')

# Keep only rows inside [start_date, end_date] (both bounds inclusive).
hour_stamp = combined_df['datetime_formatted']
in_window = hour_stamp.ge(start_date) & hour_stamp.le(end_date)
combined_df = combined_df[in_window]

# Inspect the end of the filtered frame.
print(combined_df.tail(10))

         datetime_formatted  hour  day_of_week  week  month  day_of_month  \
3344011 2024-04-01 20:00:00    20            0    14      3             1   
3344012 2024-04-01 20:00:00    20            0    14      3             1   
3344013 2024-04-01 20:00:00    20            0    14      3             1   
3344014 2024-04-01 20:00:00    20            0    14      3             1   
3344015 2024-04-01 20:00:00    20            0    14      3             1   
3344016 2024-04-01 20:00:00    20            0    14      3             1   
3344017 2024-04-01 20:00:00    20            0    14      3             1   
3344018 2024-04-01 22:00:00    22            0    14      3             1   
3344019 2024-04-01 22:00:00    22            0    14      3             1   
3344020 2024-04-02 09:00:00     9            1    14      3             2   

        year_month  zone  passenger_count  
3344011    2024-04   125                1  
3344012    2024-04   140                1  
3344013    2024-04  

In [20]:
# Export target: ../Datasets/taxi_other/combined_df.csv, resolved against the
# working directory. Note that data_dir now points at taxi_other.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")
file_path = os.path.join(data_dir, "combined_df.csv")

# Write the frame without its index column and report where it went.
combined_df.to_csv(file_path, index=False)
print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\combined_df.csv
