In [1]:
import os
from datetime import datetime
import pandas as pd
import glob


import pyarrow as pa
import pyarrow.parquet as pq


In [2]:
# One timestamp per month-end; only the year and month of each entry are used
# further down, to build the file-name pattern for that month's parquet file.
date_range = pd.date_range(start='2021-01', end='2024-04', freq='ME')
print("Date range:", date_range)

Date range: DatetimeIndex(['2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29', '2024-03-31'],
              dtype='datetime64[ns]', freq='ME')


In [3]:
# Remember the directory the notebook is running from; a later cell prints it again.
cwd = os.getcwd()
print("Current Working Directory:", cwd)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning


In [4]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
print(f"Data directory: {data_dir}")

# Show the folder's contents when it is there; otherwise report that it is missing
if os.path.exists(data_dir):
    # The listing makes it easy to confirm file names before searching for them
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")
else:
    print(f"Directory {data_dir} does not exist")

# Empty accumulator for the paths gathered by the search loop that follows
all_files = []


Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets: ['fhvhv_2021_01.parquet', 'fhvhv_2021_02.parquet', 'fhvhv_2021_03.parquet', 'fhvhv_2021_04.parquet', 'fhvhv_2021_05.parquet', 'fhvhv_2021_06.parquet', 'fhvhv_2021_07.parquet', 'fhvhv_2021_08.parquet', 'fhvhv_2021_09.parquet', 'fhvhv_2021_10.parquet', 'fhvhv_2021_11.parquet', 'fhvhv_2021_12.parquet', 'fhvhv_2022_01.parquet', 'fhvhv_2022_02.parquet', 'fhvhv_2022_03.parquet', 'fhvhv_2022_04.parquet', 'fhvhv_2022_05.parquet', 'fhvhv_2022_06.parquet', 'fhvhv_2022_07.parquet', 'fhvhv_2022_08.parquet', 'fhvhv_2022_09.parquet', 'fhvhv_2022_10.parquet', 'fhvhv_2022_11.parquet', 'fhvhv_2022_12.parquet', 'fhvhv_2023_01.parquet', 'fhvhv_2023_02.parquet', 'fhvhv_2023_03.parquet', 'fhvhv_2023_04.parquet', 'fhvhv_2023_05.parquet', 'fhvhv_2023_06.p

In [5]:
# For every month in date_range, collect the yellow-taxi parquet files whose
# names start with "yellow_<YYYY>_<MM>" and append them to all_files.
# NOTE(review): the trailing wildcard would also match the "*_cleaned.parquet"
# files written later in this notebook if they already exist in data_dir.
for date in date_range:
    search_pattern = os.path.join(data_dir, f"yellow_{date.strftime('%Y_%m')}*.parquet")
    print(f"Searching for files with pattern: {search_pattern}")
    files = glob.glob(search_pattern)
    if files:
        print(f"Files found for pattern {search_pattern}: {files}")
    all_files.extend(files)  # Add the found files to the list

print("All files found:", all_files)

Searching for files with pattern: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_01*.parquet
Files found for pattern c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_01*.parquet: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\cleaning\\..\\Datasets\\taxi_parquets\\yellow_2021_01.parquet']
Searching for files with pattern: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_02*.parquet
Files found for pattern c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_02*.parquet: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\cleaning\\..\\Datasets\\taxi_parquets\\yellow_2021_02.parquet']
Searching for files with pattern: c:\Users

In [6]:
# Sanity check: how many yellow-taxi files the search above collected.
print("Number of files found:", len(all_files))

Number of files found: 39


In [7]:
# Prints a fixed label and a path literal; nothing is read from disk here.
print("File path for yellow_2021_01:", r"/data-analytics/Datasets/taxi_parquets/yellow_2021_01.parquet")

File path for yellow_2021_01: /data-analytics/Datasets/taxi_parquets/yellow_2021_01.parquet


In [8]:
def renaming_yellow_to_standard(df):
    """ 
    Rename the yellow-taxi specific columns of a DataFrame to the standard names
    used by the rest of the cleaning functions.

    The rename is applied in place, so the DataFrame that is passed in is modified.

    Parameters:
        df (pd.DataFrame): Trip records with the yellow-taxi column names.

    Returns:
        The object that was passed in. A DataFrame comes back with
        'pickup_datetime', 'dropoff_datetime', 'pickup_zone' and 'dropoff_zone'
        columns; any other input is returned untouched after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: The input is not a DataFrame in renaming_yellow_to_standard")
        # Hand the input back instead of falling through and returning None,
        # which matches what convert_float_to_int does for non-DataFrame input.
        return df

    df.rename(columns={
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone'
    }, inplace=True)
    return df


In [9]:
def convert_float_to_int(df):
    """ 
    Cast the ID and count columns of a DataFrame to 32-bit integers.

    Each of "RatecodeID", "passenger_count", "pickup_zone" and "dropoff_zone"
    that is present has its missing values replaced by 0 and is then cast to
    "int32". The columns are overwritten on the DataFrame that was passed in.

    Parameters:
        df (pd.DataFrame): Trip records; columns that are absent are skipped.

    Returns:
        The object that was passed in. Non-DataFrame input is returned unchanged
        after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: The input is not a DataFrame in convert_float_to_int")
        return df

    print("Converting float columns to int...")
    for column in ("RatecodeID", "passenger_count", "pickup_zone", "dropoff_zone"):
        if column in df.columns:
            # Missing values become 0 so the integer cast cannot fail on NaN
            df[column] = df[column].fillna(0).astype("int32")
    return df

In [10]:
# cwd was captured in an earlier cell; printed again here for reference
print("Current Working Directory:", cwd)

# Folder that holds the taxi zone lookup table, one level above the working directory
taxi_zone_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")

# Show the folder so the path can be checked by eye
print("Taxi Zone CSV Directory:", taxi_zone_dir)

# Full path of the lookup CSV.
# NOTE(review): taxi_zone_dir is already built from os.getcwd(), so the leading
# cwd argument looks redundant — confirm before removing.
taxi_zone_path = os.path.join(cwd, taxi_zone_dir, "taxi_zone_lookup.csv")

# Load the zone lookup table (comma separated, Windows-1252 encoded)
taxi_zone = pd.read_csv(taxi_zone_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding="Windows-1252")

def valid_zones_1(df):
    """
    Print how many distinct Manhattan LocationIDs a zone table contains,
    followed by the IDs themselves.

    Parameters:
        df (pd.DataFrame): Table with "Borough" and "LocationID" columns.

    Returns:
        None. The summary is only printed.
    """
    is_manhattan = df["Borough"] == "Manhattan"
    unique_zones = df.loc[is_manhattan, "LocationID"].unique()

    print(f"Number of Unique Zones: {len(unique_zones)}")
    print("List of Unique Zones:", unique_zones)

# Print the Manhattan zone summary for the lookup table loaded above
valid_zones_1(taxi_zone)
    

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning
Taxi Zone CSV Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other
Number of Unique Zones: 69
List of Unique Zones: [  4  12  13  24  41  42  43  45  48  50  68  74  75  79  87  88  90 100
 103 104 105 107 113 114 116 120 125 127 128 137 140 141 142 143 144 148
 151 152 153 158 161 162 163 164 166 170 186 194 202 209 211 224 229 230
 231 232 233 234 236 237 238 239 243 244 246 249 261 262 263]


In [11]:
# Collect the LocationIDs of every zone whose borough is Manhattan
def get_manhattan_zones(df):
    """
    Return the distinct Manhattan LocationIDs of a zone table as a set.

    Parameters:
        df (pd.DataFrame): Table with "Borough" and "LocationID" columns.

    Returns:
        set: LocationID values of the rows whose "Borough" equals "Manhattan".
    """
    in_manhattan = df["Borough"].eq("Manhattan")
    return set(df.loc[in_manhattan, "LocationID"].unique())

# Build the set of Manhattan LocationIDs once; the cleaning functions defined
# below receive it as an argument.
manhattan_zones = get_manhattan_zones(taxi_zone)

In [12]:
# Report the trips that neither start nor end in a Manhattan zone
def check_zones(df, manhattan_zones):
    """
    Print how many rows have both zones outside Manhattan, plus a few examples.

    Parameters:
        df (pd.DataFrame): Trips with "pickup_zone" and "dropoff_zone" columns.
        manhattan_zones: Collection of zone IDs that count as Manhattan.

    Returns:
        None. The report is only printed.
    """
    pickup_ok = df["pickup_zone"].isin(manhattan_zones)
    dropoff_ok = df["dropoff_zone"].isin(manhattan_zones)

    # A row is invalid only when neither end of the trip is in Manhattan
    invalid_zones = df[~(pickup_ok | dropoff_ok)]

    print(f"Invalid zones count: {invalid_zones.shape[0]}")

    if invalid_zones.empty:
        return

    print("Examples of rows with invalid zones:")
    print(invalid_zones.head())  # first few offending rows

In [13]:
def drop_yellow_invalid_rows(df, manhattan_zones):
    """
    Remove duplicate and implausible trips from a yellow-taxi DataFrame.

    After exact duplicates and the "airport_fee" / "Airport_fee" column are
    dropped, a row is kept only when all of the following hold:
      * passenger_count is greater than 0 and less than 6
      * fare_amount and total_amount are greater than 0
      * extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge and
        congestion_surcharge are not negative
      * pickup_datetime differs from dropoff_datetime
      * trip_distance is greater than 0
      * RatecodeID is one of 1-6
      * payment_type is not 4
      * the pickup zone or the dropoff zone is in manhattan_zones

    No check is made that the timestamps fall in a particular period or that
    the dropoff happens after the pickup.

    Parameters:
        df (pd.DataFrame): Trips using the standard column names.
        manhattan_zones: Collection of zone IDs that count as Manhattan.

    Returns:
        pd.DataFrame: The surviving rows, keeping their original index labels.
        Non-DataFrame input is returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: Input is not a DataFrame")
        return df

    trips = df.drop_duplicates()

    # The fee column is spelled both ways; remove whichever spelling is present
    fee_columns = [name for name in ("airport_fee", "Airport_fee") if name in trips.columns]
    if fee_columns:
        trips = trips.drop(columns=fee_columns)

    # Build one boolean mask instead of filtering the frame once per rule
    keep = trips["passenger_count"].gt(0) & trips["passenger_count"].lt(6)
    keep &= trips["fare_amount"].gt(0) & trips["total_amount"].gt(0)
    for charge in ("extra", "mta_tax", "tip_amount", "tolls_amount",
                   "improvement_surcharge", "congestion_surcharge"):
        keep &= trips[charge].ge(0)
    keep &= trips["pickup_datetime"].ne(trips["dropoff_datetime"])
    keep &= trips["trip_distance"].gt(0)
    keep &= trips["RatecodeID"].isin([1, 2, 3, 4, 5, 6])
    keep &= trips["payment_type"].ne(4)
    keep &= trips["pickup_zone"].isin(manhattan_zones) | trips["dropoff_zone"].isin(manhattan_zones)

    return trips[keep]

In [14]:
def drop_yellow_columns(df):
    """
    Remove the fare, vendor and payment columns that are not needed after cleaning.

    The columns are removed in place from the DataFrame that is passed in;
    names that are not present are ignored.

    Parameters:
        df (pd.DataFrame): Cleaned yellow-taxi trips.

    Returns:
        pd.DataFrame: The same DataFrame object, without the listed columns.
    """
    unwanted = {
        "VendorID", "trip_distance", "RatecodeID", "store_and_fwd_flag", "payment_type",
        "fare_amount", "extra", "mta_tax", "improvement_surcharge", "tip_amount",
        "tolls_amount", "total_amount", "congestion_surcharge",
    }

    # Only ask pandas to drop what is actually there, so missing names cannot raise
    present = [name for name in df.columns if name in unwanted]
    df.drop(columns=present, inplace=True)

    return df

In [15]:
def drop_missing_values(df):
    """
    Drop every row that contains a missing value and print how many were removed.

    Parameters:
        df (pd.DataFrame): The DataFrame to clean.

    Returns:
        pd.DataFrame: A DataFrame without the incomplete rows. Only the frame is
        returned; the number of dropped rows is printed, not returned. Input
        that is not a DataFrame is returned unchanged and nothing is printed.
    """
    if not isinstance(df, pd.DataFrame):
        return df

    complete_rows = df.dropna()
    rows_dropped = df.shape[0] - complete_rows.shape[0]
    print(f"Number of rows dropped: {rows_dropped}")
    return complete_rows

In [16]:
def clean_yellow_parquet_files(file_paths, manhattan_zones):
    """
    Clean every yellow-taxi parquet file in turn and combine the results.

    Each file is read, passed through the renaming, type conversion, invalid-row,
    column-dropping and missing-value steps, and written back next to the input
    with "_cleaned" inserted before the file extension.

    Parameters:
        file_paths: Iterable of paths to the monthly yellow-taxi parquet files.
        manhattan_zones: Collection of zone IDs that count as Manhattan.

    Returns:
        pd.DataFrame: All cleaned files concatenated, with a fresh 0..n-1 index.

    Raises:
        ValueError: If file_paths yields no files, so there is nothing to combine.
    """
    cleaned_dfs = []

    for file_path in file_paths:
        print(f"Processing file: {file_path}")

        # Read the parquet file
        df = pd.read_parquet(file_path)

        print("DF Shape OLD", df.shape)

        # Apply the cleaning functions
        df = renaming_yellow_to_standard(df)
        df = convert_float_to_int(df)
        df = drop_yellow_invalid_rows(df, manhattan_zones)
        df = drop_yellow_columns(df)
        df = drop_missing_values(df)

        print("DF Shape NEW", df.shape)

        # Append the cleaned DataFrame to the list
        cleaned_dfs.append(df)

        # Derive the output name from the extension only. A plain str.replace
        # would rewrite ".parquet" anywhere in the path and, for a file with a
        # different extension, leave the name unchanged and overwrite the input.
        stem, extension = os.path.splitext(file_path)
        cleaned_file_path = f"{stem}_cleaned{extension}"
        df.to_parquet(cleaned_file_path)
        print(f"Saved cleaned file: {cleaned_file_path}")

    if not cleaned_dfs:
        # pd.concat would raise the same exception type with a less helpful message
        raise ValueError("No parquet files were provided to clean_yellow_parquet_files")

    # Concatenate all cleaned DataFrames into a single DataFrame
    final_df = pd.concat(cleaned_dfs, ignore_index=True)
    return final_df

In [17]:
def get_parquet_files(data_dir, date_range, prefix="yellow"):
    """
    Collect the raw monthly parquet files for every date in date_range.

    Parameters:
        data_dir (str): Folder that holds the parquet files.
        date_range: Iterable of date-like objects; only strftime('%Y_%m') is used.
        prefix (str): File-name prefix of the dataset to collect. Defaults to
            "yellow", which is what this function always searched for.

    Returns:
        list[str]: Paths matching "<prefix>_<YYYY>_<MM>*.parquet", month by month
        and sorted by name within each month. Files ending in "_cleaned.parquet"
        are left out: they are outputs of the cleaning step, and picking them up
        on a re-run would feed already-cleaned data back into it.
    """
    all_files = []

    for date in date_range:
        search_pattern = os.path.join(data_dir, f"{prefix}_{date.strftime('%Y_%m')}*.parquet")
        # glob gives no ordering guarantee, so sort for a reproducible result
        matches = sorted(glob.glob(search_pattern))
        all_files.extend(path for path in matches if not path.endswith("_cleaned.parquet"))

    return all_files

# Usage example: locate the data folder relative to the notebook's working
# directory (the same way the earlier cells do) instead of hard-coding one
# machine's absolute path.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
date_range = pd.date_range(start="2021-01-01", end="2024-03-31", freq="MS")

file_paths = get_parquet_files(data_dir, date_range)

In [18]:
final_df = clean_yellow_parquet_files(file_paths, manhattan_zones)

# Save the final concatenated DataFrame to a parquet file (optional).
# The target is built from data_dir, which is also how the file is located
# when it is read back, rather than from a machine-specific absolute path.
final_df.to_parquet(os.path.join(data_dir, "yellow_final_cleaned.parquet"))
print("Saved final concatenated DataFrame: yellow_final_cleaned.parquet")

Processing file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_01.parquet
DF Shape OLD (1369769, 19)
Converting float columns to int...
Number of rows dropped: 0
DF Shape NEW (1149499, 5)
Saved cleaned file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_01_cleaned.parquet
Processing file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_02.parquet
DF Shape OLD (1371709, 19)
Converting float columns to int...
Number of rows dropped: 0
DF Shape NEW (1160123, 5)
Saved cleaned file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_02_cleaned.parquet
Processing file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_2021_03.par

In [19]:
# Refresh and show the working directory
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Show the data directory currently in use (set in an earlier cell)
print("Data Directory:", data_dir)

# Location of the combined yellow dataset inside the data directory
yellow_final_cleaned_path = os.path.join(data_dir, "yellow_final_cleaned.parquet")

# Print the constructed file path to verify it
print("yellow_final_cleaned:", yellow_final_cleaned_path)


Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning
Data Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\
yellow_final_cleaned: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\yellow_final_cleaned.parquet


In [20]:
# Load the combined yellow dataset from the path built in the previous cell
yellow_final_cleaned = pd.read_parquet(yellow_final_cleaned_path, engine='pyarrow')

In [22]:
# Row and column counts of the combined yellow dataset
yellow_final_cleaned.shape

(102318739, 5)

In [23]:
# Column dtypes and memory footprint of the combined yellow dataset
yellow_final_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102318739 entries, 0 to 102318738
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   pickup_datetime   datetime64[us]
 1   dropoff_datetime  datetime64[us]
 2   passenger_count   int32         
 3   pickup_zone       int32         
 4   dropoff_zone      int32         
dtypes: datetime64[us](2), int32(3)
memory usage: 2.7 GB


In [24]:
# Preview the first ten cleaned yellow trips
yellow_final_cleaned.head(10)

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_zone,dropoff_zone
0,2021-01-01 00:30:10,2021-01-01 00:36:12,1,142,43
1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,238,151
2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,68,33
3,2021-01-01 00:16:29,2021-01-01 00:24:30,1,224,68
4,2021-01-01 00:12:29,2021-01-01 00:30:34,1,90,40
5,2021-01-01 00:26:12,2021-01-01 00:39:46,2,263,142
6,2021-01-01 00:15:52,2021-01-01 00:38:07,3,164,255
7,2021-01-01 00:10:46,2021-01-01 00:32:58,2,138,166
8,2021-01-01 00:31:06,2021-01-01 00:38:52,5,142,50
9,2021-01-01 00:42:11,2021-01-01 00:44:24,5,50,142


In [25]:
# Rebuild the month-end date range (same call as at the top of the notebook)
# before searching for the green-taxi files.
date_range = pd.date_range(start='2021-01', end='2024-04', freq='ME')
print("Date range:", date_range)

Date range: DatetimeIndex(['2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29', '2024-03-31'],
              dtype='datetime64[ns]', freq='ME')


In [26]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
print(f"Data directory: {data_dir}")

# Show the folder's contents when it is there; otherwise report that it is missing
if os.path.exists(data_dir):
    # The listing makes it easy to confirm file names before searching for them
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")
else:
    print(f"Directory {data_dir} does not exist")

# Reset the accumulator so the green search starts from an empty list
all_files = []


Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets: ['fhvhv_2021_01.parquet', 'fhvhv_2021_02.parquet', 'fhvhv_2021_03.parquet', 'fhvhv_2021_04.parquet', 'fhvhv_2021_05.parquet', 'fhvhv_2021_06.parquet', 'fhvhv_2021_07.parquet', 'fhvhv_2021_08.parquet', 'fhvhv_2021_09.parquet', 'fhvhv_2021_10.parquet', 'fhvhv_2021_11.parquet', 'fhvhv_2021_12.parquet', 'fhvhv_2022_01.parquet', 'fhvhv_2022_02.parquet', 'fhvhv_2022_03.parquet', 'fhvhv_2022_04.parquet', 'fhvhv_2022_05.parquet', 'fhvhv_2022_06.parquet', 'fhvhv_2022_07.parquet', 'fhvhv_2022_08.parquet', 'fhvhv_2022_09.parquet', 'fhvhv_2022_10.parquet', 'fhvhv_2022_11.parquet', 'fhvhv_2022_12.parquet', 'fhvhv_2023_01.parquet', 'fhvhv_2023_02.parquet', 'fhvhv_2023_03.parquet', 'fhvhv_2023_04.parquet', 'fhvhv_2023_05.parquet', 'fhvhv_2023_06.p

In [27]:
# For every month in date_range, collect the green-taxi parquet files whose
# names start with "green_<YYYY>_<MM>" and append them to all_files.
# NOTE(review): the trailing wildcard would also match the "*_cleaned.parquet"
# files written later in this notebook if they already exist in data_dir.
for date in date_range:
    search_pattern = os.path.join(data_dir, f"green_{date.strftime('%Y_%m')}*.parquet")
    print(f"Searching for files with pattern: {search_pattern}")
    files = glob.glob(search_pattern)
    if files:
        print(f"Files found for pattern {search_pattern}: {files}")
    all_files.extend(files)  # Add the found files to the list

# Summary of what the search collected
print("All files found:", all_files)
print("Number of files found:", len(all_files))

Searching for files with pattern: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_01*.parquet
Files found for pattern c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_01*.parquet: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\cleaning\\..\\Datasets\\taxi_parquets\\green_2021_01.parquet']
Searching for files with pattern: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_02*.parquet
Files found for pattern c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_02*.parquet: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\cleaning\\..\\Datasets\\taxi_parquets\\green_2021_02.parquet']
Searching for files with pattern: c:\Users\35385

In [28]:
def renaming_green_to_standard(df):
    """ 
    Rename the green-taxi specific columns of a DataFrame to the standard names
    used by the rest of the cleaning functions.

    The rename is applied in place, so the DataFrame that is passed in is modified.

    Parameters:
        df (pd.DataFrame): Trip records with the green-taxi column names.

    Returns:
        The object that was passed in. A DataFrame comes back with
        'pickup_datetime', 'dropoff_datetime', 'pickup_zone' and 'dropoff_zone'
        columns; any other input is returned untouched after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        # The function takes a single object, not a list, so the warning says so
        print("Warning: The input is not a DataFrame in renaming_green_to_standard")
        # Hand the input back instead of falling through and returning None,
        # which matches what convert_float_to_int does for non-DataFrame input.
        return df

    df.rename(columns={
        'lpep_pickup_datetime': 'pickup_datetime',
        'lpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone'
    }, inplace=True)
    return df

In [29]:
def drop_green_invalid_rows(df, manhattan_zones):
    """
    Remove duplicate and implausible trips from a green-taxi DataFrame.

    After exact duplicates and the "ehail_fee" column are dropped, a row is
    kept only when all of the following hold:
      * passenger_count is greater than 0 and less than 6
      * fare_amount and total_amount are greater than 0
      * extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge and
        congestion_surcharge are not negative
      * pickup_datetime differs from dropoff_datetime
      * trip_distance is greater than 0
      * RatecodeID is one of 1-6
      * payment_type is not 4
      * the pickup zone or the dropoff zone is in manhattan_zones

    No check is made that the timestamps fall in a particular period or that
    the dropoff happens after the pickup.

    Parameters:
        df (pd.DataFrame): Trips using the standard column names.
        manhattan_zones: Collection of zone IDs that count as Manhattan.

    Returns:
        pd.DataFrame: The surviving rows, keeping their original index labels.
        Non-DataFrame input is returned unchanged after a warning is printed.
    """
    if not isinstance(df, pd.DataFrame):
        print("Warning: Input is not a DataFrame drop_green_invalid_rows")
        return df

    trips = df.drop_duplicates()

    # "ehail_fee" is removed because of its missing values
    if "ehail_fee" in trips.columns:
        trips = trips.drop(columns=["ehail_fee"])

    # Build one boolean mask instead of filtering the frame once per rule
    keep = trips["passenger_count"].gt(0) & trips["passenger_count"].lt(6)
    keep &= trips["fare_amount"].gt(0) & trips["total_amount"].gt(0)
    for charge in ("extra", "mta_tax", "tip_amount", "tolls_amount",
                   "improvement_surcharge", "congestion_surcharge"):
        keep &= trips[charge].ge(0)
    keep &= trips["pickup_datetime"].ne(trips["dropoff_datetime"])
    keep &= trips["trip_distance"].gt(0)
    keep &= trips["RatecodeID"].isin([1, 2, 3, 4, 5, 6])
    keep &= trips["payment_type"].ne(4)
    keep &= trips["pickup_zone"].isin(manhattan_zones) | trips["dropoff_zone"].isin(manhattan_zones)

    return trips[keep]

In [30]:
def drop_green_columns(df):
    """
    Remove the fare, vendor, payment and trip-type columns that are not needed
    after cleaning.

    The columns are removed in place from the DataFrame that is passed in;
    names that are not present are ignored.

    Parameters:
        df (pd.DataFrame): Cleaned green-taxi trips.

    Returns:
        pd.DataFrame: The same DataFrame object, without the listed columns.
    """
    unwanted = {
        "VendorID", "trip_distance", "RatecodeID", "store_and_fwd_flag", "payment_type",
        "fare_amount", "extra", "mta_tax", "improvement_surcharge", "tip_amount",
        "tolls_amount", "total_amount", "congestion_surcharge", "trip_type",
    }

    # Only ask pandas to drop what is actually there, so missing names cannot raise
    present = [name for name in df.columns if name in unwanted]
    df.drop(columns=present, inplace=True)

    return df

In [31]:
def clean_green_parquet_files(file_paths, manhattan_zones):
    """
    Clean every green-taxi parquet file in turn and combine the results.

    Each file is read, passed through the renaming, type conversion, invalid-row,
    column-dropping and missing-value steps, and written back next to the input
    with "_cleaned" inserted before the file extension.

    Parameters:
        file_paths: Iterable of paths to the monthly green-taxi parquet files.
        manhattan_zones: Collection of zone IDs that count as Manhattan.

    Returns:
        pd.DataFrame: All cleaned files concatenated, with a fresh 0..n-1 index.

    Raises:
        ValueError: If file_paths yields no files, so there is nothing to combine.
    """
    cleaned_dfs = []

    for file_path in file_paths:
        print(f"Processing file: {file_path}")

        # Read the parquet file
        df = pd.read_parquet(file_path)

        print("DF Shape OLD", df.shape)

        # Apply the cleaning functions
        df = renaming_green_to_standard(df)
        df = convert_float_to_int(df)
        df = drop_green_invalid_rows(df, manhattan_zones)
        df = drop_green_columns(df)
        df = drop_missing_values(df)

        print("DF Shape NEW", df.shape)

        # Append the cleaned DataFrame to the list
        cleaned_dfs.append(df)

        # Derive the output name from the extension only. A plain str.replace
        # would rewrite ".parquet" anywhere in the path and, for a file with a
        # different extension, leave the name unchanged and overwrite the input.
        stem, extension = os.path.splitext(file_path)
        cleaned_file_path = f"{stem}_cleaned{extension}"
        df.to_parquet(cleaned_file_path)
        print(f"Saved cleaned file: {cleaned_file_path}")

    if not cleaned_dfs:
        # pd.concat would raise the same exception type with a less helpful message
        raise ValueError("No parquet files were provided to clean_green_parquet_files")

    # Concatenate all cleaned DataFrames into a single DataFrame
    final_df = pd.concat(cleaned_dfs, ignore_index=True)
    return final_df

In [32]:
def get_parquet_files(data_dir, date_range, prefix="green"):
    """
    Collect the raw monthly parquet files for every date in date_range.

    NOTE: an earlier cell defines a function with the same name for the yellow
    files; running this cell replaces it for the rest of the session.

    Parameters:
        data_dir (str): Folder that holds the parquet files.
        date_range: Iterable of date-like objects; only strftime('%Y_%m') is used.
        prefix (str): File-name prefix of the dataset to collect. Defaults to
            "green", which is what this definition always searched for.

    Returns:
        list[str]: Paths matching "<prefix>_<YYYY>_<MM>*.parquet", month by month
        and sorted by name within each month. Files ending in "_cleaned.parquet"
        are left out: they are outputs of the cleaning step, and picking them up
        on a re-run would feed already-cleaned data back into it.
    """
    all_files = []

    for date in date_range:
        search_pattern = os.path.join(data_dir, f"{prefix}_{date.strftime('%Y_%m')}*.parquet")
        # glob gives no ordering guarantee, so sort for a reproducible result
        matches = sorted(glob.glob(search_pattern))
        all_files.extend(path for path in matches if not path.endswith("_cleaned.parquet"))

    return all_files

# Locate the data folder relative to the notebook's working directory (the same
# way the earlier cells do) instead of hard-coding one machine's absolute path.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
date_range = pd.date_range(start="2021-01-01", end="2024-03-31", freq="MS")

file_paths = get_parquet_files(data_dir, date_range)

In [33]:
final_green_df = clean_green_parquet_files(file_paths, manhattan_zones)

# Save the final concatenated DataFrame to a parquet file (optional).
# The target is built from data_dir, which is also how the file is located
# when it is read back, rather than from a machine-specific absolute path.
final_green_df.to_parquet(os.path.join(data_dir, "green_final_cleaned.parquet"))
print("Saved final concatenated DataFrame: green_final_cleaned.parquet")

Processing file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_01.parquet
DF Shape OLD (76518, 20)
Converting float columns to int...
Number of rows dropped: 0
DF Shape NEW (25598, 5)
Saved cleaned file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_01_cleaned.parquet
Processing file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_02.parquet
DF Shape OLD (64572, 20)
Converting float columns to int...
Number of rows dropped: 0
DF Shape NEW (22740, 5)
Saved cleaned file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_02_cleaned.parquet
Processing file: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2021_03.parquet
DF Shape

In [34]:
# Refresh and show the working directory
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Show the data directory currently in use (set in an earlier cell)
print("Data Directory:", data_dir)

# Location of the combined green dataset inside the data directory
green_final_cleaned_path = os.path.join(data_dir, "green_final_cleaned.parquet")

# Print the constructed file path to verify it
print("green_final_cleaned:", green_final_cleaned_path)


Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning
Data Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\
green_final_cleaned: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_final_cleaned.parquet


In [35]:
# Load the combined green dataset from the path built in the previous cell
green_final_cleaned = pd.read_parquet(green_final_cleaned_path, engine='pyarrow')

In [36]:
# Row and column counts of the combined green dataset
green_final_cleaned.shape

(1364661, 5)

In [37]:
# Column dtypes, non-null counts and memory footprint of the combined green dataset
green_final_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364661 entries, 0 to 1364660
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   pickup_datetime   1364661 non-null  datetime64[us]
 1   dropoff_datetime  1364661 non-null  datetime64[us]
 2   pickup_zone       1364661 non-null  int32         
 3   dropoff_zone      1364661 non-null  int32         
 4   passenger_count   1364661 non-null  int32         
dtypes: datetime64[us](2), int32(3)
memory usage: 36.4 MB


In [38]:
# Preview the first ten cleaned green trips
green_final_cleaned.head(10)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_zone,dropoff_zone,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1
3,2020-12-31 23:57:51,2021-01-01 00:04:56,168,75,1
4,2021-01-01 00:31:14,2021-01-01 00:55:07,244,244,2
5,2021-01-01 00:08:50,2021-01-01 00:21:56,75,213,1
6,2021-01-01 00:35:13,2021-01-01 00:44:44,74,238,1
7,2021-01-01 00:39:57,2021-01-01 00:55:25,74,60,1
8,2021-01-01 00:51:27,2021-01-01 00:57:20,42,41,2
9,2021-01-01 00:32:07,2021-01-01 00:42:54,74,116,1


In [39]:
# Reload both combined datasets from the paths built in earlier cells
yellow_final_cleaned = pd.read_parquet(yellow_final_cleaned_path, engine='pyarrow')
green_final_cleaned = pd.read_parquet(green_final_cleaned_path, engine='pyarrow')


In [40]:
# Compare the schemas of the two datasets (green first, then yellow)
green_final_cleaned.info()
yellow_final_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364661 entries, 0 to 1364660
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   pickup_datetime   1364661 non-null  datetime64[us]
 1   dropoff_datetime  1364661 non-null  datetime64[us]
 2   pickup_zone       1364661 non-null  int32         
 3   dropoff_zone      1364661 non-null  int32         
 4   passenger_count   1364661 non-null  int32         
dtypes: datetime64[us](2), int32(3)
memory usage: 36.4 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102318739 entries, 0 to 102318738
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   pickup_datetime   datetime64[us]
 1   dropoff_datetime  datetime64[us]
 2   passenger_count   int32         
 3   pickup_zone       int32         
 4   dropoff_zone      int32         
dtypes: datetime64[us](2), int32(3)
memory usage: 2.7 GB

In [41]:
# Row and column counts of both datasets (green first, then yellow)
print(green_final_cleaned.shape)
print(yellow_final_cleaned.shape)

(1364661, 5)
(102318739, 5)


In [42]:
# Earliest and latest pickup/dropoff timestamps in green_final_cleaned
green_pickup_earliest = green_final_cleaned['pickup_datetime'].min()
green_pickup_latest = green_final_cleaned['pickup_datetime'].max()
green_dropoff_earliest = green_final_cleaned['dropoff_datetime'].min()
green_dropoff_latest = green_final_cleaned['dropoff_datetime'].max()

print("Green Final Cleaned:")
print(f"Earliest Pickup Date: {green_pickup_earliest}")
print(f"Latest Pickup Date: {green_pickup_latest}")
print(f"Earliest Dropoff Date: {green_dropoff_earliest}")
print(f"Latest Dropoff Date: {green_dropoff_latest}")

# Earliest and latest pickup/dropoff timestamps in yellow_final_cleaned
yellow_pickup_earliest = yellow_final_cleaned['pickup_datetime'].min()
yellow_pickup_latest = yellow_final_cleaned['pickup_datetime'].max()
yellow_dropoff_earliest = yellow_final_cleaned['dropoff_datetime'].min()
yellow_dropoff_latest = yellow_final_cleaned['dropoff_datetime'].max()

# NOTE(review): the saved output of this cell shows timestamps far outside the
# 2021-01 .. 2024-03 files that were loaded (2001, 2008 and 2098), so the
# cleaned data still contains rows with implausible dates.
print("\nYellow Final Cleaned:")
print(f"Earliest Pickup Date: {yellow_pickup_earliest}")
print(f"Latest Pickup Date: {yellow_pickup_latest}")
print(f"Earliest Dropoff Date: {yellow_dropoff_earliest}")
print(f"Latest Dropoff Date: {yellow_dropoff_latest}")


Green Final Cleaned:
Earliest Pickup Date: 2008-12-31 17:04:15
Latest Pickup Date: 2024-04-01 00:01:45
Earliest Dropoff Date: 2008-12-31 17:55:15
Latest Dropoff Date: 2024-04-01 00:17:31

Yellow Final Cleaned:
Earliest Pickup Date: 2001-01-01 00:03:14
Latest Pickup Date: 2098-09-11 02:23:31
Earliest Dropoff Date: 2001-01-01 00:16:31
Latest Dropoff Date: 2098-09-11 02:52:04


In [43]:
# First ten yellow trips again (same preview as shown earlier in the notebook)
yellow_final_cleaned.head(10)

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_zone,dropoff_zone
0,2021-01-01 00:30:10,2021-01-01 00:36:12,1,142,43
1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,238,151
2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,68,33
3,2021-01-01 00:16:29,2021-01-01 00:24:30,1,224,68
4,2021-01-01 00:12:29,2021-01-01 00:30:34,1,90,40
5,2021-01-01 00:26:12,2021-01-01 00:39:46,2,263,142
6,2021-01-01 00:15:52,2021-01-01 00:38:07,3,164,255
7,2021-01-01 00:10:46,2021-01-01 00:32:58,2,138,166
8,2021-01-01 00:31:06,2021-01-01 00:38:52,5,142,50
9,2021-01-01 00:42:11,2021-01-01 00:44:24,5,50,142


In [44]:
# Green schema again (same summary as shown earlier in the notebook)
green_final_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364661 entries, 0 to 1364660
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   pickup_datetime   1364661 non-null  datetime64[us]
 1   dropoff_datetime  1364661 non-null  datetime64[us]
 2   pickup_zone       1364661 non-null  int32         
 3   dropoff_zone      1364661 non-null  int32         
 4   passenger_count   1364661 non-null  int32         
dtypes: datetime64[us](2), int32(3)
memory usage: 36.4 MB


In [45]:
# First ten green trips again (same preview as shown earlier in the notebook)
green_final_cleaned.head(10)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_zone,dropoff_zone,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1
3,2020-12-31 23:57:51,2021-01-01 00:04:56,168,75,1
4,2021-01-01 00:31:14,2021-01-01 00:55:07,244,244,2
5,2021-01-01 00:08:50,2021-01-01 00:21:56,75,213,1
6,2021-01-01 00:35:13,2021-01-01 00:44:44,74,238,1
7,2021-01-01 00:39:57,2021-01-01 00:55:25,74,60,1
8,2021-01-01 00:51:27,2021-01-01 00:57:20,42,41,2
9,2021-01-01 00:32:07,2021-01-01 00:42:54,74,116,1


In [46]:
# Last ten yellow trips, to check where the combined data ends
yellow_final_cleaned.tail(10)

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_zone,dropoff_zone
102318729,2024-03-31 23:43:12,2024-03-31 23:49:31,1,100,48
102318730,2024-03-31 23:47:44,2024-03-31 23:56:10,1,161,48
102318731,2024-03-31 23:06:30,2024-03-31 23:23:03,1,138,237
102318732,2024-03-31 23:29:21,2024-03-31 23:44:47,1,249,140
102318733,2024-03-31 23:58:11,2024-04-01 00:03:23,1,234,249
102318734,2024-03-31 23:21:44,2024-03-31 23:46:33,1,162,162
102318735,2024-03-31 23:17:01,2024-03-31 23:21:24,1,249,234
102318736,2024-03-31 23:04:38,2024-03-31 23:12:39,2,246,230
102318737,2024-03-31 23:09:34,2024-03-31 23:14:58,2,186,249
102318738,2024-03-31 23:44:13,2024-03-31 23:48:30,1,68,48


In [47]:
# Last ten green trips, to check where the combined data ends
green_final_cleaned.tail(10)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_zone,dropoff_zone,passenger_count
1364651,2024-03-31 23:47:18,2024-04-01 00:06:47,36,224,1
1364652,2024-03-31 23:11:46,2024-03-31 23:23:37,74,239,1
1364653,2024-03-31 23:15:22,2024-03-31 23:19:59,74,74,1
1364654,2024-03-31 23:33:39,2024-03-31 23:40:56,74,74,1
1364655,2024-03-31 23:28:54,2024-03-31 23:35:11,74,236,4
1364656,2024-03-31 23:38:13,2024-04-01 00:17:31,130,48,1
1364657,2024-03-31 23:28:04,2024-03-31 23:32:35,74,75,1
1364658,2024-03-31 23:22:56,2024-03-31 23:25:37,41,41,1
1364659,2024-03-31 23:28:58,2024-03-31 23:43:42,74,151,1
1364660,2024-03-31 23:36:25,2024-03-31 23:49:22,74,244,1


In [49]:
def chunk_dataframe(df, chunk_size):
    """Lazily yield consecutive positional row slices of ``df``.

    Each slice holds at most ``chunk_size`` rows; the last one contains whatever
    rows remain and may therefore be shorter. An empty frame yields nothing.
    """
    total_rows = len(df)
    yield from (
        df.iloc[start:start + chunk_size]
        for start in range(0, total_rows, chunk_size)
    )

In [50]:
def process_chunk(chunk):
    """Stack the pickup and dropoff events of ``chunk`` and derive hourly time features.

    Every input row contributes two output rows: one keyed by its pickup
    time/zone and one by its dropoff time/zone, each carrying the row's
    ``passenger_count``. Pickup rows come first, then dropoff rows, and the
    original index labels are kept (so labels repeat).
    """
    column_pairs = (
        ('pickup_datetime', 'pickup_zone'),
        ('dropoff_datetime', 'dropoff_zone'),
    )
    # Bring both trip ends onto the shared column names 'datetime' and 'zone'.
    events = pd.concat([
        chunk[[time_col, 'passenger_count', zone_col]].rename(
            columns={time_col: 'datetime', zone_col: 'zone'}
        )
        for time_col, zone_col in column_pairs
    ])

    # Nearest-hour rounding: timestamps past the half hour land in the following hour.
    hourly = events['datetime'].dt.round('h')

    # All calendar columns are derived from the rounded timestamp.
    return events.assign(
        datetime=hourly,
        datetime_formatted=hourly.dt.strftime('%Y-%m-%d-%H'),
        hour=hourly.dt.hour,
        day_of_week=hourly.dt.dayofweek,
        week=hourly.dt.isocalendar().week,
        month=hourly.dt.month - 1,  # Convert to 0-11 for Jan-Dec
        day_of_month=hourly.dt.day,
        year_month=hourly.dt.to_period('M').astype(str),
    )

In [51]:
def group_data(combined_data):
    """Total ``passenger_count`` for every distinct time-bucket / zone combination.

    Returns a flat frame (default integer index) holding the eight key columns
    followed by the summed ``passenger_count``, one row per combination.
    """
    key_columns = [
        'datetime_formatted',
        'hour',
        'day_of_week',
        'week',
        'month',
        'day_of_month',
        'year_month',
        'zone',
    ]
    buckets = combined_data.groupby(key_columns, as_index=False)
    return buckets['passenger_count'].sum()

In [52]:
def calculate_zone_busy_in_chunks(df, chunk_size=10**6):
    """Aggregate passenger counts per hour bucket and zone, working through ``df`` in chunks.

    Each chunk of at most ``chunk_size`` rows is expanded by ``process_chunk`` and
    reduced by ``group_data``. The per-chunk partial sums are then concatenated and
    grouped once more, because the same (time bucket, zone) key can occur in
    several chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to ``chunk_dataframe`` / ``process_chunk``; it must provide the
        columns ``process_chunk`` selects.
    chunk_size : int, default 10**6
        Maximum number of rows per chunk.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``group_data``: key columns plus summed
        ``passenger_count``.
    """
    if len(df) == 0:
        # An empty frame yields no chunks and pd.concat([]) raises
        # "No objects to concatenate". Run the empty frame through the same
        # pipeline so the caller still gets the usual (empty) result columns.
        # NOTE(review): assumes the empty frame kept its datetime dtypes — confirm.
        return group_data(process_chunk(df))

    partial_sums = [
        group_data(process_chunk(chunk))
        for chunk in chunk_dataframe(df, chunk_size)
    ]

    # Merge the partial sums: identical keys from different chunks are added together.
    return group_data(pd.concat(partial_sums, ignore_index=True))

In [53]:
# Build the per-hour, per-zone passenger totals for the yellow frame (processed in chunks).
# The printed rows below include hour buckets far outside 2021-2024 (e.g. 2001 and 2098);
# a later cell restricts the result to the start_date..end_date window.
yellow_reformat = calculate_zone_busy_in_chunks(yellow_final_cleaned)
print(yellow_reformat)

        datetime_formatted  hour  day_of_week  week  month  day_of_month  \
0            2001-01-01-00     0            0     1      0             1   
1            2001-01-01-00     0            0     1      0             1   
2            2001-01-01-00     0            0     1      0             1   
3            2001-01-01-00     0            0     1      0             1   
4            2001-01-01-00     0            0     1      0             1   
...                    ...   ...          ...   ...    ...           ...   
3288608      2028-12-07-05     5            3    49     11             7   
3288609      2029-05-05-09     9            5    18      4             5   
3288610      2029-05-05-12    12            5    18      4             5   
3288611      2098-09-11-02     2            3    37      8            11   
3288612      2098-09-11-03     3            3    37      8            11   

        year_month  zone  passenger_count  
0          2001-01    43                1  

In [54]:
# Build the per-hour, per-zone passenger totals for the green frame (processed in chunks).
# The printed rows below start with 2008 hour buckets, i.e. outside 2021-2024;
# a later cell restricts the result to the start_date..end_date window.
green_reformat = calculate_zone_busy_in_chunks(green_final_cleaned)
print(green_reformat)

       datetime_formatted  hour  day_of_week  week  month  day_of_month  \
0           2008-12-31-17    17            2     1     11            31   
1           2008-12-31-18    18            2     1     11            31   
2           2008-12-31-19    19            2     1     11            31   
3           2008-12-31-20    20            2     1     11            31   
4           2008-12-31-23    23            2     1     11            31   
...                   ...   ...          ...   ...    ...           ...   
732962      2024-04-01-00     0            0    14      3             1   
732963      2024-04-01-00     0            0    14      3             1   
732964      2024-04-01-00     0            0    14      3             1   
732965      2024-04-01-00     0            0    14      3             1   
732966      2024-04-01-00     0            0    14      3             1   

       year_month  zone  passenger_count  
0         2008-12    74                1  
1         200

In [57]:
start_date = '2021-01-01'
end_date = '2024-04-01'

def filter_dates(df, start=None, end=None):
    """Return the rows of ``df`` whose hour bucket lies within ``[start, end]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime_formatted`` column holding ``'%Y-%m-%d-%H'``
        strings (already-parsed datetime values are accepted as well).
    start, end : str or datetime-like, optional
        Inclusive bounds. When omitted, the module-level ``start_date`` and
        ``end_date`` are used.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the in-range rows, with
        ``datetime_formatted`` converted to datetime values. Column order and
        index labels of the kept rows are unchanged; ``df`` is not modified.
    """
    lower = start_date if start is None else start
    upper = end_date if end is None else end

    # Parse into a separate Series instead of assigning back into df, so the
    # caller's frame keeps its original column.
    timestamps = pd.to_datetime(df['datetime_formatted'], format='%Y-%m-%d-%H')
    in_range = (timestamps >= lower) & (timestamps <= upper)

    # Boolean .loc selection produces a new frame; assign() then swaps in the
    # parsed column at its existing position.
    return df.loc[in_range].assign(datetime_formatted=timestamps[in_range])

In [58]:
# Keep only the yellow hour buckets between start_date and end_date (both ends inclusive);
# filter_dates also parses datetime_formatted from '%Y-%m-%d-%H' strings into datetimes.
yellow_reformat = filter_dates(yellow_reformat)
yellow_reformat

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
882,2021-01-01,0,4,53,0,1,2021-01,4,4
883,2021-01-01,0,4,53,0,1,2021-01,13,3
884,2021-01-01,0,4,53,0,1,2021-01,17,1
885,2021-01-01,0,4,53,0,1,2021-01,24,3
886,2021-01-01,0,4,53,0,1,2021-01,33,2
...,...,...,...,...,...,...,...,...,...
3288526,2024-04-01,0,0,14,3,1,2024-04,261,11
3288527,2024-04-01,0,0,14,3,1,2024-04,262,35
3288528,2024-04-01,0,0,14,3,1,2024-04,263,56
3288529,2024-04-01,0,0,14,3,1,2024-04,264,8


In [59]:
# Keep only the green hour buckets between start_date and end_date (both ends inclusive);
# filter_dates also parses datetime_formatted from '%Y-%m-%d-%H' strings into datetimes.
green_reformat = filter_dates(green_reformat)
green_reformat

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
21,2021-01-01,0,4,53,0,1,2021-01,41,1
22,2021-01-01,0,4,53,0,1,2021-01,42,1
23,2021-01-01,0,4,53,0,1,2021-01,43,1
24,2021-01-01,0,4,53,0,1,2021-01,74,2
25,2021-01-01,0,4,53,0,1,2021-01,75,4
...,...,...,...,...,...,...,...,...,...
732962,2024-04-01,0,0,14,3,1,2024-04,166,5
732963,2024-04-01,0,0,14,3,1,2024-04,224,1
732964,2024-04-01,0,0,14,3,1,2024-04,230,1
732965,2024-04-01,0,0,14,3,1,2024-04,236,5


In [60]:
# Check dtypes and non-null counts of both filtered aggregates (green first, then yellow).
green_reformat.info()
yellow_reformat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 732946 entries, 21 to 732966
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   datetime_formatted  732946 non-null  datetime64[ns]
 1   hour                732946 non-null  int32         
 2   day_of_week         732946 non-null  int32         
 3   week                732946 non-null  UInt32        
 4   month               732946 non-null  int32         
 5   day_of_month        732946 non-null  int32         
 6   year_month          732946 non-null  object        
 7   zone                732946 non-null  int32         
 8   passenger_count     732946 non-null  int32         
dtypes: UInt32(1), datetime64[ns](1), int32(6), object(1)
memory usage: 37.0+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 3287649 entries, 882 to 3288530
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----       

In [61]:
# Row/column counts of the filtered aggregates (green first, then yellow).
print(green_reformat.shape)
print(yellow_reformat.shape)

(732946, 9)
(3287649, 9)


In [62]:
# After filtering, the yellow aggregate should start at the 2021-01-01 00:00 bucket.
yellow_reformat.head(10)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
882,2021-01-01,0,4,53,0,1,2021-01,4,4
883,2021-01-01,0,4,53,0,1,2021-01,13,3
884,2021-01-01,0,4,53,0,1,2021-01,17,1
885,2021-01-01,0,4,53,0,1,2021-01,24,3
886,2021-01-01,0,4,53,0,1,2021-01,33,2
887,2021-01-01,0,4,53,0,1,2021-01,37,1
888,2021-01-01,0,4,53,0,1,2021-01,40,1
889,2021-01-01,0,4,53,0,1,2021-01,41,11
890,2021-01-01,0,4,53,0,1,2021-01,42,1
891,2021-01-01,0,4,53,0,1,2021-01,43,29


In [63]:
# After filtering, the green aggregate should start at the 2021-01-01 00:00 bucket.
green_reformat.head(10)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
21,2021-01-01,0,4,53,0,1,2021-01,41,1
22,2021-01-01,0,4,53,0,1,2021-01,42,1
23,2021-01-01,0,4,53,0,1,2021-01,43,1
24,2021-01-01,0,4,53,0,1,2021-01,74,2
25,2021-01-01,0,4,53,0,1,2021-01,75,4
26,2021-01-01,0,4,53,0,1,2021-01,116,3
27,2021-01-01,0,4,53,0,1,2021-01,151,1
28,2021-01-01,0,4,53,0,1,2021-01,152,1
29,2021-01-01,0,4,53,0,1,2021-01,166,2
30,2021-01-01,0,4,53,0,1,2021-01,168,1


In [64]:
# The last yellow bucket shown is 2024-04-01 00:00 — presumably timestamps around
# midnight that process_chunk's nearest-hour rounding placed in that bucket.
yellow_reformat.tail(10)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
3288521,2024-04-01,0,0,14,3,1,2024-04,249,72
3288522,2024-04-01,0,0,14,3,1,2024-04,255,2
3288523,2024-04-01,0,0,14,3,1,2024-04,256,1
3288524,2024-04-01,0,0,14,3,1,2024-04,257,2
3288525,2024-04-01,0,0,14,3,1,2024-04,260,5
3288526,2024-04-01,0,0,14,3,1,2024-04,261,11
3288527,2024-04-01,0,0,14,3,1,2024-04,262,35
3288528,2024-04-01,0,0,14,3,1,2024-04,263,56
3288529,2024-04-01,0,0,14,3,1,2024-04,264,8
3288530,2024-04-01,0,0,14,3,1,2024-04,265,2


In [65]:
# The last green bucket shown is 2024-04-01 00:00 — presumably timestamps around
# midnight that process_chunk's nearest-hour rounding placed in that bucket.
green_reformat.tail(10)

Unnamed: 0,datetime_formatted,hour,day_of_week,week,month,day_of_month,year_month,zone,passenger_count
732957,2024-04-01,0,0,14,3,1,2024-04,75,3
732958,2024-04-01,0,0,14,3,1,2024-04,88,2
732959,2024-04-01,0,0,14,3,1,2024-04,130,1
732960,2024-04-01,0,0,14,3,1,2024-04,141,1
732961,2024-04-01,0,0,14,3,1,2024-04,151,2
732962,2024-04-01,0,0,14,3,1,2024-04,166,5
732963,2024-04-01,0,0,14,3,1,2024-04,224,1
732964,2024-04-01,0,0,14,3,1,2024-04,230,1
732965,2024-04-01,0,0,14,3,1,2024-04,236,5
732966,2024-04-01,0,0,14,3,1,2024-04,244,1


In [66]:
# Stack the green and yellow aggregates and sum passenger_count wherever both
# frames contain the same (time bucket, zone) key. group_data performs exactly
# this grouping, so the key list is not repeated here.
combined_df = group_data(pd.concat([green_reformat, yellow_reformat]))

# Show the combined DataFrame info to verify. info() prints its report itself
# and returns None, so wrapping it in print() would emit a stray "None".
combined_df.info()

# Print the first few rows of the combined DataFrame to verify
print(combined_df.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3343052 entries, 0 to 3343051
Data columns (total 9 columns):
 #   Column              Dtype         
---  ------              -----         
 0   datetime_formatted  datetime64[ns]
 1   hour                int32         
 2   day_of_week         int32         
 3   week                UInt32        
 4   month               int32         
 5   day_of_month        int32         
 6   year_month          object        
 7   zone                int32         
 8   passenger_count     int32         
dtypes: UInt32(1), datetime64[ns](1), int32(6), object(1)
memory usage: 143.5+ MB
None
  datetime_formatted  hour  day_of_week  week  month  day_of_month year_month  \
0         2021-01-01     0            4    53      0             1    2021-01   
1         2021-01-01     0            4    53      0             1    2021-01   
2         2021-01-01     0            4    53      0             1    2021-01   
3         2021-01-01     0           

In [67]:
# NOTE: this rebinds the notebook-level data_dir, which earlier cells pointed at
# the taxi_parquets folder.
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")

# Create the output folder if it is missing; to_csv does not create directories.
os.makedirs(data_dir, exist_ok=True)

# Define the file path
file_path = os.path.join(data_dir, "combined_df.csv")

# Save the DataFrame to CSV
combined_df.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other\combined_df.csv


In [68]:
# Build the target from the working directory (as the other cells do) instead of
# a machine-specific absolute path, so the notebook runs from any checkout.
# Assumes the kernel's working directory is the notebook's "cleaning" folder.
parquet_path = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets", "combined_df.parquet")
combined_df.to_parquet(parquet_path)
print("Saved final concatenated DataFrame: combined_df.parquet")

Saved final concatenated DataFrame: combined_df.parquet
