"""
File for Cleaning Taxi Data of the Green Taxis.
Begin by loading 1 parquet file as pandas dataframe.
Look at the dataframe, both as a CSV and through Python.
Implement the CRISP-DM data cleaning methodology -> Data Quality Report, Data Quality Plan

"""

In [4]:
import os
import pandas as pd
import glob

In [5]:
# Collect every parquet file stored in the taxi data folder under the
# current working directory, and show what was found.
data_dir = os.path.join(os.getcwd(), "Datasets/taxi_parquets")
parquet_pattern = os.path.join(data_dir, "*.parquet")
all_files = glob.glob(parquet_pattern)
print("All files found:", all_files)

All files found: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-01.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-02.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-03.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-04.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-05.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-06.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-07.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_

In [7]:
# The path below is a fixed literal (not computed from the working directory).
green_2021_01_repo_path = r"/data-analytics/Datasets/taxi_parquets/green_2021-01.parquet"
print("File path for green_2021_01:", green_2021_01_repo_path)

File path for green_2021_01: /data-analytics/Datasets/taxi_parquets/green_2021-01.parquet


In [9]:
""" 
Begin by loading 1 parquet file as a pandas dataframe (this file handles the green taxi data, one of the 4 TLC genres).
Error catching across OSes is implemented by printing the cwd, the data directory and the resulting file path.
"""

cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
data_dir = "Datasets/taxi_parquets"
print("Data Directory:", data_dir)

# Build the full file path with os.path.join so the separators suit the host OS
green_2021_01_path = os.path.join(cwd, data_dir, "green_2021-01.parquet")

# Print the path that was just built, so a wrong location is visible before loading
print("green_2021_01_path:", green_2021_01_path)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics
Data Directory: Datasets/taxi_parquets
green_2021_01_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\green_2021-01.parquet


In [10]:
# Read the green taxi parquet file into a pandas DataFrame via the pyarrow engine.
green_2021_01 = pd.read_parquet(path=green_2021_01_path, engine="pyarrow")

In [12]:
""" 
Save dataframe to CSV, for alternative and efficient analysis
"""

directory_path = os.path.join("Datasets", "taxi_other")

# Show the target directory so a wrong location is easy to spot
print("Directory path to save CSV files:", directory_path)

if os.path.isdir(directory_path):
    # Target directory is present: write the frame out without the index column
    green_file_path = os.path.join(directory_path, "green_2021_01.csv")
    green_2021_01.to_csv(green_file_path, index=False)
else:
    # Stop here rather than attempt a write into a missing directory
    raise OSError(f"Directory does not exist: '{directory_path}'")

Directory path to save CSV files: Datasets\taxi_other


In [13]:
# NOTE(review): this repeats the earlier parquet load; presumably it resets the
# frame before the cleaning steps below - confirm it is still needed.
green_2021_01 = pd.read_parquet(path=green_2021_01_path, engine="pyarrow")

In [None]:
# Preview the first 10 rows of the green taxi frame loaded in the cells above.
green_2021_01.head(10)

In [None]:
# Summarise column dtypes and non-null counts of the green taxi frame.
green_2021_01.info()

In [None]:
def renaming_green_to_standard(dfs):
    """
    Rename green-taxi column names to the standard names used during cleaning.

    Accepts either a single DataFrame or an iterable of DataFrames. Every
    DataFrame is renamed in place and nothing is returned. Items that are
    not DataFrames are skipped after printing a warning.
    """
    standard_names = {
        'lpep_pickup_datetime': 'pickup_datetime',
        'lpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone',
    }
    # A lone DataFrame is wrapped so the loop below handles both input shapes
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue
        frame.rename(columns=standard_names, inplace=True)

# Apply the green-taxi renaming defined above to the loaded green taxi frame.
renaming_green_to_standard(green_2021_01)

In [None]:
def convert_float_to_int(dfs):
    """
    Cast selected columns of a DataFrame, or of each DataFrame in an iterable,
    to integers.

    The columns "RatecodeID" and "passenger_count" are processed when present:
    missing values are filled with 0 and the column is cast with astype("int").
    Every DataFrame is modified in place and nothing is returned. Items that
    are not DataFrames are skipped after printing a warning.
    """
    # A lone DataFrame is wrapped so the loop below handles both input shapes
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue
        for column in ("RatecodeID", "passenger_count"):
            if column in frame.columns:
                # NaN cannot be stored in an integer column, so fill before casting
                frame[column] = frame[column].fillna(0).astype("int")

# Convert the float count/code columns of the loaded green taxi frame to integers.
convert_float_to_int(green_2021_01)

In [None]:
def duplicated_rows(dfs):
    """
    Report duplicated rows in a DataFrame or in each DataFrame of a list.

    For every DataFrame, the rows flagged by DataFrame.duplicated() (called
    with its default arguments) are printed; when none are flagged,
    "No duplicate rows found" is printed instead.
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            # Use the boolean mask from duplicated() to keep only the flagged rows
            duplicate_rows = df[df.duplicated()]
            if not duplicate_rows.empty:
                print("Duplicate rows found:\n", duplicate_rows)
            else:
                print("No duplicate rows found")