"""
File for Cleaning Taxi Data of the Yellow Taxis.
Begin by loading 1 parquet file as pandas dataframe from each of the 4 TLC genres.
Look at each of the dataframes, as a csv and through python
Implement Crisp-DM data cleaning methodology -> Data Quality Report, Data Quality Plan

"""

In [228]:
import os
import pandas as pd
import glob

- TLC = Taxi and Limousine Commission

- Yellow = hail or prearrange

- Green = street hails everywhere except certain Manhattan areas (below 110th St. on the West Side and below 96th St. on the East Side) and the LaGuardia and JFK airports

- FHV (For Hire Vehicles) = Prearranged, Limousines, Black Cars, Livery (Regular), FHVHV

- **FHVHV**/ HVFHV/ HVFHS (For Hire Vehicle High Volume/ High Volume FHV/ High Volume For Hire Service) = "FHV Bases/ Businesses that dispatch more than 10,000 trips per day" = Lyft/ Uber/ Juno/ Via

- See also:
    https://www.nyc.gov/site/tlc/passengers/your-ride.page

In [231]:
# Absolute path of the parquet folder, built from the current working directory
data_dir = os.path.join(os.getcwd(), "Datasets/taxi_parquets")
# Collect every file in that folder whose name ends in ".parquet"
all_files = glob.glob(os.path.join(data_dir, "*.parquet"))
# Show what was found, and how many files, as a sanity check before loading anything
print("All files found:", all_files)
print(len(all_files))

All files found: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_01.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_02.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_03.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_04.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_05.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_06.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021_07.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_

In [232]:
print("File path for yellow_2021_01:", r"/data-analytics/Datasets/taxi_parquets/yellow_2021_01.parquet")

File path for fhvhv_2021_02: /data-analytics/Datasets/taxi_parquets/fhvhv_2021_02.parquet
File path for fhv_2021_02: /data-analytics/Datasets/taxi_parquets/fhv_2021-02_parquet
File path for yellow_2021_01: /data-analytics/Datasets/taxi_parquets/yellow_2021_01.parquet


In [233]:
""" 
Begin by loading 1 parquet file as pandas dataframe from each of the 4 TLC genres.
Error catching across OSes implemented: cwd, data directory, paths etc.
"""

# Remember the current working directory; every data path below is built on top of it
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
data_dir = "Datasets/taxi_parquets"
print("Data Directory:", data_dir)

# Build the full file paths: <cwd>/<data_dir>/<file name>
fhvhv_2021_02_path = os.path.join(cwd, data_dir, "fhvhv_2021_02.parquet")
yellow_2021_01_path = os.path.join(cwd, data_dir, "yellow_2021_01.parquet")

# Print the constructed file paths to verify
print("fhvhv_2021_02_path:", fhvhv_2021_02_path)
print("yellow_2021_01_path:", yellow_2021_01_path)


Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics
Data Directory: Datasets/taxi_parquets
fhvhv_2021_02_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\fhvhv_2021_02.parquet
fhv_2021_02_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\fhv_2021_02.parquet
yellow_2021_01_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\yellow_2021_01.parquet


In [212]:
# Read the parquet files using the relative file paths
yellow_2021_01 = pd.read_parquet(yellow_2021_01_path, engine='pyarrow')

In [6]:
""" 
Save dataframes to CSVs, for alternative and efficient analysis
Similar error catching as above
(Runtime = ~2 mins)
"""

# Target folder for the CSV exports (relative to the current working directory)
directory_path = os.path.join("Datasets", "taxi_other")

# Print target directory path (error catching)
print("Directory path to save CSV files:", directory_path)

# Verify the directory exists; fail early instead of failing inside to_csv
if not os.path.isdir(directory_path):
    raise OSError(f"Directory does not exist: '{directory_path}'")

# Define file paths for each CSV
yellow_file_path = os.path.join(directory_path, "yellow_2021_01.csv")
# NOTE(review): fhv_file_path is not used anywhere in the visible cells
fhv_file_path = os.path.join(directory_path, "fhv_2021_02.csv")
fhvhv_file_path = os.path.join(directory_path, "fhvhv_2021_02.csv")

# Save each dataframe to its respective CSV file (row index is not written)
yellow_2021_01.to_csv(yellow_file_path, index=False)
# NOTE(review): fhvhv_2021_02 is not assigned in any visible cell - confirm it is
# loaded elsewhere, otherwise this line raises NameError on a fresh kernel
fhvhv_2021_02.to_csv(fhvhv_file_path, index=False)

Directory path to save CSV files: Datasets\taxi_other


In [213]:
yellow_2021_01 = pd.read_parquet(yellow_2021_01_path, engine='pyarrow')

**Investigating Yellow Taxi Datasets**

In [214]:
yellow_2021_01.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.6,1.0,N,224,68,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,
6,1,2021-01-01 00:00:28,2021-01-01 00:17:28,1.0,4.1,1.0,N,95,157,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3,0.0,
7,1,2021-01-01 00:12:29,2021-01-01 00:30:34,1.0,5.7,1.0,N,90,40,2,18.0,3.0,0.5,0.0,0.0,0.3,21.8,2.5,
8,1,2021-01-01 00:39:16,2021-01-01 01:00:13,1.0,9.1,1.0,N,97,129,4,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0,
9,1,2021-01-01 00:26:12,2021-01-01 00:39:46,2.0,2.7,1.0,N,263,142,1,12.0,3.0,0.5,3.15,0.0,0.3,18.95,2.5,


In [215]:
yellow_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369769 entries, 0 to 1369768
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1369769 non-null  int64         
 1   tpep_pickup_datetime   1369769 non-null  datetime64[us]
 2   tpep_dropoff_datetime  1369769 non-null  datetime64[us]
 3   passenger_count        1271417 non-null  float64       
 4   trip_distance          1369769 non-null  float64       
 5   RatecodeID             1271417 non-null  float64       
 6   store_and_fwd_flag     1271417 non-null  object        
 7   PULocationID           1369769 non-null  int64         
 8   DOLocationID           1369769 non-null  int64         
 9   payment_type           1369769 non-null  int64         
 10  fare_amount            1369769 non-null  float64       
 11  extra                  1369769 non-null  float64       
 12  mta_tax                13697

In [216]:
def renaming_yellow_to_standard(dfs):
    """
    Rename the yellow-taxi specific columns to the standard names used by the cleaning steps.

    Accepts a single DataFrame or a list of DataFrames. Every DataFrame is renamed
    in place and nothing is returned. A warning is printed for each list element
    that is not a DataFrame.
    """
    standard_names = {
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone',
    }
    # A lone DataFrame is handled as a one-element list
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue
        frame.rename(columns=standard_names, inplace=True)

renaming_yellow_to_standard(yellow_2021_01)

In [217]:
def convert_float_to_int(dfs):
    """
    Cast the "RatecodeID" and "passenger_count" columns to integers, in place.

    Accepts a single DataFrame or a list of DataFrames. Missing values in those
    two columns are replaced by 0 before the cast. Columns that are absent are
    skipped, and a warning is printed for each element that is not a DataFrame.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue
        for column in ("RatecodeID", "passenger_count"):
            if column in frame.columns:
                # NaN cannot be stored in an integer column, so fill with 0 first
                frame[column] = frame[column].fillna(0).astype("int")

convert_float_to_int(yellow_2021_01)

In [201]:
def duplicated_rows(dfs):
    """
    Report rows that are exact duplicates of an earlier row.

    Accepts a single DataFrame or a list of DataFrames and prints, per DataFrame,
    either the duplicated rows or a message that none were found. Elements that
    are not DataFrames are skipped without any message.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            continue
        repeats = frame[frame.duplicated()]
        if repeats.empty:
            print("No duplicate rows found")
        else:
            print("Duplicate rows found:\n", repeats)
duplicated_rows(yellow_2021_01)


No duplicate rows found


In [202]:
def passenger_counts(dfs):
    """
    Print how often each value occurs in the 'passenger_count' column.

    Accepts a single DataFrame or a list of DataFrames. Nothing is printed for a
    DataFrame whose value counts are empty, and elements that are not DataFrames
    are skipped without any message.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            continue
        tally = frame['passenger_count'].value_counts()
        if tally.empty:
            continue
        print("Count for each unique value in 'passenger_count' column: in")
        print(tally)

passenger_counts(yellow_2021_01)

Count for each unique value in 'passenger_count' column: in
passenger_count
1    966236
2    161671
0    125078
3     43935
5     31089
6     25362
4     16391
7         5
8         2
Name: count, dtype: int64


In [203]:
def count_invalid_fares(dfs):
    """
    Count rows where any fare-related column has an invalid value and print a sample.

    A row is invalid when 'fare_amount' or 'total_amount' is <= 0 (a valid trip
    needs a fare), or when any of the surcharge/extra columns is < 0 (0 is valid
    there). Fare columns that are missing from a DataFrame (for example
    'airport_fee' after it has been dropped) are ignored instead of raising KeyError.

    Accepts a single DataFrame or a list of DataFrames. For each DataFrame the
    number of invalid rows is printed, followed by up to 5 sample rows. Nothing
    is returned. A warning is printed for each element that is not a DataFrame.
    """
    # Columns to check for invalid fare amounts, <= 0; need a fare for a valid trip
    fare_columns = ['fare_amount', 'total_amount']

    # Additional columns to check for negative values, < 0 as "0" is a valid value
    additional_fare_columns = ['extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                               'improvement_surcharge', 'congestion_surcharge', 'airport_fee']

    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame

    for df in dfs:
        if not isinstance(df, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue

        # Only test the columns this DataFrame actually has
        strict_columns = [col for col in fare_columns if col in df.columns]
        non_negative_columns = [col for col in additional_fare_columns if col in df.columns]

        # Build the mask once and reuse it for both the count and the sample
        invalid_mask = (
            (df[strict_columns] <= 0).any(axis=1)
            | (df[non_negative_columns] < 0).any(axis=1)
        )
        invalid_fare_rows = df[invalid_mask]
        print("Number of rows with invalid fares: ", invalid_fare_rows.shape[0])

        if invalid_fare_rows.empty:
            print("No rows with invalid fares to sample.")
            continue

        # sample() raises ValueError when n exceeds the number of rows, so cap it
        sample_size = min(5, len(invalid_fare_rows))
        sample_rows = invalid_fare_rows.sample(n=sample_size, random_state=42)
        print(f"{sample_size} Sample Rows with Invalid Fares:")
        print(sample_rows)

count_invalid_fares(yellow_2021_01)


Number of rows with invalid fares:  7417
5 Sample Rows with Invalid Fares:
        VendorID     pickup_datetime    dropoff_datetime  passenger_count  \
961032         2 2021-01-24 19:24:40 2021-01-24 19:28:15                1   
791413         2 2021-01-20 19:20:59 2021-01-20 19:24:32                2   
699436         1 2021-01-18 19:26:01 2021-01-18 19:27:13                1   
288947         2 2021-01-08 17:58:16 2021-01-08 18:10:21                1   
107851         2 2021-01-04 16:59:01 2021-01-04 17:04:26                1   

        trip_distance  RatecodeID store_and_fwd_flag  pickup_zone  \
961032           0.90           1                  N          186   
791413           0.51           1                  N          162   
699436           0.10           5                  N          236   
288947           1.40           1                  N          264   
107851           1.44           1                  N          148   

        dropoff_zone  payment_type  fare_amount

In [58]:
def time_travel(dfs):
    """
    Report trips whose pickup time is later than their dropoff time.

    Accepts a single DataFrame or a list of DataFrames. For each DataFrame the
    number of such rows is printed, followed by the first offending row (or a
    message that there are none). A warning is printed for each element that is
    not a DataFrame.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue

        # Rows where the trip ends before it starts
        backwards_rows = frame[frame['pickup_datetime'] > frame['dropoff_datetime']]
        print("Did anyone Time Travel?")
        print(backwards_rows.shape[0])

        if backwards_rows.empty:
            print("No rows where time travel occurred.")
        else:
            print("One of the rows where time travel occurred:")
            print(backwards_rows.iloc[0])

time_travel(yellow_2021_01)

Did anyone Time Travel?
5642
One of the rows where time travel occurred:
VendorID                                   1
pickup_datetime          2021-01-05 17:43:19
dropoff_datetime         2021-01-05 17:39:06
passenger_count                            1
trip_distance                            0.0
RatecodeID                                 1
store_and_fwd_flag                         N
pickup_zone                              145
dropoff_zone                             145
payment_type                               2
fare_amount                              4.0
extra                                    1.0
mta_tax                                  0.5
tip_amount                               0.0
tolls_amount                             0.0
improvement_surcharge                    0.3
total_amount                             5.8
congestion_surcharge                     0.0
airport_fee                              NaN
Name: 151936, dtype: object


In [59]:
def immovable_objects(dfs):
    """
    Report trips whose 'trip_distance' is zero or negative.

    Accepts a single DataFrame or a list of DataFrames. For each DataFrame the
    number of such rows is printed, followed by the first offending row (or a
    message that there are none). Nothing is returned. A warning is printed for
    each element that is not a DataFrame.
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if not isinstance(df, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue

        # Find rows where no (or a negative) distance was travelled; filter once
        negative_distances_rows = df[df['trip_distance'] <= 0]
        print("Travelling no distances?")
        print(negative_distances_rows.shape[0])

        if not negative_distances_rows.empty:
            print("One of the rows of the immovable objects:")
            print(negative_distances_rows.iloc[0])  # change this number for other examples
        else:
            # The original message was copied from the time-travel check and was misleading
            print("No rows with zero or negative trip distance.")

immovable_objects(yellow_2021_01)

Travelling no distances?
19952
One of the rows of the immovable objects:
VendorID                                   1
pickup_datetime          2021-01-01 00:03:13
dropoff_datetime         2021-01-01 00:03:19
passenger_count                            1
trip_distance                            0.0
RatecodeID                                 1
store_and_fwd_flag                         N
pickup_zone                              169
dropoff_zone                             169
payment_type                               3
fare_amount                              0.0
extra                                    0.0
mta_tax                                  0.0
tip_amount                               0.0
tolls_amount                             0.0
improvement_surcharge                    0.0
total_amount                             0.0
congestion_surcharge                     0.0
airport_fee                              NaN
Name: 38, dtype: object


In [60]:
def check_dispute_payments(dfs):
    """
    Count rows where payment_type is 4 (Dispute) and print sample rows.

    Accepts a single DataFrame or a list of DataFrames. For each DataFrame the
    number of disputed rows is printed, followed by up to 5 sample rows (fewer
    when fewer exist, none when there are no disputes). Nothing is returned.
    A warning is printed for each element that is not a DataFrame.
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame

    for df in dfs:
        if not isinstance(df, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue

        # Find rows where payment_type is 4 (Dispute); filter once and reuse
        dispute_rows = df[df['payment_type'] == 4]
        print("Number of rows with payment_type 4 (Dispute):", dispute_rows.shape[0])

        if dispute_rows.empty:
            print("No rows with payment_type 4 (Dispute) to sample.")
            continue

        # sample() raises ValueError when n exceeds the number of rows, so cap it
        sample_size = min(5, len(dispute_rows))
        sample_rows = dispute_rows.sample(n=sample_size, random_state=42)
        print(f"{sample_size} Sample Rows with payment_type 4 (Dispute):")
        print(sample_rows)

check_dispute_payments(yellow_2021_01)

Number of rows with payment_type 4 (Dispute): 5667
5 Sample Rows with payment_type 4 (Dispute):
         VendorID     pickup_datetime    dropoff_datetime  passenger_count  \
1047827         1 2021-01-26 19:22:23 2021-01-26 19:23:29                1   
964262          2 2021-01-24 22:25:40 2021-01-24 22:32:23                1   
1102473         1 2021-01-28 03:27:12 2021-01-28 03:30:50                1   
995123          1 2021-01-25 16:32:28 2021-01-25 16:36:57                1   
900844          2 2021-01-23 09:14:12 2021-01-23 09:45:20                2   

         trip_distance  RatecodeID store_and_fwd_flag  pickup_zone  \
1047827           2.10           1                  N          262   
964262            0.99           1                  N          100   
1102473           0.50           1                  N          239   
995123            0.80           1                  N          234   
900844            2.39           1                  N          263   

         dropo

In [61]:
def check_airport_fee_values(dfs):
    """
    Print every distinct value of the 'airport_fee' column together with its count.

    Accepts a single DataFrame or a list of DataFrames. Elements that are not
    DataFrames are skipped without any message.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            continue
        fee_tally = frame['airport_fee'].value_counts()
        print("Unique values in the 'airport_fee' column and their counts:")
        print(fee_tally)

check_airport_fee_values(yellow_2021_01)


Unique values in the 'airport_fee' column and their counts:
airport_fee
0.0    5
Name: count, dtype: int64


If the trip would result in the driver having to operate the taxicab for more than 12 consecutive hours, which is prohibited, then the driver may refuse to take a passenger to these destinations.

In [125]:
# Show the working directory that the CSV path below is built on
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
taxi_zone_dir = "Datasets/taxi_other"
print("Taxi Zone CSV Directory:", taxi_zone_dir)

# Build the full file path: <cwd>/<taxi_zone_dir>/taxi_zone_lookup.csv
taxi_zone_path = os.path.join(cwd, taxi_zone_dir, "taxi_zone_lookup.csv")

# Load the zone lookup table; the file is read with the Windows-1252 encoding
taxi_zone = pd.read_csv(taxi_zone_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding="Windows-1252")

def valid_zones_1(df):
    """
    Print the number and the list of distinct LocationID values whose "Borough" is "Manhattan".
    """
    in_manhattan = df["Borough"] == "Manhattan"
    zone_ids = df[in_manhattan]["LocationID"].unique()

    print(f"Number of Unique Zones: {len(zone_ids)}")
    print("List of Unique Zones:", zone_ids)

valid_zones_1(taxi_zone)
    

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics
Taxi Zone CSV Directory: Datasets/taxi_other
Number of Unique Zones: 69
List of Unique Zones: [  4  12  13  24  41  42  43  45  48  50  68  74  75  79  87  88  90 100
 103 104 105 107 113 114 116 120 125 127 128 137 140 141 142 143 144 148
 151 152 153 158 161 162 163 164 166 170 186 194 202 209 211 224 229 230
 231 232 233 234 236 237 238 239 243 244 246 249 261 262 263]


In [63]:
# Show the working directory that the CSV path below is built on
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
taxi_zone_dir = "Datasets/taxi_other"
print("Taxi Zone CSV Directory:", taxi_zone_dir)

# Build the full file path: <cwd>/<taxi_zone_dir>/taxi_zones_alternate.csv
taxi_zone_alternate_path = os.path.join(cwd, taxi_zone_dir, "taxi_zones_alternate.csv")

# Load the alternate zone table; the file is read with the Windows-1252 encoding
taxi_zone_alternate = pd.read_csv(taxi_zone_alternate_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding="Windows-1252")

def valid_zones_alternate(df):
    """
    Print the number and the list of distinct LocationID values whose "borough" is "Manhattan".

    Same report as for the main lookup table, but this file spells the column
    name in lower case ("borough" instead of "Borough").
    """
    in_manhattan = df["borough"] == "Manhattan"
    zone_ids = df[in_manhattan]["LocationID"].unique()

    print(f"Number of Unique Zones: {len(zone_ids)}")
    print("List of Unique Zones:", zone_ids)

valid_zones_alternate(taxi_zone_alternate)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics
Taxi Zone CSV Directory: Datasets/taxi_other
Number of Unique Zones: 67
List of Unique Zones: [  4  24  12  13  41  45  42  43  48  50  68  79  74  75  87  88  90 125
 100 103 107 113 114 116 120 127 128 151 140 137 141 142 152 143 144 148
 153 158 161 162 163 164 170 166 186 194 202 209 211 224 229 230 231 239
 232 233 234 236 237 238 263 243 244 246 249 261 262]


In [64]:
# Refresh and show the current working directory
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Load both zone CSV files again, using the paths built in the earlier cells,
# so the two tables can be compared side by side
taxi_zone1 = pd.read_csv(taxi_zone_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding="Windows-1252")
taxi_zone2 = pd.read_csv(taxi_zone_alternate_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding="Windows-1252")

# Define a function to get unique zones for Manhattan
def get_unique_zones(df, borough_col):
    """
    Return the set of LocationID values of all rows whose `borough_col` equals "Manhattan".

    `borough_col` is the name of the borough column, which differs between the two CSV files.
    """
    in_manhattan = df[borough_col] == "Manhattan"
    return set(df[in_manhattan]["LocationID"].unique())

# Get unique zones for Manhattan from both CSVs (the borough column name differs per file)
unique_zones1 = get_unique_zones(taxi_zone1, "Borough")
unique_zones2 = get_unique_zones(taxi_zone2, "borough")

# Print the unique zones and their counts
print(f"Number of Unique Zones in CSV 1: {len(unique_zones1)}")
print(f"List of Unique Zones in CSV 1: {sorted(unique_zones1)}")

print(f"Number of Unique Zones in CSV 2: {len(unique_zones2)}")
print(f"List of Unique Zones in CSV 2: {sorted(unique_zones2)}")

# Identify the differences: set difference in each direction
zones_only_in_csv1 = unique_zones1 - unique_zones2
zones_only_in_csv2 = unique_zones2 - unique_zones1

print(f"Zones only in CSV 1: {sorted(zones_only_in_csv1)}")
print(f"Zones only in CSV 2: {sorted(zones_only_in_csv2)}")

# Repeat each difference, but only when it is non-empty
# NOTE(review): this repeats the two prints above (as unsorted sets) - possibly redundant
if zones_only_in_csv1:
    print(f"Zones present only in the first CSV: {zones_only_in_csv1}")

if zones_only_in_csv2:
    print(f"Zones present only in the second CSV: {zones_only_in_csv2}")

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics
Number of Unique Zones in CSV 1: 69
List of Unique Zones in CSV 1: [4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100, 103, 104, 105, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 262, 263]
Number of Unique Zones in CSV 2: 67
List of Unique Zones in CSV 2: [4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100, 103, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 262, 263]
Zones only in CSV 1: [104, 105]
Zones only in CSV 2: []
Zones present only in the first CSV: {104, 105}


Use CSV 1, "taxi_zone_lookup.csv". It has 2 zones not in "taxi_zones_alternate.csv", and these zones are present at least in the fhv parquets.

In [184]:
# Define a function to get unique zones for Manhattan
def get_manhattan_zones(df):
    """
    Return the set of LocationID values of all rows whose "Borough" equals "Manhattan".
    """
    in_manhattan = df["Borough"] == "Manhattan"
    return set(df[in_manhattan]["LocationID"].unique())

# Get the unique Manhattan zones from the taxi_zone DataFrame
manhattan_zones = get_manhattan_zones(taxi_zone)

In [218]:
# Define the function to check pickup and dropoff zones
def check_zones(df, manhattan_zones):
    """
    Report trips that neither start nor end in one of the given Manhattan zones.

    Prints the number of such rows and, when there are any, the first few of them.
    """
    starts_in_manhattan = df["pickup_zone"].isin(manhattan_zones)
    ends_in_manhattan = df["dropoff_zone"].isin(manhattan_zones)
    # A row is invalid only when neither end of the trip touches Manhattan
    outside_rows = df[~(starts_in_manhattan | ends_in_manhattan)]

    print(f"Invalid zones count: {outside_rows.shape[0]}")

    if outside_rows.empty:
        return
    print("Examples of rows with invalid zones:")
    print(outside_rows.head())  # Print first few invalid rows

In [219]:
check_zones(yellow_2021_01, manhattan_zones)


Invalid zones count: 96421
Examples of rows with invalid zones:
    VendorID     pickup_datetime    dropoff_datetime  passenger_count  \
2          1 2021-01-01 00:43:30 2021-01-01 01:11:06                1   
3          1 2021-01-01 00:15:48 2021-01-01 00:31:01                0   
6          1 2021-01-01 00:00:28 2021-01-01 00:17:28                1   
8          1 2021-01-01 00:39:16 2021-01-01 01:00:13                1   
11         2 2021-01-01 00:46:36 2021-01-01 00:53:45                2   

    trip_distance  RatecodeID store_and_fwd_flag  pickup_zone  dropoff_zone  \
2           14.70           1                  N          132           165   
3           10.60           1                  N          138           132   
6            4.10           1                  N           95           157   
8            9.10           1                  N           97           129   
11           1.21           1                  N          255            80   

    payment_type  fare

In [167]:
def verifying_invalid_zones(df, manhattan_zones, taxi_zone_csv):
    """
    Print, per pickup/dropoff borough combination, how many trips do not touch Manhattan.

    Rows of `df` whose pickup_zone and dropoff_zone are both outside
    `manhattan_zones` are looked up in `taxi_zone_csv` (columns "LocationID" and
    "Borough") and counted per "<borough> (pickup), <borough> (dropoff)" label.
    Prints a message and returns early when there are no such rows.
    """
    pickup_outside = ~df["pickup_zone"].isin(manhattan_zones)
    dropoff_outside = ~df["dropoff_zone"].isin(manhattan_zones)
    outside_rows = df[pickup_outside & dropoff_outside]

    if outside_rows.empty:
        print("No invalid zones found.")
        return

    # Two left joins against the lookup table: first on the pickup zone, then on
    # the dropoff zone. In the second join the lookup columns already present
    # from the first join receive the '_pickup' suffix and the newly joined
    # ones receive '_dropoff'.
    lookup_options = {"right_on": "LocationID", "how": "left", "suffixes": ('_pickup', '_dropoff')}
    with_pickup = outside_rows.merge(taxi_zone_csv, left_on="pickup_zone", **lookup_options)
    with_both = with_pickup.merge(taxi_zone_csv, left_on="dropoff_zone", **lookup_options)

    # Build one label per row out of the two borough names
    pickup_label = with_both["Borough_pickup"] + " (pickup), "
    dropoff_label = with_both["Borough_dropoff"] + " (dropoff)"
    with_both["Borough_combined"] = pickup_label + dropoff_label

    # Count how often every borough combination occurs
    borough_tally = with_both["Borough_combined"].value_counts()

    print("Invalid Rows Borough Counts:")
    print(borough_tally)

manhattan_zones = get_manhattan_zones(taxi_zone)

# Call the verifying_invalid_zones function
verifying_invalid_zones(yellow_2021_01, manhattan_zones, taxi_zone)

Invalid Rows Borough Counts:
Borough_combined
Queens (pickup), Queens (dropoff)                  27520
Brooklyn (pickup), Brooklyn (dropoff)              18404
Queens (pickup), Brooklyn (dropoff)                14348
Bronx (pickup), Bronx (dropoff)                     8570
Unknown (pickup), Unknown (dropoff)                 5555
Queens (pickup), Bronx (dropoff)                    3768
Brooklyn (pickup), Queens (dropoff)                 2600
Bronx (pickup), Queens (dropoff)                    1092
Brooklyn (pickup), Bronx (dropoff)                  1082
Bronx (pickup), Brooklyn (dropoff)                  1030
Unknown (pickup), Queens (dropoff)                   383
Queens (pickup), Unknown (dropoff)                   357
Unknown (pickup), Brooklyn (dropoff)                 337
Queens (pickup), Staten Island (dropoff)             249
Brooklyn (pickup), Staten Island (dropoff)           174
Staten Island (pickup), Queens (dropoff)             106
Staten Island (pickup), Brooklyn (dropoff)

- **Data Quality Report**
- **Data Integrity Checks** 

Having investigated the CSVs, a number of data inconsistencies are present. They are summarised below:

- Check 0: No duplicate rows present

- Check 1: There are a number of trips with 0 passengers, as well as trips with more than 5 passengers. According to the TLC data dictionary, 5 is the maximum number of passengers allowed. 0 might represent trips that didn't occur or were cancelled, or serve another purpose. As our goal is to track Busy-ness in NYC, values of 0, 6, 7 and 8 cannot be counted as valid.
    - Drop rows from dataset where "passenger_count" == 0, 6, 7, 8.

- Check 2: There are a number of trips with negative fare amounts. Negative fares could represent refunds or errors in the data entry, and are not likely to represent actual trips where passengers were transported. Since our goal is to track busyness in NYC, negative fare amounts cannot be considered valid data points.
    - Drop rows from the dataset where "fare_amount" or "total_amount" <= 0 (0 is not a valid value)
    - Drop rows from the dataset where "extra", "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge", "congestion_surcharge", "airport_fee" < 0 ("0" is a valid value).
    
- Check 3: There are a number of trips where the pickup time is exactly the same as the dropoff time. This means no time has elapsed for the trip, which is suspicious and likely indicates invalid data points. There are also rows where the pickup time is later in time than the dropoff time (implying time travel). Further checks reveal additional inconsistencies:
    - Further Checks:
        -> The trip_distance is variable, ranging from 0 miles to over 13 miles, which is implausible for zero or negative elapsed time.
        -> The total_amount is also variable, with fares ranging from $6 to over $48, despite zero minutes of travel time.
        -> These data points are difficult to trust and it is hard to believe that they represent valid trips.
            - Drop rows from the dataset where pickup_datetime == dropoff_datetime.

- Check 4: There are some journeys with 0 trip distance, potentially indicating cancelled trips, or invalid ones. These rows do not represent busy-ness, as it is difficult to say that a trip occurred at all. Confusingly, these trips show a variation in total_amount from (-492.8 to +900.35).
    - Drop rows from the dataset where "trip_distance" <= 0.
 
- Check 5: According to the Yellow Taxi Data Dictionary (appended at bottom), RatecodeID, the final rate code in effect at the end of the trip, should have a value of 1-6. There are a number of rows with a value of "99". Some of these rows have varied and valid datetimes, total_amount, passenger_count, and trip_distance. It is unclear what code "99" means, but according to the Data Dictionary, only rows with values of 1-6 are valid.
    - Drop rows from the dataset where RatecodeID is not in the range 1-6.

- Check 6: Disputed payments (payment_type == 4) often indicate issues with the fare. These might be unpaid, mispaid, or late charges. Disputes may suggest problems like incorrect dropoff locations or other inaccuracies in the journey data. The validity of these trips is questionable, as they may not accurately represent completed journeys.
    - Drop rows from the dataset where payment_type == 4.

- Check 7: The "airport_fee" contained only 5 entries with value 0, and the rest of the entries were NaN. This fee might be infrequently applied or waived altogether in the majority of recorded taxi journeys. 
    - Drop "airport_fee" column from dataset due to missing values.

- Check 8: According to the accompanying "taxi_zone_lookup.csv" file, at least one of pickup_zone and dropoff_zone should be a Manhattan zone. Rows where neither pickup_zone nor dropoff_zone is a Manhattan zone are considered invalid.
    - Drop rows from the dataset where both pickup_zone and dropoff_zone are not Manhattan zones.

See also: 
    https://rules.cityofnewyork.us/rule/taximeter-rate-of-fare-and-various-surcharges/
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf
    https://www.nyc.gov/site/tlc/passengers/passenger-frequently-asked-questions.page 
    https://www.nyc.gov/site/tlc/passengers/taxi-fare.page 
    https://www.nyc.gov/assets/tlc/downloads/pdf/archived_public_notices/public_notice_09_17_09.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf 
    https://data.cityofnewyork.us/City-Government/NTA-map/d3qk-pfyz
    

In [220]:
def drop_yellow_invalid_rows(dfs, manhattan_zones, taxi_zone_csv):
    """
    Remove rows that fail the yellow-taxi data quality checks.

    Parameters:
        dfs (pd.DataFrame or list): One DataFrame, or a list of DataFrames, to clean.
        manhattan_zones: Zone values accepted by ``Series.isin``; a row is kept
            when its pickup_zone or its dropoff_zone is one of them.
        taxi_zone_csv: Not used inside this function; kept in the signature for
            existing callers.

    Returns:
        pd.DataFrame or list: The cleaned DataFrame when exactly one was cleaned,
        otherwise the list of cleaned DataFrames.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs

    # Charges that may be zero but must never be negative
    non_negative_charges = ("extra", "mta_tax", "tip_amount", "tolls_amount",
                            "improvement_surcharge", "congestion_surcharge")
    valid_rate_codes = [1, 2, 3, 4, 5, 6]

    results = []
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            print("Warning: The list contains non-DataFrame elements")
            continue

        # Exact duplicates go first, then the mostly-empty "airport_fee" column
        frame = frame.drop_duplicates()
        if "airport_fee" in frame.columns:
            frame = frame.drop(columns=["airport_fee"])

        # Every check is row-wise, so they are folded into a single boolean mask
        # passenger_count must be strictly between 0 and 6
        keep = (frame["passenger_count"] > 0) & (frame["passenger_count"] < 6)

        # fare_amount and total_amount must be positive
        keep &= (frame["fare_amount"] > 0) & (frame["total_amount"] > 0)

        for charge in non_negative_charges:
            keep &= frame[charge] >= 0

        # A trip needs a non-zero duration and a positive distance
        keep &= frame["pickup_datetime"] != frame["dropoff_datetime"]
        keep &= frame["trip_distance"] > 0

        # Only rate codes 1-6 are accepted; disputed payments (type 4) are removed
        keep &= frame["RatecodeID"].isin(valid_rate_codes)
        keep &= frame["payment_type"] != 4

        # At least one end of the trip has to be a Manhattan zone
        starts_in_manhattan = frame["pickup_zone"].isin(manhattan_zones)
        ends_in_manhattan = frame["dropoff_zone"].isin(manhattan_zones)
        keep &= starts_in_manhattan | ends_in_manhattan

        results.append(frame[keep])

    # A lone result is unwrapped; anything else is returned as a list
    return results[0] if len(results) == 1 else results

In [221]:
# Zone values to treat as Manhattan (presumably derived from the taxi zone lookup table; confirm in get_manhattan_zones)
manhattan_zones = get_manhattan_zones(taxi_zone)

# Apply the row-level validity filters of drop_yellow_invalid_rows and rebind the name to the cleaned frame
yellow_2021_01 = drop_yellow_invalid_rows(yellow_2021_01, manhattan_zones, taxi_zone)


In [222]:
yellow_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1149499 entries, 0 to 1271416
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1149499 non-null  int64         
 1   pickup_datetime        1149499 non-null  datetime64[us]
 2   dropoff_datetime       1149499 non-null  datetime64[us]
 3   passenger_count        1149499 non-null  int32         
 4   trip_distance          1149499 non-null  float64       
 5   RatecodeID             1149499 non-null  int32         
 6   store_and_fwd_flag     1149499 non-null  object        
 7   pickup_zone            1149499 non-null  int64         
 8   dropoff_zone           1149499 non-null  int64         
 9   payment_type           1149499 non-null  int64         
 10  fare_amount            1149499 non-null  float64       
 11  extra                  1149499 non-null  float64       
 12  mta_tax                1149499 no

In [223]:
def drop_yellow_columns(df):
    """
    Remove the yellow-taxi columns that are not needed after cleaning.

    The DataFrame is modified in place and also returned. Columns from the
    list that are absent from the DataFrame are skipped silently.

    Parameters:
        df (pd.DataFrame): The yellow-taxi DataFrame to trim.

    Returns:
        pd.DataFrame: The same DataFrame object, without the listed columns.
    """
    unwanted = [
        "VendorID", "trip_distance", "RatecodeID", "store_and_fwd_flag",
        "payment_type", "fare_amount", "extra", "mta_tax",
        "improvement_surcharge", "tip_amount", "tolls_amount",
        "total_amount", "congestion_surcharge", "airport_fee",
    ]

    # errors="ignore" makes pandas skip labels that are not present
    df.drop(columns=unwanted, errors="ignore", inplace=True)
    return df

In [224]:
# Trim the fare/vendor columns listed in drop_yellow_columns, keeping the datetime, passenger and zone columns
yellow_2021_01 = drop_yellow_columns(yellow_2021_01)

In [225]:
yellow_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1149499 entries, 0 to 1271416
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   pickup_datetime   1149499 non-null  datetime64[us]
 1   dropoff_datetime  1149499 non-null  datetime64[us]
 2   passenger_count   1149499 non-null  int32         
 3   pickup_zone       1149499 non-null  int64         
 4   dropoff_zone      1149499 non-null  int64         
dtypes: datetime64[us](2), int32(1), int64(2)
memory usage: 48.2 MB


In [226]:
def drop_missing_values(dfs):
    """
    Drops any rows with missing values from one or more DataFrames.

    For every DataFrame processed, the number of rows removed is printed.
    The input DataFrames are not modified; new DataFrames are returned.

    Parameters:
        dfs (pd.DataFrame or list): A single DataFrame, or a list of DataFrames, to clean.

    Returns:
        pd.DataFrame or list: The cleaned DataFrame when exactly one was cleaned,
        otherwise the list of cleaned DataFrames (empty when nothing was cleaned).
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame

    cleaned_dfs = []

    for df in dfs:
        if not isinstance(df, pd.DataFrame):
            # Skip anything that is not a DataFrame instead of failing on it
            print("Warning: The list contains non-DataFrame elements")
            continue

        # Drop rows with missing values
        cleaned = df.dropna()

        # Report how many rows were removed from this DataFrame
        rows_dropped = df.shape[0] - cleaned.shape[0]
        print(f"Number of rows dropped: {rows_dropped}")

        cleaned_dfs.append(cleaned)

    # Return after the whole list has been processed, not after the first element
    if len(cleaned_dfs) == 1:
        return cleaned_dfs[0]
    return cleaned_dfs

In [227]:
# NOTE(review): the returned DataFrame is only displayed, not assigned, so yellow_2021_01 is left unchanged by this call
drop_missing_values(yellow_2021_01)

Number of rows dropped: 0


Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_zone,dropoff_zone
0,2021-01-01 00:30:10,2021-01-01 00:36:12,1,142,43
1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,238,151
4,2021-01-01 00:31:49,2021-01-01 00:48:21,1,68,33
5,2021-01-01 00:16:29,2021-01-01 00:24:30,1,224,68
7,2021-01-01 00:12:29,2021-01-01 00:30:34,1,90,40
...,...,...,...,...,...
1271412,2021-01-31 23:58:47,2021-02-01 00:04:40,3,41,74
1271413,2021-01-31 23:07:54,2021-01-31 23:19:42,1,113,141
1271414,2021-01-31 23:30:45,2021-01-31 23:35:13,1,233,237
1271415,2021-01-31 23:09:52,2021-01-31 23:51:56,2,56,68


In [193]:
def calculate_zone_busy(yellow_df):
    """
    Aggregate passenger counts per zone and hour.

    Each trip contributes its passenger_count twice: once to its pickup zone
    at the pickup time and once to its dropoff zone at the dropoff time.
    Timestamps are rounded to the nearest hour before grouping.

    Parameters:
        yellow_df (pd.DataFrame): Trips with pickup_datetime, dropoff_datetime,
            passenger_count, pickup_zone and dropoff_zone columns.

    Returns:
        pd.DataFrame: Columns datetime, zone and passenger_count (summed).
    """
    # Build one event table per trip end, using shared column names
    events_per_side = []
    for side in ("pickup", "dropoff"):
        time_col = f"{side}_datetime"
        zone_col = f"{side}_zone"
        side_events = yellow_df[[time_col, "passenger_count", zone_col]]
        events_per_side.append(
            side_events.rename(columns={time_col: "datetime", zone_col: "zone"})
        )
    events = pd.concat(events_per_side)

    # Round datetime to the nearest hour
    events["datetime"] = events["datetime"].dt.round("h")

    # Sum passengers for every (hour, zone) pair and flatten the index again
    hourly_totals = events.groupby(["datetime", "zone"])["passenger_count"].sum()
    return hourly_totals.reset_index()

In [194]:
# Rebinds yellow_2021_01 to the hourly per-zone aggregate; the trip-level rows are no longer reachable through this name
yellow_2021_01 = calculate_zone_busy(yellow_2021_01)

In [195]:
yellow_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73636 entries, 0 to 73635
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   datetime         73636 non-null  datetime64[us]
 1   zone             73636 non-null  int64         
 2   passenger_count  73636 non-null  int32         
dtypes: datetime64[us](1), int32(1), int64(1)
memory usage: 1.4 MB


In [196]:
yellow_2021_01.head(10)

Unnamed: 0,datetime,zone,passenger_count
0,2020-10-13 12:00:00,234,1
1,2020-12-31 14:00:00,170,1
2,2020-12-31 14:00:00,226,1
3,2020-12-31 18:00:00,48,1
4,2020-12-31 18:00:00,68,1
5,2020-12-31 18:00:00,142,2
6,2020-12-31 18:00:00,239,1
7,2020-12-31 19:00:00,48,2
8,2020-12-31 19:00:00,90,1
9,2020-12-31 19:00:00,233,1


In [234]:
def passenger_counts(dfs):
    """
    Print how often each distinct passenger_count value occurs.

    Accepts a single DataFrame or a list of DataFrames. Elements that are not
    DataFrames, and DataFrames whose passenger_count column has no values to
    count, produce no output. Nothing is returned.
    """
    frames = [dfs] if isinstance(dfs, pd.DataFrame) else dfs
    for frame in frames:
        if not isinstance(frame, pd.DataFrame):
            continue
        tally = frame['passenger_count'].value_counts()
        if tally.empty:
            continue
        print("Count for each unique value in 'passenger_count' column: in")
        print(tally)

passenger_counts(yellow_2021_01)

Count for each unique value in 'passenger_count' column: in
passenger_count
1    909754
2    153143
3     41637
5     29616
4     15349
Name: count, dtype: int64


In [157]:
directory_path = os.path.join("Datasets", "taxi_other")

# Make sure the output directory exists; to_csv does not create missing parent directories
os.makedirs(directory_path, exist_ok=True)

# Define the file path
file_path = os.path.join(directory_path, "yellow_2021_01_cleaned.csv")

# Save the DataFrame to CSV (index=False leaves the row index out of the file)
yellow_2021_01.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: Datasets\taxi_other\yellow_2021_01_cleaned.csv


I have 156 parquet files, and 39 of these are "yellow" taxi files. This is a lot of data, so I want to load the files one at a time, give each an appropriate name, and clean it before loading the next one, so that not too much memory is used.

In [53]:
fhvhv_2021_02.head(10)

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02764,B02764,2021-01-31 23:59:00,2021-02-01 00:10:19,2021-02-01 00:10:40,2021-02-01 00:21:09,35,39,2.06,...,1.52,0.0,,0.0,9.79,N,N,,N,N
1,HV0003,B02764,B02764,2021-02-01 00:13:35,2021-02-01 00:25:23,2021-02-01 00:27:23,2021-02-01 00:44:01,39,35,3.15,...,2.85,0.0,,0.0,24.01,N,N,,N,N
2,HV0005,B02510,,2021-02-01 00:12:55,NaT,2021-02-01 00:28:38,2021-02-01 00:38:27,39,91,1.776,...,1.12,0.0,,0.0,6.91,N,N,N,N,N
3,HV0005,B02510,,2021-02-01 00:36:01,NaT,2021-02-01 00:43:37,2021-02-01 01:23:20,91,228,13.599,...,2.91,0.0,,7.0,35.05,N,N,N,N,N
4,HV0003,B02872,B02872,2021-01-31 23:57:50,2021-02-01 00:08:25,2021-02-01 00:08:42,2021-02-01 00:17:57,126,250,2.62,...,1.38,0.0,,0.0,8.53,N,N,,N,N
5,HV0003,B02872,B02872,2021-02-01 00:11:48,2021-02-01 00:24:25,2021-02-01 00:26:02,2021-02-01 00:42:51,208,243,6.89,...,1.77,0.0,,0.0,16.05,N,N,,N,N
6,HV0003,B02872,B02872,2021-02-01 00:39:45,2021-02-01 00:44:57,2021-02-01 00:45:50,2021-02-01 01:02:50,243,220,4.26,...,3.76,0.0,,0.0,25.42,N,N,,N,N
7,HV0003,B02764,B02764,2021-01-31 23:55:59,2021-02-01 00:04:42,2021-02-01 00:06:42,2021-02-01 00:31:50,49,37,2.95,...,2.4,0.0,,0.0,22.29,N,N,,N,N
8,HV0003,B02764,B02764,2021-02-01 00:27:54,2021-02-01 00:33:12,2021-02-01 00:34:34,2021-02-01 00:58:13,37,76,3.41,...,2.03,0.0,,0.0,23.77,N,N,,N,N
9,HV0005,B02510,,2021-01-31 23:56:04,NaT,2021-02-01 00:03:43,2021-02-01 00:39:37,80,241,15.998,...,4.44,0.0,,0.0,35.8,N,N,N,N,N


In [24]:
fhvhv_2021_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11613942 entries, 0 to 11613941
Data columns (total 24 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hvfhs_license_num     object        
 1   dispatching_base_num  object        
 2   originating_base_num  object        
 3   request_datetime      datetime64[us]
 4   on_scene_datetime     datetime64[us]
 5   pickup_datetime       datetime64[us]
 6   dropoff_datetime      datetime64[us]
 7   PULocationID          int64         
 8   DOLocationID          int64         
 9   trip_miles            float64       
 10  trip_time             int64         
 11  base_passenger_fare   float64       
 12  tolls                 float64       
 13  bcf                   float64       
 14  sales_tax             float64       
 15  congestion_surcharge  float64       
 16  airport_fee           float64       
 17  tips                  float64       
 18  driver_pay            float64       
 19

In [25]:
fhv_2021_02.head(10)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,,61.0,,B00037
6,B00112,2021-02-01 00:30:25,2021-02-01 00:57:23,,26.0,,B00112
7,B00149,2021-02-01 00:43:16,2021-02-01 01:03:16,,72.0,,B00149
8,B00221,2021-02-01 00:20:45,2021-02-01 00:21:15,,244.0,,B00221
9,B00225,2021-02-01 00:23:27,2021-02-01 00:55:46,,169.0,,B00225


In [26]:
fhv_2021_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037692 entries, 0 to 1037691
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1037692 non-null  object        
 1   pickup_datetime         1037692 non-null  datetime64[us]
 2   dropOff_datetime        1037692 non-null  datetime64[us]
 3   PUlocationID            153001 non-null   float64       
 4   DOlocationID            885340 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1037692 non-null  object        
dtypes: datetime64[us](2), float64(2), object(3)
memory usage: 55.4+ MB


Functions for renaming the columns of the 4 collections of datasets to standard names, which will ease the cleaning process

In [None]:
def renaming_fhv_to_standard(df_list):
    """
    Rename datetime and location columns to the standard names, in place.

    The mapping covers both spellings of the location columns that appear in
    the TLC files ("PUlocationID"/"DOlocationID" and
    "PULocationID"/"DOLocationID"). Names that are absent are ignored.

    Parameters:
        df_list (list): DataFrames to standardise. Each one is modified in place.

    Returns:
        list: The same list that was passed in, for convenient chaining.
    """
    standard_names = {
        "dropOff_datetime": "dropoff_datetime",
        "PUlocationID": "pickup_zone",
        "PULocationID": "pickup_zone",
        "DOlocationID": "dropoff_zone",
        "DOLocationID": "dropoff_zone",
    }

    for df in df_list:
        # Rename in place so the frames held by the caller's list are really updated
        df.rename(columns=standard_names, inplace=True)

        # These two columns are not present in every dataset, so a missing one must not raise
        df.drop(columns=["ehail_fee", "trip_type"], errors="ignore", inplace=True)

    return df_list

- Data (Columns) Kept:

- https://www.nyc.gov/assets/tlc/images/content/pages/about/taxi_zone_map_manhattan.jpg

- Yellow = pickup_datetime, dropoff_datetime, passenger_count, pickup_zone and dropoff_zone
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf 

- Green = pickup_datetime, dropoff_datetime, passenger_count, pickup_zone, dropoff_zone
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf 

- FHV = pickup_datetime, dropoff_datetime, pickup_zone, dropoff_zone, passenger_count (added)
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf 

- fhvhv = pickup_datetime, dropoff_datetime, pickup_zone, dropoff_zone, passenger_count (added)
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf

In [None]:
def drop_fhv_columns(df):
    """
    Remove the dispatching base number and SR_Flag columns from an FHV DataFrame.

    Column names are matched case-insensitively, because the FHV files in this
    notebook spell the column "dispatching_base_num" (lowercase "d"). Columns
    that are absent are skipped. The DataFrame is modified in place.

    Parameters:
        df (pd.DataFrame): The FHV DataFrame to trim.

    Returns:
        pd.DataFrame: The same DataFrame object, without the matched columns.
    """
    unwanted = {"dispatching_base_num", "sr_flag"}

    # Compare lowercased names so either capitalisation of a column is caught
    present = [col for col in df.columns if str(col).lower() in unwanted]
    df.drop(columns=present, inplace=True)
    return df
        

In [None]:
def drop_fhvhv_columns(df):
    """
    Remove the FHVHV columns that are not needed after cleaning.

    Column names are matched case-insensitively, because the FHVHV files in
    this notebook spell the columns "hvfhs_license_num" and
    "dispatching_base_num" in lowercase. Columns that are absent are skipped.
    The DataFrame is modified in place.

    Parameters:
        df (pd.DataFrame): The FHVHV DataFrame to trim.

    Returns:
        pd.DataFrame: The same DataFrame object, without the matched columns.
    """
    unwanted = {
        "hvfhs_license_num", "dispatching_base_num", "originating_base_num",
        "request_datetime", "on_scene_datetime", "trip_miles", "trip_time",
        "base_passenger_fare", "tolls", "bcf", "sales_tax",
        "congestion_surcharge", "airport_fee", "tips", "driver_pay",
        "shared_request_flag", "shared_match_flag", "access_a_ride_flag",
        "wav_request_flag", "wav_match_flag",
    }

    # Compare lowercased names so either capitalisation of a column is caught
    present = [col for col in df.columns if str(col).lower() in unwanted]
    df.drop(columns=present, inplace=True)
    return df



In [45]:
def load_parquet_files(file_list):
    """
    Read every parquet file in a list into its own pandas DataFrame.

    Parameters:
    file_list (list of str): Paths of the parquet files to read.

    Returns:
    List of pandas DataFrames, one per path, in the same order as file_list.
    """
    return [pd.read_parquet(path, engine='pyarrow') for path in file_list]

In [None]:
# Paths grouped by dataset; the key order is also the order in which prefixes are tested
file_categories = {
    "fhv": [],
    "fhvhv": [],
    "green": [],
    "yellow": []
}
for file in all_files:
    # Match on the file name only, so a directory name that happens to contain
    # e.g. "yellow_" cannot push a file into the wrong category
    file_name = os.path.basename(file)
    for category in file_categories:
        if f"{category}_" in file_name:
            file_categories[category].append(file)
            break  # first matching category wins

# Print the categorized file lists
print("FHV Files:", file_categories["fhv"])
print("FHVHV Files:", file_categories["fhvhv"])
print("Green Files:", file_categories["green"])
print("Yellow Files:", file_categories["yellow"])

In [None]:
# Load every file of each category into a list of DataFrames.
# NOTE(review): this holds all loaded frames in memory at the same time - confirm that is acceptable for the full set of files
fhv_dfs = load_parquet_files(file_categories["fhv"])
fhvhv_dfs = load_parquet_files(file_categories["fhvhv"])
green_dfs = load_parquet_files(file_categories["green"])
yellow_dfs = load_parquet_files(file_categories["yellow"])


In [None]:
# Pair each display label with the list of DataFrames loaded for that category
loaded_by_label = (
    ("FHV", fhv_dfs),
    ("FHVHV", fhvhv_dfs),
    ("Green", green_dfs),
    ("Yellow", yellow_dfs),
)

# Report how many DataFrames were loaded per category
for label, frames in loaded_by_label:
    print(f"{label} DataFrames Loaded: {len(frames)}")

# Sanity check: show the first rows of the first DataFrame of every non-empty category
for label, frames in loaded_by_label:
    if frames:
        print(f"{label} DataFrame Sample:", frames[0].head())