"""
File for Cleaning Taxi Data.
Begin by loading 1 parquet file as pandas dataframe from each of the 4 TLC genres.
Look at each of the dataframes, as a csv and through python
Implement Crisp-DM data cleaning methodology -> Data Quality Report, Data Quality Plan

"""

In [48]:
import os
import pandas as pd
import glob


- TLC = Taxi and Limousine Commission

- Yellow = hail or prearrange

- Green = Not permitted to pick up in certain Manhattan areas (below 110th St. on the West Side, and below 96th St. on the East Side) or at either LaGuardia or JFK airports

- FHV (For Hire Vehicles) = Prearranged, Limousines, Black Cars, Livery (Regular), FHVHV

- **FHVHV**/ HVFHV/ HVFHS (For Hire Vehicle High Volume/ High Volume FHV/ High Volume For Hire Service) = "FHV Bases/ Businesses that dispatch more than 10,000 trips per day" = Lyft/ Uber/ Juno/ Via

- See also:
    https://www.nyc.gov/site/tlc/passengers/your-ride.page

In [49]:
# Locate every parquet file under <cwd>/Datasets/taxi_parquets.
data_dir = os.path.join(os.getcwd(), "Datasets/taxi_parquets")
parquet_pattern = os.path.join(data_dir, "*.parquet")
all_files = glob.glob(parquet_pattern)
print("All files found:", all_files)

All files found: ['c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-01.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-02.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-03.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-04.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-05.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-06.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\Datasets/taxi_parquets\\fhvhv_2021-07.parquet', 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_

- **Data Quality Report**
- **Data Integrity Checks** (To-Do List)


- Check 1: Trips with 0 passengers = doesn't represent "Busy-ness", drop from dataset.
- Check 2: Negative Fare Amounts = Refunds potentially, maybe not valid trips, can't be sure these datapoints are valid for representing "Busy-ness", drop from dataset.
- Check 3: Pickup Time == Dropoff time -> No time has elapsed (not a case of Store_and_fwd_flag)... Valid Trips? Further checks needed
    Further Checks:
        -> trip_distance is variable -> Journeys 0 Miles to 13+ Miles (in 0 minutes)
        -> total_amount is variable -> fares costing $6 - $48+ (in 0 minutes)
        -> difficult to trust these datapoints, drop from dataset where Pickup Time == Dropoff time
- Check 4: dropoff time < pickup time = time travel, drop from dataset.
- Check 5: negative trip distance, drop from dataset.
- Check 6: Invalid datapoints for that month (e.g. Non-Jan-2021 Months in Jan-2021 file), drop from dataset.
- Check 7: Non Manhattan locations -> pickup or dropoff not in Manhattan Zones (use accompanying "taxi_zone_lookup.csv" file), drop from dataset.
- Check 8: RateCodeID != 1-6 (Value of "99" present), drop from dataset.
- Check 9: payment type == 4 (Dispute) -> unpaid, mispaid, late charge -> often negative fares (refund?) questionable validity as to if the trip was accurately captured, drop from dataset.
- Check 10: Extra -> should only be 0.5 or 1 or 1.5 (rush hour into overnight or vice versa), drop from dataset.
- Check 11: if MTA Tax != 50c drop from dataset.
- Check 12: improvement_surcharge != 30c -> invalid pre dec 19, 2022, != $1 on/ after Dec 19 2022 -> https://www.nyc.gov/site/tlc/passengers/taxi-fare.page , drop from dataset.
- Check 13: Tip amount < 0 == potential data entry error for that row, software bug, refund/ adjustment/ rectify errors, drop from dataset.
- Check 14: Tolls amount < 0 == data entry error, drop from dataset.
- Check 15: congestion_surcharge != 2.5 or 0.75 for yellow taxis -> "Plus New York State Congestion Surcharge of $2.50 (Yellow Taxi) or $2.75 (Green Taxi and FHV) or 75 cents (any shared ride) for all trips that begin, end or pass through Manhattan south of 96th Street.", drop from dataset.
- Check 16: Airport_fee == 1.25, provided trip zone ends in Manhattan
- Check 17: Yellow Taxis, no more than 5 passengers

See also: 
    https://rules.cityofnewyork.us/rule/taximeter-rate-of-fare-and-various-surcharges/
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf
    https://www.nyc.gov/site/tlc/passengers/passenger-frequently-asked-questions.page 
    https://www.nyc.gov/site/tlc/passengers/taxi-fare.page 
    https://www.nyc.gov/assets/tlc/downloads/pdf/archived_public_notices/public_notice_09_17_09.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf 
    https://data.cityofnewyork.us/City-Government/NTA-map/d3qk-pfyz
    

In [51]:
# Echo the repository-relative location of each sample parquet file.
for message, location in (
    ("File path for fhvhv_2021_02:", r"/data-analytics/Datasets/taxi_parquets/fhvhv_2021-02.parquet"),
    ("File path for fhv_2021_02:", r"/data-analytics/Datasets/taxi_parquets/fhv_2021-02.parquet"),
    ("File path for green_2021_02:", r"/data-analytics/Datasets/taxi_parquets/green_2021-02.parquet"),
    ("File path for yellow_2021_01:", r"/data-analytics/Datasets/taxi_parquets/yellow_2021-01.parquet"),
):
    print(message, location)


File path for fhvhv_2021_02: /data-analytics/Datasets/taxi_parquets/fhvhv_2021-02.parquet
File path for fhv_2021_02: /data-analytics/Datasets/taxi_parquets/fhv_2021-02.parquet
File path for green_2021_02: /data-analytics/Datasets/taxi_parquets/green_2021-02.parquet
File path for yellow_2021_01: /data-analytics/Datasets/taxi_parquets/yellow_2021-01.parquet


In [20]:
"""
Build the path to one sample parquet file for each of the 4 TLC genres.
Every path is assembled with os.path.join from the current working directory
and printed so it can be checked by eye before loading.
"""

# Everything below is resolved relative to where the notebook is running.
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Location of the parquet files, relative to the working directory.
data_dir = "Datasets/taxi_parquets"
print("Data Directory:", data_dir)

# One sample file per genre, joined onto <cwd>/<data_dir>.
fhvhv_2021_02_path, fhv_2021_02_path, green_2021_02_path, yellow_2021_01_path = (
    os.path.join(cwd, data_dir, sample_name)
    for sample_name in (
        "fhvhv_2021-02.parquet",
        "fhv_2021-02.parquet",
        "green_2021-02.parquet",
        "yellow_2021-01.parquet",
    )
)

# Show the constructed paths for verification.
for path_label, sample_path in (
    ("fhvhv_2021_02_path:", fhvhv_2021_02_path),
    ("fhv_2021_02_path:", fhv_2021_02_path),
    ("green_2021_02_path:", green_2021_02_path),
    ("yellow_2021_01_path:", yellow_2021_01_path),
):
    print(path_label, sample_path)



Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics
Data Directory: Datasets/taxi_parquets
fhvhv_2021_02_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\fhvhv_2021-02.parquet
fhv_2021_02_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\fhv_2021-02.parquet
green_2021_02_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\green_2021-02.parquet
yellow_2021_01_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\Datasets/taxi_parquets\yellow_2021-01.parquet


In [32]:
# Load one sample month per TLC genre from the paths built earlier,
# always through the pyarrow engine.
fhvhv_2021_02 = pd.read_parquet(path=fhvhv_2021_02_path, engine="pyarrow")
fhv_2021_02 = pd.read_parquet(path=fhv_2021_02_path, engine="pyarrow")
green_2021_02 = pd.read_parquet(path=green_2021_02_path, engine="pyarrow")
yellow_2021_01 = pd.read_parquet(path=yellow_2021_01_path, engine="pyarrow")

In [22]:
"""
Export the four sample dataframes to CSV so they can be inspected outside pandas.
The target directory must already exist; the cell aborts with OSError otherwise.
(Runtime = ~2 mins)
"""

directory_path = os.path.join("Datasets", "taxi_other")

# Show where the CSVs will be written.
print("Directory path to save CSV files:", directory_path)

# Fail before any export starts if the target directory is missing.
if not os.path.isdir(directory_path):
    raise OSError(f"Directory does not exist: '{directory_path}'")

# Destination file for each sample.
yellow_file_path = os.path.join(directory_path, "yellow_2021_01.csv")
green_file_path = os.path.join(directory_path, "green_2021_02.csv")
fhv_file_path = os.path.join(directory_path, "fhv_2021_02.csv")
fhvhv_file_path = os.path.join(directory_path, "fhvhv_2021_02.csv")

# Write each dataframe without its index column, in the same order as before.
for sample_frame, csv_path in (
    (yellow_2021_01, yellow_file_path),
    (green_2021_02, green_file_path),
    (fhv_2021_02, fhv_file_path),
    (fhvhv_2021_02, fhvhv_file_path),
):
    sample_frame.to_csv(csv_path, index=False)



Directory path to save CSV files: Datasets\taxi_other


**Investigating Yellow Taxi Datasets**

In [54]:
# Preview the first 10 rows of the January 2021 yellow-taxi sample.
yellow_2021_01.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.6,1.0,N,224,68,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,
6,1,2021-01-01 00:00:28,2021-01-01 00:17:28,1.0,4.1,1.0,N,95,157,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3,0.0,
7,1,2021-01-01 00:12:29,2021-01-01 00:30:34,1.0,5.7,1.0,N,90,40,2,18.0,3.0,0.5,0.0,0.0,0.3,21.8,2.5,
8,1,2021-01-01 00:39:16,2021-01-01 01:00:13,1.0,9.1,1.0,N,97,129,4,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0,
9,1,2021-01-01 00:26:12,2021-01-01 00:39:46,2.0,2.7,1.0,N,263,142,1,12.0,3.0,0.5,3.15,0.0,0.3,18.95,2.5,


In [55]:
# Summarise the yellow-taxi sample: row count, columns, non-null counts and dtypes.
yellow_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369769 entries, 0 to 1369768
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1369769 non-null  int64         
 1   tpep_pickup_datetime   1369769 non-null  datetime64[us]
 2   tpep_dropoff_datetime  1369769 non-null  datetime64[us]
 3   passenger_count        1271417 non-null  float64       
 4   trip_distance          1369769 non-null  float64       
 5   RatecodeID             1271417 non-null  float64       
 6   store_and_fwd_flag     1271417 non-null  object        
 7   PULocationID           1369769 non-null  int64         
 8   DOLocationID           1369769 non-null  int64         
 9   payment_type           1369769 non-null  int64         
 10  fare_amount            1369769 non-null  float64       
 11  extra                  1369769 non-null  float64       
 12  mta_tax                13697

In [68]:
# Tally how many yellow-taxi trips were recorded for each distinct passenger_count.
passenger_count_counts = yellow_2021_01['passenger_count'].value_counts()

# Header and tally, each on its own line.
print(
    "Count for each unique value in 'passenger_count' column:",
    passenger_count_counts,
    sep="\n",
)

Count for each unique value in 'passenger_count' column:
passenger_count
1.0    966236
2.0    161671
3.0     43935
5.0     31089
0.0     26726
6.0     25362
4.0     16391
7.0         5
8.0         2
Name: count, dtype: int64


- **Data Quality Report**
- **Data Integrity Checks**

Investigating the CSVs revealed a number of data inconsistencies. They are summarised below:


- 1: There are a number of trips with 0 passengers, as well as trips with more than 5 passengers. According to the TLC data dictionary, 5 is the maximum number of passengers allowed. 0 might represent trips that didn't occur or were cancelled, or serve another purpose. As our goal is to track Busy-ness in NYC, values of 0, 6, 7 and 8 cannot be counted as valid.
    - Drop rows from dataset where "passenger_count" == 0, 6, 7, 8.

- Check 2: Negative Fare Amounts = Refunds potentially, maybe not valid trips, can't be sure these datapoints are valid for representing "Busy-ness", drop from dataset.
- Check 3: Pickup Time == Dropoff time -> No time has elapsed (not a case of Store_and_fwd_flag)... Valid Trips? Further checks needed
    Further Checks:
        -> trip_distance is variable -> Journeys 0 Miles to 13+ Miles (in 0 minutes)
        -> total_amount is variable -> fares costing $6 - $48+ (in 0 minutes)
        -> difficult to trust these datapoints, drop from dataset where Pickup Time == Dropoff time
- Check 4: dropoff time < pickup time = time travel, drop from dataset.
- Check 5: negative trip distance, drop from dataset.
- Check 6: Invalid datapoints for that month (e.g. Non-Jan-2021 Months in Jan-2021 file), drop from dataset.
- Check 7: Non Manhattan locations -> pickup or dropoff not in Manhattan Zones (use accompanying "taxi_zone_lookup.csv" file), drop from dataset.
- Check 8: RateCodeID != 1-6 (Value of "99" present), drop from dataset.
- Check 9: payment type == 4 (Dispute) -> unpaid, mispaid, late charge -> often negative fares (refund?) questionable validity as to if the trip was accurately captured, drop from dataset.
- Check 10: Extra -> should only be 0.5 or 1 or 1.5 (rush hour into overnight or vice versa), drop from dataset.
- Check 11: if MTA Tax != 50c drop from dataset.
- Check 12: improvement_surcharge != 30c -> invalid pre dec 19, 2022, != $1 on/ after Dec 19 2022 -> https://www.nyc.gov/site/tlc/passengers/taxi-fare.page , drop from dataset.
- Check 13: Tip amount < 0 == potential data entry error for that row, software bug, refund/ adjustment/ rectify errors, drop from dataset.
- Check 14: Tolls amount < 0 == data entry error, drop from dataset.
- Check 15: congestion_surcharge != 2.5 or 0.75 for yellow taxis -> "Plus New York State Congestion Surcharge of $2.50 (Yellow Taxi) or $2.75 (Green Taxi and FHV) or 75 cents (any shared ride) for all trips that begin, end or pass through Manhattan south of 96th Street.", drop from dataset.
- Check 16: Airport_fee == 1.25, provided trip zone ends in Manhattan

See also: 
    https://rules.cityofnewyork.us/rule/taximeter-rate-of-fare-and-various-surcharges/
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf
    https://www.nyc.gov/site/tlc/passengers/passenger-frequently-asked-questions.page 
    https://www.nyc.gov/site/tlc/passengers/taxi-fare.page 
    https://www.nyc.gov/assets/tlc/downloads/pdf/archived_public_notices/public_notice_09_17_09.pdf
    https://www.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf 
    https://data.cityofnewyork.us/City-Government/NTA-map/d3qk-pfyz
    

- **Data Quality Plan**

- Check 1: Trips with 0 passengers = doesn't represent "Busy-ness", drop from dataset.
- Check 2: Negative Fare Amounts = Refunds potentially, maybe not valid trips, can't be sure these datapoints are valid for representing "Busy-ness", drop from dataset.
- Check 3: Pickup Time == Dropoff time -> No time has elapsed (not a case of Store_and_fwd_flag)... Valid Trips? Further checks needed
    Further Checks:
        -> trip_distance is variable -> Journeys 0 Miles to 13+ Miles (in 0 minutes)
        -> total_amount is variable -> fares costing $6 - $48+ (in 0 minutes)
        -> difficult to trust these datapoints, drop from dataset where Pickup Time == Dropoff time
- Check 4: dropoff time < pickup time = time travel, drop from dataset.
- Check 5: negative trip distance, drop from dataset.
- Check 6: Invalid datapoints for that month (e.g. Non-Jan-2021 Months in Jan-2021 file), drop from dataset.
- Check 7: Non Manhattan locations -> pickup or dropoff not in Manhattan Zones (use accompanying "taxi_zone_lookup.csv" file), drop from dataset.
- Check 8: RateCodeID != 1-6 (Value of "99" present), drop from dataset.
- Check 9: payment type == 4 (Dispute) -> unpaid, mispaid, late charge -> often negative fares (refund?) questionable validity as to if the trip was accurately captured, drop from dataset.
- Check 10: Extra -> should only be 0.5 or 1 or 1.5 (rush hour into overnight or vice versa), drop from dataset.
- Check 11: if MTA Tax != 50c drop from dataset.
- Check 12: improvement_surcharge != 30c -> invalid pre dec 19, 2022, != $1 on/ after Dec 19 2022 -> https://www.nyc.gov/site/tlc/passengers/taxi-fare.page , drop from dataset.
- Check 13: Tip amount < 0 == potential data entry error for that row, software bug, refund/ adjustment/ rectify errors, drop from dataset.
- Check 14: Tolls amount < 0 == data entry error, drop from dataset.
- Check 15: congestion_surcharge != 2.5 or 0.75 for yellow taxis -> "Plus New York State Congestion Surcharge of $2.50 (Yellow Taxi) or $2.75 (Green Taxi and FHV) or 75 cents (any shared ride) for all trips that begin, end or pass through Manhattan south of 96th Street.", drop from dataset.
- Check 16: Airport_fee == 1.25, provided trip zone ends in Manhattan
- Check 17: Yellow Taxis, no more than 5 passengers

In [53]:
# Preview the first 10 rows of the February 2021 FHVHV sample.
fhvhv_2021_02.head(10)

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
0,HV0003,B02764,B02764,2021-01-31 23:59:00,2021-02-01 00:10:19,2021-02-01 00:10:40,2021-02-01 00:21:09,35,39,2.06,...,1.52,0.0,,0.0,9.79,N,N,,N,N
1,HV0003,B02764,B02764,2021-02-01 00:13:35,2021-02-01 00:25:23,2021-02-01 00:27:23,2021-02-01 00:44:01,39,35,3.15,...,2.85,0.0,,0.0,24.01,N,N,,N,N
2,HV0005,B02510,,2021-02-01 00:12:55,NaT,2021-02-01 00:28:38,2021-02-01 00:38:27,39,91,1.776,...,1.12,0.0,,0.0,6.91,N,N,N,N,N
3,HV0005,B02510,,2021-02-01 00:36:01,NaT,2021-02-01 00:43:37,2021-02-01 01:23:20,91,228,13.599,...,2.91,0.0,,7.0,35.05,N,N,N,N,N
4,HV0003,B02872,B02872,2021-01-31 23:57:50,2021-02-01 00:08:25,2021-02-01 00:08:42,2021-02-01 00:17:57,126,250,2.62,...,1.38,0.0,,0.0,8.53,N,N,,N,N
5,HV0003,B02872,B02872,2021-02-01 00:11:48,2021-02-01 00:24:25,2021-02-01 00:26:02,2021-02-01 00:42:51,208,243,6.89,...,1.77,0.0,,0.0,16.05,N,N,,N,N
6,HV0003,B02872,B02872,2021-02-01 00:39:45,2021-02-01 00:44:57,2021-02-01 00:45:50,2021-02-01 01:02:50,243,220,4.26,...,3.76,0.0,,0.0,25.42,N,N,,N,N
7,HV0003,B02764,B02764,2021-01-31 23:55:59,2021-02-01 00:04:42,2021-02-01 00:06:42,2021-02-01 00:31:50,49,37,2.95,...,2.4,0.0,,0.0,22.29,N,N,,N,N
8,HV0003,B02764,B02764,2021-02-01 00:27:54,2021-02-01 00:33:12,2021-02-01 00:34:34,2021-02-01 00:58:13,37,76,3.41,...,2.03,0.0,,0.0,23.77,N,N,,N,N
9,HV0005,B02510,,2021-01-31 23:56:04,NaT,2021-02-01 00:03:43,2021-02-01 00:39:37,80,241,15.998,...,4.44,0.0,,0.0,35.8,N,N,N,N,N


In [24]:
# Summarise the FHVHV sample: row count, columns and dtypes.
fhvhv_2021_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11613942 entries, 0 to 11613941
Data columns (total 24 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hvfhs_license_num     object        
 1   dispatching_base_num  object        
 2   originating_base_num  object        
 3   request_datetime      datetime64[us]
 4   on_scene_datetime     datetime64[us]
 5   pickup_datetime       datetime64[us]
 6   dropoff_datetime      datetime64[us]
 7   PULocationID          int64         
 8   DOLocationID          int64         
 9   trip_miles            float64       
 10  trip_time             int64         
 11  base_passenger_fare   float64       
 12  tolls                 float64       
 13  bcf                   float64       
 14  sales_tax             float64       
 15  congestion_surcharge  float64       
 16  airport_fee           float64       
 17  tips                  float64       
 18  driver_pay            float64       
 19

In [25]:
# Preview the first 10 rows of the February 2021 FHV sample.
fhv_2021_02.head(10)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,,61.0,,B00037
6,B00112,2021-02-01 00:30:25,2021-02-01 00:57:23,,26.0,,B00112
7,B00149,2021-02-01 00:43:16,2021-02-01 01:03:16,,72.0,,B00149
8,B00221,2021-02-01 00:20:45,2021-02-01 00:21:15,,244.0,,B00221
9,B00225,2021-02-01 00:23:27,2021-02-01 00:55:46,,169.0,,B00225


In [26]:
# Summarise the FHV sample: row count, columns, non-null counts and dtypes.
fhv_2021_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037692 entries, 0 to 1037691
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1037692 non-null  object        
 1   pickup_datetime         1037692 non-null  datetime64[us]
 2   dropOff_datetime        1037692 non-null  datetime64[us]
 3   PUlocationID            153001 non-null   float64       
 4   DOlocationID            885340 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1037692 non-null  object        
dtypes: datetime64[us](2), float64(2), object(3)
memory usage: 55.4+ MB


In [27]:
# Preview the first 10 rows of the February 2021 green-taxi sample.
green_2021_02.head(10)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-02-01 00:34:03,2021-02-01 00:51:58,N,1.0,130,205,5.0,3.66,14.0,0.5,0.5,10.0,0.0,,0.3,25.3,1.0,1.0,0.0
1,2,2021-02-01 00:04:00,2021-02-01 00:10:30,N,1.0,152,244,1.0,1.1,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0
2,2,2021-02-01 00:18:51,2021-02-01 00:34:06,N,1.0,152,48,1.0,4.93,16.5,0.5,0.5,0.0,0.0,,0.3,20.55,2.0,1.0,2.75
3,2,2021-02-01 00:53:27,2021-02-01 01:11:41,N,1.0,152,241,1.0,6.7,21.0,0.5,0.5,0.0,0.0,,0.3,22.3,2.0,1.0,0.0
4,2,2021-02-01 00:57:46,2021-02-01 01:06:44,N,1.0,75,42,1.0,1.89,8.5,0.5,0.5,2.45,0.0,,0.3,12.25,1.0,1.0,0.0
5,2,2021-02-01 00:33:03,2021-02-01 00:40:54,N,5.0,197,219,1.0,3.3,15.0,0.0,0.0,0.76,0.0,,0.3,16.06,1.0,2.0,0.0
6,2,2021-02-01 00:18:43,2021-02-01 00:28:25,N,1.0,82,223,1.0,2.51,10.0,0.5,0.5,1.5,0.0,,0.3,12.8,1.0,1.0,0.0
7,2,2021-02-01 00:06:50,2021-02-01 00:18:07,N,1.0,130,130,1.0,1.68,9.0,0.5,0.5,5.08,0.0,,0.3,15.38,1.0,1.0,0.0
8,2,2021-02-01 01:25:20,2021-02-01 01:34:04,N,1.0,94,241,1.0,1.44,7.5,0.5,0.5,0.0,0.0,,0.3,8.8,2.0,1.0,0.0
9,2,2021-02-01 02:56:55,2021-02-01 02:58:38,N,5.0,78,78,1.0,0.0,24.0,0.0,0.0,0.0,0.0,,0.3,24.3,2.0,2.0,0.0


In [28]:
# Summarise the green-taxi sample: row count, columns, non-null counts and dtypes.
green_2021_02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64572 entries, 0 to 64571
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               64572 non-null  int64         
 1   lpep_pickup_datetime   64572 non-null  datetime64[us]
 2   lpep_dropoff_datetime  64572 non-null  datetime64[us]
 3   store_and_fwd_flag     35463 non-null  object        
 4   RatecodeID             35463 non-null  float64       
 5   PULocationID           64572 non-null  int64         
 6   DOLocationID           64572 non-null  int64         
 7   passenger_count        35463 non-null  float64       
 8   trip_distance          64572 non-null  float64       
 9   fare_amount            64572 non-null  float64       
 10  extra                  64572 non-null  float64       
 11  mta_tax                64572 non-null  float64       
 12  tip_amount             64572 non-null  float64       
 13  t

Functions for renaming the columns of the 4 collections of datasets to standard names, which will ease the cleaning process

In [None]:
def renaming_yellow_to_standard(df_list):
    """
    Rename yellow-taxi columns to the standard names used for cleaning.

    The rename is applied in place, so the DataFrames held by the caller are
    the ones that change. Columns missing from a frame are left untouched.

    Parameters:
    df_list (list of pandas.DataFrame): Yellow-taxi DataFrames to rename.

    Returns:
    The same list that was passed in, now holding the renamed DataFrames.
    """
    standard_names = {
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone',
    }
    for df in df_list:
        # Without inplace=True, rename() returns a new frame and the caller's
        # DataFrame keeps its old column names.
        df.rename(columns=standard_names, inplace=True)
        # NOTE: airport_fee is not dropped here (that step is disabled).
    return df_list

In [None]:
def renaming_green_to_standard(df_list):
    """
    Rename green-taxi columns to the standard names used for cleaning.

    The rename is applied in place, so the DataFrames held by the caller are
    the ones that change. Columns missing from a frame are left untouched.

    Parameters:
    df_list (list of pandas.DataFrame): Green-taxi DataFrames to rename.

    Returns:
    The same list that was passed in, now holding the renamed DataFrames.
    """
    standard_names = {
        'lpep_pickup_datetime': 'pickup_datetime',
        'lpep_dropoff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone',
    }
    for df in df_list:
        # Without inplace=True, rename() returns a new frame and the caller's
        # DataFrame keeps its old column names.
        df.rename(columns=standard_names, inplace=True)
        # NOTE: ehail_fee and trip_type are not dropped here (that step is disabled).
    return df_list

In [None]:
def renaming_fhv_to_standard(df_list):
    """
    Rename FHV columns to the standard names used for cleaning.

    The rename is applied in place, so the DataFrames held by the caller are
    the ones that change. Both spellings of the location columns are mapped
    ("PULocationID" and "PUlocationID", likewise for drop-off), since the FHV
    files use the lower-case "l" variant.

    Parameters:
    df_list (list of pandas.DataFrame): FHV DataFrames to rename.

    Returns:
    The same list that was passed in, now holding the renamed DataFrames.
    """
    standard_names = {
        'dropOff_datetime': 'dropoff_datetime',
        'PULocationID': 'pickup_zone',
        'PUlocationID': 'pickup_zone',
        'DOLocationID': 'dropoff_zone',
        'DOlocationID': 'dropoff_zone',
    }
    for df in df_list:
        # Without inplace=True, rename() returns a new frame and the caller's
        # DataFrame keeps its old column names.
        df.rename(columns=standard_names, inplace=True)
        # These two columns are not part of the FHV schema; errors="ignore"
        # removes them when present instead of raising KeyError when absent.
        df.drop(columns=["ehail_fee", "trip_type"], inplace=True, errors="ignore")
    return df_list

- Data (Columns) Kept:

- https://www.nyc.gov/assets/tlc/images/content/pages/about/taxi_zone_map_manhattan.jpg

- Yellow = pickup_datetime, dropoff_datetime, Passenger_count, pickup_zone and dropoff_zone
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf 

- Green = pickup_datetime, dropoff_datetime, Passenger_count, pickup_zone, dropoff_zone
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf 

- FHV = pickup_datetime, dropoff_datetime, pickup_zone, dropoff_zone, Passenger_count (added)
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf 

- fhvhv = pickup_datetime, dropoff_datetime, pickup_zone, dropoff_zone, Passenger_count (added)
- https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_hvfhs.pdf

In [None]:
def integrity_check_1(df):
    """
    Remove, in place, every row whose passenger_count is zero or negative.

    Prints how many rows were flagged and the frame's shape before and after.

    Parameters:
    df (pandas.DataFrame): Trip data with a "passenger_count" column.
    """
    no_passengers = df["passenger_count"] <= 0
    flagged = df[no_passengers].index
    print("Number of bad Datapoints for integrity_check_1", len(flagged))
    print("Shape before dropping", df.shape)
    df.drop(index=flagged, inplace=True)
    print("Shape after dropping", df.shape)

In [None]:
def integrity_check_2(df):
    """
    Remove, in place, every row whose total_amount is zero or negative.

    Prints how many rows were flagged and the frame's shape before and after.

    Parameters:
    df (pandas.DataFrame): Trip data with a "total_amount" column.
    """
    not_charged = df["total_amount"] <= 0
    flagged = df[not_charged].index
    print("Number of bad Datapoints for integrity_check_2", len(flagged))
    print("Shape before dropping", df.shape)
    df.drop(index=flagged, inplace=True)
    print("Shape after dropping", df.shape)

In [None]:
def integrity_check_3(df):
    """
    Remove, in place, charged trips whose pickup and dropoff times are equal.

    A row is flagged only when pickup_datetime equals dropoff_datetime and
    total_amount is greater than zero. Prints how many rows were flagged and
    the frame's shape before and after.

    Parameters:
    df (pandas.DataFrame): Trip data with "pickup_datetime",
        "dropoff_datetime" and "total_amount" columns.
    """
    zero_duration = df["pickup_datetime"] == df["dropoff_datetime"]
    charged = df["total_amount"] > 0
    flagged = df[zero_duration & charged].index
    print("Number of bad Datapoints for integrity_check_3", len(flagged))
    print("Shape before dropping", df.shape)
    df.drop(index=flagged, inplace=True)
    print("Shape after dropping", df.shape)

In [None]:
def integrity_check_4(df):
    """
    Remove, in place, charged trips recorded with RatecodeID 99.

    A row is flagged only when RatecodeID equals 99 and total_amount is
    greater than zero. Prints how many rows were flagged and the frame's
    shape before and after.

    Parameters:
    df (pandas.DataFrame): Trip data with "RatecodeID" and "total_amount"
        columns.
    """
    unknown_rate = df["RatecodeID"] == 99
    charged = df["total_amount"] > 0
    flagged = df[unknown_rate & charged].index
    print("Number of bad Datapoints for integrity_check_4", len(flagged))
    print("Shape before dropping", df.shape)
    df.drop(index=flagged, inplace=True)
    print("Shape after dropping", df.shape)

In [None]:
def drop_yellow_columns(df):
    """
    Drop, in place, the yellow-taxi columns that are not kept for analysis.

    Names are matched case-insensitively, so both "trip_distance" (as it
    appears in the parquet files) and "Trip_distance" are removed. Listed
    columns that are absent from the frame are skipped rather than raising
    KeyError.

    Parameters:
    df (pandas.DataFrame): Yellow-taxi DataFrame to modify in place.
    """
    unwanted = {
        "vendorid", "trip_distance", "ratecodeid", "store_and_fwd_flag",
        "payment_type", "fare_amount", "extra", "mta_tax",
        "improvement_surcharge", "tip_amount", "tolls_amount", "total_amount",
        "congestion_surcharge", "airport_fee",
    }
    present = [column for column in df.columns if str(column).lower() in unwanted]
    df.drop(columns=present, inplace=True)


In [None]:
def drop_green_columns(df):
    """
    Drop, in place, the green-taxi columns that are not kept for analysis.

    Names are matched case-insensitively, so both "trip_distance" (as it
    appears in the parquet files) and "Trip_distance" are removed. Listed
    columns that are absent from the frame are skipped rather than raising
    KeyError. "ehail_fee" and "congestion_surcharge" are dropped as well, so
    that only the pickup/dropoff times, zones and passenger_count remain.

    Parameters:
    df (pandas.DataFrame): Green-taxi DataFrame to modify in place.
    """
    unwanted = {
        "vendorid", "trip_distance", "ratecodeid", "store_and_fwd_flag",
        "payment_type", "fare_amount", "extra", "mta_tax",
        "improvement_surcharge", "tip_amount", "tolls_amount", "total_amount",
        "trip_type", "ehail_fee", "congestion_surcharge",
    }
    present = [column for column in df.columns if str(column).lower() in unwanted]
    df.drop(columns=present, inplace=True)


In [None]:
def drop_fhv_columns(df):
    """
    Drop, in place, the FHV columns that are not kept for analysis.

    Names are matched case-insensitively, so "dispatching_base_num" (as it
    appears in the parquet files) is removed as well as
    "Dispatching_base_num". Listed columns that are absent from the frame are
    skipped rather than raising KeyError. "Affiliated_base_number" is dropped
    too, so that only the pickup/dropoff times and zones remain.

    Parameters:
    df (pandas.DataFrame): FHV DataFrame to modify in place.
    """
    unwanted = {"dispatching_base_num", "sr_flag", "affiliated_base_number"}
    present = [column for column in df.columns if str(column).lower() in unwanted]
    df.drop(columns=present, inplace=True)
        

In [None]:
def drop_fhvhv_columns(df):
    """
    Drop, in place, the FHVHV columns that are not kept for analysis.

    Names are matched case-insensitively, so "hvfhs_license_num" and
    "dispatching_base_num" (as they appear in the parquet files) are removed
    as well as their capitalised spellings. Listed columns that are absent
    from the frame are skipped rather than raising KeyError.

    Parameters:
    df (pandas.DataFrame): FHVHV DataFrame to modify in place.
    """
    unwanted = {
        "hvfhs_license_num", "dispatching_base_num", "originating_base_num",
        "request_datetime", "on_scene_datetime", "trip_miles", "trip_time",
        "base_passenger_fare", "tolls", "bcf", "sales_tax",
        "congestion_surcharge", "airport_fee", "tips", "driver_pay",
        "shared_request_flag", "shared_match_flag", "access_a_ride_flag",
        "wav_request_flag", "wav_match_flag",
    }
    present = [column for column in df.columns if str(column).lower() in unwanted]
    df.drop(columns=present, inplace=True)



In [None]:
# Load the taxi zone lookup table.
# NOTE(review): the path is an absolute location on one machine — confirm it
# exists before running elsewhere.
taxi_zone = pd.read_csv(
    r"C:\Users\35385\Desktop\CS_Summer_2024\Datasets\taxi_zone_lookup.csv",
    keep_default_na=True,
    delimiter=",",
    skipinitialspace=True,
    encoding="Windows-1252",
)

def valid_zones(df):
    """
    Report and return the distinct LocationIDs whose Borough is "Manhattan".

    Parameters:
    df (pandas.DataFrame): Zone table with "Borough" and "LocationID" columns.

    Returns:
    Array of the unique Manhattan LocationID values, in order of first
    appearance, so callers can use it to filter trips by zone.
    """
    manhattan_df = df[df["Borough"] == "Manhattan"]
    unique_zones = manhattan_df["LocationID"].unique()

    print(f"Number of Unique Zones: {len(unique_zones)}")
    print("List of Unique Zones:", unique_zones)

    # Previously the IDs were only printed, which left them unusable by callers.
    return unique_zones
    

In [45]:
def load_parquet_files(file_list):
    """
    Read each parquet file in turn and collect the resulting DataFrames.

    Parameters:
    file_list (list of str): Paths of the parquet files to read.

    Returns:
    List of pandas DataFrames, one per path, in the same order as file_list.
    """
    return [pd.read_parquet(parquet_path, engine='pyarrow') for parquet_path in file_list]

In [None]:
# Bucket the discovered parquet files by TLC category.
file_categories = {
    "fhv": [],
    "fhvhv": [],
    "green": [],
    "yellow": []
}
# Iterate in sorted order so each category's list is deterministic; glob
# makes no promise about ordering.
for file in sorted(all_files):
    # Classify on the file name only: matching against the whole path would
    # let a directory name such as "fhv_data" capture unrelated files.
    file_name = os.path.basename(file)
    for category in file_categories:
        # The trailing underscore keeps "fhv_" from matching "fhvhv_..." files.
        if file_name.startswith(f"{category}_"):
            file_categories[category].append(file)
            break

# Print the sorted file lists
print("FHV Files:", file_categories["fhv"])
print("FHVHV Files:", file_categories["fhvhv"])
print("Green Files:", file_categories["green"])
print("Yellow Files:", file_categories["yellow"])

In [None]:
# Load every categorised parquet file, one list of DataFrames per TLC category.
fhv_dfs, fhvhv_dfs, green_dfs, yellow_dfs = (
    load_parquet_files(file_categories[category_key])
    for category_key in ("fhv", "fhvhv", "green", "yellow")
)


In [None]:
# Report how many DataFrames were loaded per category, one line each.
print(
    f"FHV DataFrames Loaded: {len(fhv_dfs)}",
    f"FHVHV DataFrames Loaded: {len(fhvhv_dfs)}",
    f"Green DataFrames Loaded: {len(green_dfs)}",
    f"Yellow DataFrames Loaded: {len(yellow_dfs)}",
    sep="\n",
)

# Sanity check: show the head of the first DataFrame of every non-empty category.
for heading, loaded_frames in (
    ("FHV DataFrame Sample:", fhv_dfs),
    ("FHVHV DataFrame Sample:", fhvhv_dfs),
    ("Green DataFrame Sample:", green_dfs),
    ("Yellow DataFrame Sample:", yellow_dfs),
):
    if loaded_frames:
        print(heading, loaded_frames[0].head())