"""
File for Cleaning Taxi Data of the Green Taxis.
Begin by loading 1 parquet file as pandas dataframe.
Look at each the dataframe, as a csv and through python
Implement Crisp-DM data cleaning methodology -> Data Quality Report, Data Quality Plan

"""

In [15]:
import os
import pandas as pd
import glob

In [16]:
data_dir = os.path.join(os.getcwd(), "Datasets/taxi_parquets")
all_files = glob.glob(os.path.join(data_dir, "*.parquet"))
print("All files found:", all_files)

All files found: []


In [17]:
print("File path for green_2021_01:", r"/data-analytics/Datasets/taxi_parquets/green_2021_01.parquet")

File path for green_2021_01: /data-analytics/Datasets/taxi_parquets/green_2021_01.parquet


In [18]:
""" 
Begin by loading 1 parquet file as pandas dataframe from each of the 4 TLC genres.
Error catching across OSes implemented: cwd, data directory, paths etc.
"""

cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Define the directory where the data is located relative to the current working directory
data_dir = "Datasets/taxi_parquets"
print("Data Directory:", data_dir)

# Define the file paths relative to the data directory
green_2021_01_path = os.path.join(cwd, data_dir, "green_2021_01.parquet")

print("green_2021_01_path:", green_2021_01_path)

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning
Data Directory: Datasets/taxi_parquets
green_2021_01_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\Datasets/taxi_parquets\green_2021_01.parquet


In [19]:
green_2021_01 = pd.read_parquet(green_2021_01_path, engine='pyarrow')

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\35385\\Desktop\\CS_Summer_2024\\Shared_GH\\New-York-App\\data-analytics\\cleaning\\Datasets/taxi_parquets\\green_2021_01.parquet'

In [12]:
""" 
Save dataframe to CSV, for alternative and efficient analysis
"""

directory_path = os.path.join("Datasets", "taxi_other")

# Print target directory path (error catching)
print("Directory path to save CSV files:", directory_path)

# Verify the directory exists
if not os.path.isdir(directory_path):
    raise OSError(f"Directory does not exist: '{directory_path}'")

green_file_path = os.path.join(directory_path, "green_2021_01.csv")

green_2021_01.to_csv(green_file_path, index=False)

Directory path to save CSV files: Datasets\taxi_other


In [45]:
green_2021_01 = pd.read_parquet(green_2021_01_path, engine='pyarrow')

In [14]:
green_2021_01.head(10)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,0.5,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.0,-52.0,0.0,-0.5,0.0,0.0,,-0.3,-52.8,3.0,1.0,0.0
5,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.0,52.0,0.0,0.5,0.0,0.0,,0.3,52.8,2.0,1.0,0.0
6,2,2021-01-01 00:19:14,2021-01-01 00:19:21,N,5.0,265,265,1.0,0.0,180.0,0.0,0.0,36.06,0.0,,0.3,216.36,1.0,2.0,0.0
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.5,0.5,0.5,0.96,0.0,,0.3,5.76,1.0,1.0,0.0
8,2,2021-01-01 00:57:46,2021-01-01 00:57:57,N,1.0,225,225,1.0,0.0,2.5,0.5,0.5,0.0,0.0,,0.3,3.8,2.0,1.0,0.0
9,2,2021-01-01 00:58:32,2021-01-01 01:32:34,N,1.0,225,265,1.0,12.19,38.0,0.5,0.5,2.75,0.0,,0.3,42.05,1.0,1.0,0.0


In [46]:
green_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               76518 non-null  int64         
 1   lpep_pickup_datetime   76518 non-null  datetime64[us]
 2   lpep_dropoff_datetime  76518 non-null  datetime64[us]
 3   store_and_fwd_flag     40471 non-null  object        
 4   RatecodeID             40471 non-null  float64       
 5   PULocationID           76518 non-null  int64         
 6   DOLocationID           76518 non-null  int64         
 7   passenger_count        40471 non-null  float64       
 8   trip_distance          76518 non-null  float64       
 9   fare_amount            76518 non-null  float64       
 10  extra                  76518 non-null  float64       
 11  mta_tax                76518 non-null  float64       
 12  tip_amount             76518 non-null  float64       
 13  t

In [20]:
def renaming_green_to_standard(dfs):
    """ 
    Functions for renaming the columns of a dataset or list of datasets to standard names, which will ease the cleaning process
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            df.rename(columns={
                'lpep_pickup_datetime': 'pickup_datetime', 
                'lpep_dropoff_datetime': 'dropoff_datetime', 
                'PULocationID': 'pickup_zone', 
                'DOLocationID': 'dropoff_zone'
            }, inplace=True)
        else:
            print("Warning: The list contains non-DataFrame elements")

In [None]:
renaming_green_to_standard(green_2021_01)

In [21]:
def convert_float_to_int(dfs):
    """ 
    Function for converting datatypes of specific columns of a DataFrame or list of DataFrames to appropriate types.
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            if "RatecodeID" in df.columns:
                df["RatecodeID"] = df["RatecodeID"].fillna(0).astype("int")
            if "passenger_count" in df.columns:
                df["passenger_count"] = df["passenger_count"].fillna(0).astype("int")
        else:
            print("Warning: The list contains non-DataFrame elements")

In [None]:
convert_float_to_int(green_2021_01)

In [18]:
def duplicated_rows(dfs):
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            duplicate_rows = df[df.duplicated()]
            if not duplicate_rows.empty:
                print("Duplicate rows found:\n", duplicate_rows)
            else:
                print("No duplicate rows found")

duplicated_rows(green_2021_01)

No duplicate rows found


In [25]:
def passenger_counts(df):
    if isinstance(df, pd.DataFrame):
        passenger_count_counts = df['passenger_count'].value_counts()
        if not passenger_count_counts.empty:
            print("Count for each unique value in 'passenger_count' column: in")
            print(passenger_count_counts)

In [None]:
passenger_counts(green_2021_01)

In [27]:
def count_invalid_fares(dfs):
    """
    Count rows where any fare-related column has an invalid value.
    
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):

    # Columns to check for invalid fare amounts, <= 0; Need a fare for a valid trip
            fare_columns = ['fare_amount', 'total_amount']

    # Additional columns to check for negative values, < 0 as "0" is a valid value
            additional_fare_columns = ['extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge']

    # Count rows where any fare-related column has an invalid value
            invalid_fare_counts = df[
                (df[fare_columns] <= 0).any(axis=1) | 
                (df[additional_fare_columns] < 0).any(axis=1)].shape[0]
            print("Number of rows with invalid fares: ", invalid_fare_counts)

            # Find + print rows where with invalid fares
            invalid_fare_rows = df[
            (df[fare_columns] <= 0).any(axis=1) |
            (df[additional_fare_columns] < 0).any(axis=1)]

            # Print 5 sample rows with invalid fares
            sample_rows = invalid_fare_rows.sample(n=5, random_state=42)
            print("5 Sample Rows with Invalid Fares:")
            print(sample_rows)

In [None]:
count_invalid_fares(green_2021_01)

In [22]:
def time_travel(dfs):    
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            time_travel = df[df['pickup_datetime'] > df['dropoff_datetime']].shape[0]
            print("Did anyone Time Travel?")
            print(time_travel)

            # Find + print rows where pickup time is after dropoff time
            time_travel_rows = df[df['pickup_datetime'] > df['dropoff_datetime']]
            if not time_travel_rows.empty:
                print("One of the rows where time travel occurred:")
                print(time_travel_rows.iloc[0])
            else:
                print("No rows where time travel occurred.")            
        else:
            print("Warning: The list contains non-DataFrame elements")

time_travel(green_2021_01)

Did anyone Time Travel?
0
No rows where time travel occurred.


In [26]:
def immovable_objects(dfs):    
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            negative_distances = df[df['trip_distance'] <= 0].shape[0]
            print("Travelling no distances?")
            print(negative_distances)

            # Find + print rows where pickup time is after dropoff time
            negative_distances_rows = df[df['trip_distance'] <= 0]
            if not negative_distances_rows.empty:
                print("One of the rows of the immovable objects:")
                print(negative_distances_rows.iloc[19]) #change this number for other examples
            else:
                print("No rows where time travel occurred.")
        else:
            print("Warning: The list contains non-DataFrame elements")

immovable_objects(green_2021_01)

Travelling no distances?
2553
One of the rows of the immovable objects:
VendorID                                   1
pickup_datetime          2021-01-01 04:24:35
dropoff_datetime         2021-01-01 04:33:20
store_and_fwd_flag                         N
RatecodeID                                 1
pickup_zone                              248
dropoff_zone                             169
passenger_count                            1
trip_distance                            0.0
fare_amount                             17.2
extra                                    0.0
mta_tax                                  0.5
tip_amount                               0.0
tolls_amount                             0.0
ehail_fee                               None
improvement_surcharge                    0.3
total_amount                            18.0
payment_type                             1.0
trip_type                                1.0
congestion_surcharge                     0.0
Name: 94, dtype: object


In [27]:
def check_dispute_payments(dfs):
    """
    Count rows where payment_type is 4 (Dispute) and print sample rows.
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame
    
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            # Count rows where payment_type is 4 (Dispute)
            dispute_count = df[df['payment_type'] == 4].shape[0]
            print("Number of rows with payment_type 4 (Dispute):", dispute_count)

            # Find rows where payment_type is 4 (Dispute)
            dispute_rows = df[df['payment_type'] == 4]

            # Print 5 sample rows with payment_type 4 (Dispute)
            sample_rows = dispute_rows.sample(n=5, random_state=42)
            print("5 Sample Rows with payment_type 4 (Dispute):")
            print(sample_rows)

check_dispute_payments(green_2021_01)

Number of rows with payment_type 4 (Dispute): 79
5 Sample Rows with payment_type 4 (Dispute):
       VendorID     pickup_datetime    dropoff_datetime store_and_fwd_flag  \
17697         2 2021-01-14 15:42:42 2021-01-14 15:43:23                  N   
433           1 2021-01-01 16:03:43 2021-01-01 16:13:37                  N   
15814         1 2021-01-13 11:16:16 2021-01-13 11:34:36                  N   
17708         1 2021-01-14 15:04:16 2021-01-14 15:10:23                  N   
13498         2 2021-01-11 18:16:50 2021-01-11 18:17:24                  N   

       RatecodeID  pickup_zone  dropoff_zone  passenger_count  trip_distance  \
17697           1          244           244                1           0.04   
433             1          152            41                1           1.60   
15814           1          244           241                1           4.40   
17708           5          166           166                1           0.80   
13498           1           75       

In [32]:
def check_ehail_fee_values(dfs):
    """
    Print all unique values in the 'ehail_fee' column and their counts.
    Also, print the number of null values in the 'ehail_fee' column.
    """
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame

    for df in dfs:
        if isinstance(df, pd.DataFrame):
            # Get the unique values and their counts in the 'ehail_fee' column
            unique_values_counts = df['ehail_fee'].value_counts(dropna=False)
            
            # Print the unique values and their counts
            print("Unique values in the 'ehail_fee' column and their counts:")
            print(unique_values_counts)
            
            # Count and print the number of null values in the 'ehail_fee' column
            null_count = df['ehail_fee'].isnull().sum()
            print(f"Number of null values in the 'ehail_fee' column: {null_count}")

check_ehail_fee_values(green_2021_01)


Unique values in the 'ehail_fee' column and their counts:
ehail_fee
None    76518
Name: count, dtype: int64
Number of null values in the 'ehail_fee' column: 76518


In [30]:
print("Current Working Directory:", cwd)

taxi_zone_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_other")

# Define the directory where the data is located relative to the current working directory
print("Taxi Zone CSV Directory:", taxi_zone_dir)

# Define the file path relative to the data directory
taxi_zone_path = os.path.join(cwd, taxi_zone_dir, "taxi_zone_lookup.csv")

taxi_zone = pd.read_csv(taxi_zone_path, keep_default_na=True, delimiter=",", skipinitialspace=True, encoding="Windows-1252")

def valid_zones_1(df):
    manhattan_df = df[df["Borough"] == "Manhattan"]
    unique_zones = manhattan_df["LocationID"].unique()
    
    print(f"Number of Unique Zones: {len(unique_zones)}")
    print("List of Unique Zones:", unique_zones)

valid_zones_1(taxi_zone)
    

Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning
Taxi Zone CSV Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_other
Number of Unique Zones: 69
List of Unique Zones: [  4  12  13  24  41  42  43  45  48  50  68  74  75  79  87  88  90 100
 103 104 105 107 113 114 116 120 125 127 128 137 140 141 142 143 144 148
 151 152 153 158 161 162 163 164 166 170 186 194 202 209 211 224 229 230
 231 232 233 234 236 237 238 239 243 244 246 249 261 262 263]


In [31]:
# Define a function to get unique zones for Manhattan
def get_manhattan_zones(df):
    manhattan_df = df[df["Borough"] == "Manhattan"]
    unique_zones = manhattan_df["LocationID"].unique()
    return set(unique_zones)

# Get the unique Manhattan zones from the taxi_zone DataFrame
manhattan_zones = get_manhattan_zones(taxi_zone)

In [32]:
# Define the function to check pickup and dropoff zones
def check_zones(df, manhattan_zones):
    # Check if both pickup_zone and dropoff_zone are not in manhattan_zones
    invalid_zones = df[~df["pickup_zone"].isin(manhattan_zones) & ~df["dropoff_zone"].isin(manhattan_zones)]
    
    print(f"Invalid zones count: {invalid_zones.shape[0]}")
    
    if not invalid_zones.empty:
        print("Examples of rows with invalid zones:")
        print(invalid_zones.head())  # Print first few invalid rows

In [51]:
check_zones(green_2021_01, manhattan_zones)


Invalid zones count: 40376
Examples of rows with invalid zones:
   VendorID     pickup_datetime    dropoff_datetime store_and_fwd_flag  \
4         2 2021-01-01 00:16:36 2021-01-01 00:16:40                  N   
5         2 2021-01-01 00:16:36 2021-01-01 00:16:40                  N   
6         2 2021-01-01 00:19:14 2021-01-01 00:19:21                  N   
8         2 2021-01-01 00:57:46 2021-01-01 00:57:57                  N   
9         2 2021-01-01 00:58:32 2021-01-01 01:32:34                  N   

   RatecodeID  pickup_zone  dropoff_zone  passenger_count  trip_distance  \
4           2          265           265                3           0.00   
5           2          265           265                3           0.00   
6           5          265           265                1           0.00   
8           1          225           225                1           0.00   
9           1          225           265                1          12.19   

   fare_amount  extra  mta_tax  ti

In [34]:
def verifying_invalid_zones(df, manhattan_zones, taxi_zone_csv):
    # Identify invalid rows where both pickup_zone and dropoff_zone are not in manhattan_zones
    invalid_zones = df[~df["pickup_zone"].isin(manhattan_zones) & ~df["dropoff_zone"].isin(manhattan_zones)]
    
    # If there are no invalid rows, print and return
    if invalid_zones.empty:
        print("No invalid zones found.")
        return
    
    # Merge invalid zones with taxi_zone_csv to get borough information for both pickup_zone and dropoff_zone
    invalid_zones_merged = invalid_zones.merge(taxi_zone_csv, left_on="pickup_zone", right_on="LocationID", how="left", suffixes=('_pickup', '_dropoff'))
    invalid_zones_merged = invalid_zones_merged.merge(taxi_zone_csv, left_on="dropoff_zone", right_on="LocationID", how="left", suffixes=('_pickup', '_dropoff'))

    # Concatenate Borough columns for analysis
    invalid_zones_merged["Borough_combined"] = invalid_zones_merged["Borough_pickup"] + " (pickup), " + invalid_zones_merged["Borough_dropoff"] + " (dropoff)"
    
    # Count the combined Borough information
    combined_borough_counts = invalid_zones_merged["Borough_combined"].value_counts()

    print("Invalid Rows Borough Counts:")
    print(combined_borough_counts)

manhattan_zones = get_manhattan_zones(taxi_zone)

In [None]:
verifying_invalid_zones(green_2021_01, manhattan_zones, taxi_zone)

In [53]:
def drop_green_invalid_rows(dfs, manhattan_zones, taxi_zone_csv):
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame

    cleaned_dfs = []
    
    for df in dfs:
        if isinstance(df, pd.DataFrame):
            # Drop duplicate rows
            df = df.drop_duplicates()
            
            # Drop "airport_fee" column due to missing values
            if "ehail_fee" in df.columns:
                df = df.drop(columns=["ehail_fee"])
            
            # Drop rows where passenger_count == 0 or >= 6
            df = df[(df["passenger_count"] > 0) & (df["passenger_count"] < 6)]
            
            # Drop rows where fare_amount or total_amount <= 0
            df = df[(df["fare_amount"] > 0) & (df["total_amount"] > 0)]
            
            # Drop rows where extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, or congestion_surcharge < 0
            df = df[(df["extra"] >= 0) & (df["mta_tax"] >= 0) & (df["tip_amount"] >= 0) & 
                    (df["tolls_amount"] >= 0) & (df["improvement_surcharge"] >= 0) & 
                    (df["congestion_surcharge"] >= 0)]
            
            # Drop rows where pickup_datetime == dropoff_datetime
            df = df[df["pickup_datetime"] != df["dropoff_datetime"]]
            
            # Drop rows where trip_distance <= 0
            df = df[df["trip_distance"] > 0]
            
            # Drop rows where RateCodeID != 1-6
            df = df[df["RatecodeID"].isin([1, 2, 3, 4, 5, 6])]
            
            # Drop rows where payment_type == 4
            df = df[df["payment_type"] != 4]
            
            # Drop rows where both pickup_zone and dropoff_zone are not Manhattan zones
            df = df[df["pickup_zone"].isin(manhattan_zones) | df["dropoff_zone"].isin(manhattan_zones)]
            
            # Append the cleaned DataFrame to the list
            cleaned_dfs.append(df)
        else:
            print("Warning: The list contains non-DataFrame elements")
    
    # Return the cleaned DataFrame(s)
    if len(cleaned_dfs) == 1:
        return cleaned_dfs[0]
    else:
        return cleaned_dfs

In [54]:
manhattan_zones = get_manhattan_zones(taxi_zone)

# Call the verifying_invalid_zones function
green_2021_01 = drop_green_invalid_rows(green_2021_01, manhattan_zones, taxi_zone)

In [55]:
green_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25598 entries, 0 to 40470
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               25598 non-null  int64         
 1   pickup_datetime        25598 non-null  datetime64[us]
 2   dropoff_datetime       25598 non-null  datetime64[us]
 3   store_and_fwd_flag     25598 non-null  object        
 4   RatecodeID             25598 non-null  int32         
 5   pickup_zone            25598 non-null  int64         
 6   dropoff_zone           25598 non-null  int64         
 7   passenger_count        25598 non-null  int32         
 8   trip_distance          25598 non-null  float64       
 9   fare_amount            25598 non-null  float64       
 10  extra                  25598 non-null  float64       
 11  mta_tax                25598 non-null  float64       
 12  tip_amount             25598 non-null  float64       
 13  tolls_

In [56]:
def drop_green_columns(df):
    columns_to_drop = ["VendorID", "trip_distance", "RatecodeID", "store_and_fwd_flag", "payment_type", 
                       "fare_amount", "extra", "mta_tax", "improvement_surcharge", "tip_amount", 
                       "tolls_amount", "total_amount", "congestion_surcharge", "trip_type"]
    
    # Drop only the columns that exist in the DataFrame
    existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    df.drop(columns=existing_columns_to_drop, inplace=True)

    # Return the modified DataFrame
    return df

In [58]:
drop_green_columns(green_2021_01)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_zone,dropoff_zone,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1
3,2020-12-31 23:57:51,2021-01-01 00:04:56,168,75,1
10,2021-01-01 00:31:14,2021-01-01 00:55:07,244,244,2
...,...,...,...,...,...
40463,2021-01-31 23:45:27,2021-01-31 23:56:04,74,244,1
40467,2021-01-31 23:13:36,2021-01-31 23:17:51,75,238,1
40468,2021-01-31 23:46:45,2021-01-31 23:57:08,41,263,1
40469,2021-01-31 23:42:17,2021-01-31 23:48:19,75,75,1


In [59]:
green_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25598 entries, 0 to 40470
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   pickup_datetime   25598 non-null  datetime64[us]
 1   dropoff_datetime  25598 non-null  datetime64[us]
 2   pickup_zone       25598 non-null  int64         
 3   dropoff_zone      25598 non-null  int64         
 4   passenger_count   25598 non-null  int32         
dtypes: datetime64[us](2), int32(1), int64(2)
memory usage: 1.1 MB


In [60]:
def drop_missing_values(dfs):
    """
    Drops any rows with missing values from the DataFrame.
    
    Parameters:
        df (pd.DataFrame): The DataFrame to clean.
        
    Returns:
        pd.DataFrame: The cleaned DataFrame.
        int: The number of rows that were dropped.
    """
    
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]  # Convert single DataFrame to a list of one DataFrame

    cleaned_dfs = []
    
    for df in dfs:
        if isinstance(df, pd.DataFrame):    # Count the number of rows before dropping missing values
            initial_row_count = df.shape[0]
    
            # Drop rows with missing values
            df = df.dropna()
    
            # Count the number of rows after dropping missing values
            final_row_count = df.shape[0]
    
            # Calculate the number of rows that were dropped
            rows_dropped = initial_row_count - final_row_count
        print(f"Number of rows dropped: {rows_dropped}")
        return df

In [61]:
drop_missing_values(green_2021_01)

Number of rows dropped: 0


Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_zone,dropoff_zone,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1
3,2020-12-31 23:57:51,2021-01-01 00:04:56,168,75,1
10,2021-01-01 00:31:14,2021-01-01 00:55:07,244,244,2
...,...,...,...,...,...
40463,2021-01-31 23:45:27,2021-01-31 23:56:04,74,244,1
40467,2021-01-31 23:13:36,2021-01-31 23:17:51,75,238,1
40468,2021-01-31 23:46:45,2021-01-31 23:57:08,41,263,1
40469,2021-01-31 23:42:17,2021-01-31 23:48:19,75,75,1


In [65]:
def calculate_zone_busy(green_df):
    # Combine pickup and dropoff data into a single DataFrame; Renamed and combined into a single datetime column
    pickup_data = green_df[['pickup_datetime', 'passenger_count', 'pickup_zone']].rename(columns={'pickup_datetime': 'datetime', 'pickup_zone': 'zone'})
    dropoff_data = green_df[['dropoff_datetime', 'passenger_count', 'dropoff_zone']].rename(columns={'dropoff_datetime': 'datetime', 'dropoff_zone': 'zone'})
    combined_data = pd.concat([pickup_data, dropoff_data])
    
    # Round datetime to the nearest hour
    combined_data['datetime'] = combined_data['datetime'].dt.round('h')
    
    # Group by hour and zone, summing passenger counts
    zone_busy_df = combined_data.groupby(['datetime', 'zone'])['passenger_count'].sum().reset_index()
    
    return zone_busy_df

In [66]:
green_2021_01 = calculate_zone_busy(green_2021_01)

In [67]:
green_2021_01.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15898 entries, 0 to 15897
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   datetime         15898 non-null  datetime64[us]
 1   zone             15898 non-null  int64         
 2   passenger_count  15898 non-null  int32         
dtypes: datetime64[us](1), int32(1), int64(1)
memory usage: 310.6 KB


In [68]:
green_2021_01.head(10)

Unnamed: 0,datetime,zone,passenger_count
0,2021-01-01,41,1
1,2021-01-01,42,1
2,2021-01-01,43,1
3,2021-01-01,74,2
4,2021-01-01,75,4
5,2021-01-01,116,3
6,2021-01-01,151,1
7,2021-01-01,152,1
8,2021-01-01,166,2
9,2021-01-01,168,1


In [69]:
directory_path = os.path.join("Datasets", "taxi_other")


# Define the file path
file_path = os.path.join(directory_path, "green_2021_01_cleaned.csv")

# Save the DataFrame to CSV
green_2021_01.to_csv(file_path, index=False)

print("DataFrame saved to:", file_path)

DataFrame saved to: Datasets\taxi_other\green_2021_01_cleaned.csv


In [10]:
date_range = pd.date_range(start='2021-01', end='2024-04', freq='ME')
print("Date range:", date_range)

Date range: DatetimeIndex(['2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29', '2024-03-31'],
              dtype='datetime64[ns]', freq='ME')


In [11]:
data_dir = os.path.join(os.getcwd(), "..", "Datasets", "taxi_parquets")
print(f"Data directory: {data_dir}")

# Check if the directory exists
if not os.path.exists(data_dir):
    print(f"Directory {data_dir} does not exist")
else:
    # List all files in the directory to check for existence and naming
    all_files_in_dir = os.listdir(data_dir)
    print(f"Files in directory {data_dir}: {all_files_in_dir}")

all_files = []


Data directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets
Files in directory c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets: ['fhvhv_2021_01.parquet', 'fhvhv_2021_02.parquet', 'fhvhv_2021_03.parquet', 'fhvhv_2021_04.parquet', 'fhvhv_2021_05.parquet', 'fhvhv_2021_06.parquet', 'fhvhv_2021_07.parquet', 'fhvhv_2021_08.parquet', 'fhvhv_2021_09.parquet', 'fhvhv_2021_10.parquet', 'fhvhv_2021_11.parquet', 'fhvhv_2021_12.parquet', 'fhvhv_2022_01.parquet', 'fhvhv_2022_02.parquet', 'fhvhv_2022_03.parquet', 'fhvhv_2022_04.parquet', 'fhvhv_2022_05.parquet', 'fhvhv_2022_06.parquet', 'fhvhv_2022_07.parquet', 'fhvhv_2022_08.parquet', 'fhvhv_2022_09.parquet', 'fhvhv_2022_10.parquet', 'fhvhv_2022_11.parquet', 'fhvhv_2022_12.parquet', 'fhvhv_2023_01.parquet', 'fhvhv_2023_02.parquet', 'fhvhv_2023_03.parquet', 'fhvhv_2023_04.parquet', 'fhvhv_2023_05.parquet', 'fhvhv_2023_06.p

In [12]:
cwd = os.getcwd()
print("Current Working Directory:", cwd)

# Define the file paths relative to the data directory
green_2024_03_path = os.path.join(cwd, data_dir, "green_2024_03.parquet")

print("green_2024_03_path:", green_2024_03_path)



Current Working Directory: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning
green_2024_03_path: c:\Users\35385\Desktop\CS_Summer_2024\Shared_GH\New-York-App\data-analytics\cleaning\..\Datasets\taxi_parquets\green_2024_03.parquet


In [13]:
green_2024_03 = pd.read_parquet(green_2024_03_path, engine='pyarrow')

In [22]:
renaming_green_to_standard(green_2024_03)

In [23]:
convert_float_to_int(green_2024_03)

In [24]:
green_2024_03.head(10)

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,store_and_fwd_flag,RatecodeID,pickup_zone,dropoff_zone,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1,129,226,1,1.72,12.8,1.0,0.5,3.06,0.0,,1.0,18.36,1.0,1.0,0.0
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1,130,218,1,3.25,17.7,1.0,0.5,0.0,0.0,,1.0,20.2,2.0,1.0,0.0
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1,255,107,2,4.58,23.3,1.0,0.5,3.5,0.0,,1.0,32.05,1.0,1.0,2.75
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1,181,71,1,0.0,22.5,0.0,1.5,0.0,0.0,,1.0,24.0,1.0,1.0,0.0
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1,95,135,1,1.15,8.6,1.0,0.5,1.0,0.0,,1.0,12.1,1.0,1.0,0.0
5,2,2024-03-01 00:41:20,2024-03-01 00:57:00,N,1,80,7,2,6.78,28.9,1.0,0.5,9.42,0.0,,1.0,40.82,1.0,1.0,0.0
6,2,2024-03-01 00:47:47,2024-03-01 01:00:53,N,1,42,233,1,6.19,26.1,1.0,0.5,0.0,0.0,,1.0,31.1,2.0,1.0,2.5
7,2,2024-03-01 00:44:48,2024-03-01 01:07:13,N,1,36,195,1,6.26,28.9,1.0,0.5,9.42,0.0,,1.0,40.82,1.0,1.0,0.0
8,2,2024-03-01 00:32:39,2024-03-01 00:38:57,N,1,75,263,1,0.81,7.9,1.0,0.5,0.73,0.0,,1.0,11.13,1.0,1.0,0.0
9,2,2024-03-01 00:07:41,2024-03-01 00:14:12,N,1,179,179,2,1.15,8.6,1.0,0.5,2.22,0.0,,1.0,13.32,1.0,1.0,0.0


In [26]:
passenger_counts(green_2024_03)

Count for each unique value in 'passenger_count' column: in
passenger_count
1    46154
2     5547
0     2656
5     1347
6     1006
3      544
4      195
9        3
8        3
7        2
Name: count, dtype: int64


In [28]:
count_invalid_fares(green_2024_03)

Number of rows with invalid fares:  223
5 Sample Rows with Invalid Fares:
       VendorID     pickup_datetime    dropoff_datetime store_and_fwd_flag  \
1685          2 2024-03-01 19:54:04 2024-03-01 19:58:00                  N   
20271         2 2024-03-12 09:25:46 2024-03-12 09:29:40                  N   
27722         2 2024-03-15 23:15:20 2024-03-15 23:19:08                  N   
34812         2 2024-03-20 09:29:35 2024-03-20 09:32:32                  N   
54117         2 2024-03-31 03:11:40 2024-03-31 03:11:55                  N   

       RatecodeID  pickup_zone  dropoff_zone  passenger_count  trip_distance  \
1685            1           74            75                2           0.51   
20271           1           74            42                1           0.45   
27722           1           33            52                1           0.79   
34812           1           74            74                1           0.06   
54117           1          129            82             

In [33]:
check_zones(green_2024_03, manhattan_zones)


Invalid zones count: 22441
Examples of rows with invalid zones:
   VendorID     pickup_datetime    dropoff_datetime store_and_fwd_flag  \
0         2 2024-03-01 00:10:52 2024-03-01 00:26:12                  N   
1         2 2024-03-01 00:22:21 2024-03-01 00:35:15                  N   
3         1 2024-03-01 00:02:00 2024-03-01 00:23:45                  N   
4         2 2024-03-01 00:16:45 2024-03-01 00:23:25                  N   
5         2 2024-03-01 00:41:20 2024-03-01 00:57:00                  N   

   RatecodeID  pickup_zone  dropoff_zone  passenger_count  trip_distance  \
0           1          129           226                1           1.72   
1           1          130           218                1           3.25   
3           1          181            71                1           0.00   
4           1           95           135                1           1.15   
5           1           80             7                2           6.78   

   fare_amount  extra  mta_tax  ti

In [35]:
verifying_invalid_zones(green_2024_03, manhattan_zones, taxi_zone)

Invalid Rows Borough Counts:
Borough_combined
Queens (pickup), Queens (dropoff)             13854
Brooklyn (pickup), Brooklyn (dropoff)          6435
Queens (pickup), Brooklyn (dropoff)             499
Bronx (pickup), Bronx (dropoff)                 460
Brooklyn (pickup), Queens (dropoff)             451
Queens (pickup), Unknown (dropoff)              212
Unknown (pickup), Unknown (dropoff)             118
Queens (pickup), Bronx (dropoff)                 58
Brooklyn (pickup), Unknown (dropoff)             55
Bronx (pickup), Queens (dropoff)                 45
Bronx (pickup), Brooklyn (dropoff)               31
Brooklyn (pickup), Bronx (dropoff)               26
Bronx (pickup), Unknown (dropoff)                22
Unknown (pickup), Queens (dropoff)               10
Brooklyn (pickup), EWR (dropoff)                 10
Brooklyn (pickup), Staten Island (dropoff)        3
Staten Island (pickup), Brooklyn (dropoff)        2
Unknown (pickup), Brooklyn (dropoff)              2
Unknown (pickup), 