# In [None]:
import os
import pandas as pd
import random

# Filesystem locations used by the script: the zone lookup CSV, the folder the
# cleaned CSVs are written to, and the folder holding the raw parquet files.
# NOTE(review): both "C:" paths have no separator right after the drive letter,
# which Windows resolves relative to the current directory on that drive --
# confirm these are the intended locations.
lookup_table_path = r"taxi_zone_lookup.csv"
output_folder = r"C:nyc_taxi_cleaned"
main_data_path = r"C:pythonprojetcs_Dani\nyc_taxi_data"

# Ensure the output folder is present. A failure is only reported here; the
# script keeps going.
try:
    os.makedirs(output_folder, exist_ok=True)
except Exception as err:
    print(f"Error creating output directory: {err}")
else:
    print(f"Output directory created: {output_folder}")

# Load the lookup table that process_data() merges against trip records.
try:
    Zone_table = pd.read_csv(lookup_table_path, encoding='utf-8')
    print("Lookup file successfully loaded.")
except Exception as e:
    print(f"Error loading the lookup file: {e}")
    # Nothing below can work without the lookup table, so stop here.
    # SystemExit is raised directly instead of calling exit(): exit() is a
    # convenience the `site` module installs for interactive sessions and is
    # not guaranteed to exist (or to stop execution immediately) elsewhere.
    raise SystemExit(1)

# Display basic information for initial validation
print("\nLookup Table Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted an extra "None" line.
Zone_table.info()

# Numeric payment_type code -> human-readable label, used by process_data()
# to build the payment_type_description column. Codes 1-6 follow the NYC TLC
# yellow-taxi data dictionary; code 0 is not in that list and is kept labelled
# 'Unknown' as before. Codes missing from this dict become NaN when mapped.
payment_type_mapping = {
    0: 'Unknown',
    1: 'Credit Card',
    2: 'Cash',
    3: 'No Charge',
    4: 'Dispute',
    5: 'Unknown',
    6: 'Voided Trip',
}

# Numeric VendorID -> vendor name, used by process_data() to build the
# vendor_name_description column. Unlisted IDs become NaN when mapped.
vendor_mapping = {
    1: 'Creative Mobile Technologies (CMT)',
    2: 'VeriFone Inc.',
}

# Function to process data from a single file
def process_data(file_path, file_name):
    """Clean one trip-record parquet file and save the result as a CSV.

    Steps: load the needed columns, drop rows with nulls in the essential
    columns, parse the pickup/dropoff timestamps, derive the trip duration in
    minutes, add payment/vendor descriptions, attach pickup and dropoff zone
    names from the module-level ``Zone_table``, drop duplicate rows and the
    raw ID/code columns, keep at most 100,000 rows (fixed-seed random sample),
    and write ``<stem>_cleaned.csv`` into the module-level ``output_folder``.

    Args:
        file_path: Path of the parquet file to read.
        file_name: Base name of that file; its extension is replaced by
            ``_cleaned.csv`` to name the output.

    Returns:
        None. Any exception raised while processing is caught and reported
        with ``print`` so that a bad file does not stop the caller.
    """
    # Rows missing any of these are dropped.
    essential_columns = ['total_amount', 'passenger_count', 'PULocationID', 'DOLocationID']

    # Only these columns are needed for the analysis.
    columns_to_keep = [
        'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
        'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount',
        'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'payment_type', 'VendorID'
    ]

    try:
        print(f"Processing file data: {file_path}...")
        # Load only the columns used below instead of the whole file.
        df = pd.read_parquet(file_path, columns=columns_to_keep)

        # Remove null values in essential columns; the explicit selection pins
        # the column order regardless of the parquet engine.
        df_data = df.dropna(subset=essential_columns)[columns_to_keep]

        # Convert datetime columns (unparseable values become NaT)
        pickup_time = pd.to_datetime(df_data['tpep_pickup_datetime'], errors='coerce')
        dropoff_time = pd.to_datetime(df_data['tpep_dropoff_datetime'], errors='coerce')

        # Build the converted and derived columns with assign(), which returns
        # a new frame, rather than assigning into a frame derived from `df`
        # (that can trigger pandas' SettingWithCopyWarning).
        df_data = df_data.assign(
            tpep_pickup_datetime=pickup_time,
            tpep_dropoff_datetime=dropoff_time,
            # Trip duration in minutes
            trip_duration_minutes=(dropoff_time - pickup_time).dt.total_seconds() / 60,
            # Descriptions for the numeric codes; unmapped codes become NaN
            payment_type_description=df_data['payment_type'].map(payment_type_mapping),
            vendor_name_description=df_data['VendorID'].map(vendor_mapping),
        )

        # Add real names for Pickup and Dropoff locations.
        # NOTE(review): every column of Zone_table is merged in. If it holds
        # columns besides LocationID/Zone/Borough, they come through twice and
        # pandas suffixes the clash with _x/_y -- confirm that is intended.
        df_data = (
            df_data.merge(
                Zone_table,
                left_on='PULocationID',
                right_on='LocationID',
                how='left'
            )
            .rename(columns={"Zone": "Pickup_Location", "Borough": "Pickup_Borough"})
            .drop(columns=['LocationID'])
        )

        df_data = (
            df_data.merge(
                Zone_table,
                left_on='DOLocationID',
                right_on='LocationID',
                how='left'
            )
            .rename(columns={"Zone": "Dropoff_Location", "Borough": "Dropoff_Borough"})
            .drop(columns=['LocationID'])
        )

        # Remove duplicates, then the raw ID/code columns that now have
        # descriptive counterparts
        df_data = df_data.drop_duplicates()
        df_data = df_data.drop(columns=['PULocationID', 'DOLocationID', 'payment_type', 'VendorID'])

        # Select 100,000 random rows (fixed seed keeps the sample reproducible)
        if len(df_data) > 100000:
            df_data = df_data.sample(n=100000, random_state=42)

        # Build the output name from the stem: str.replace() would rewrite
        # every ".parquet" occurrence in the name and leave names with a
        # different extension untouched.
        stem, _ = os.path.splitext(file_name)
        output_file = os.path.join(output_folder, f"{stem}_cleaned.csv")

        # Save the cleaned data
        df_data.to_csv(output_file, index=False)
        print(f"Cleaned data from file {file_name} successfully saved!\n")
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue with the next file.
        print(f"Error processing file {file_name}: {e}")

# Walk the main folder and clean every .parquet file found directly inside it.
for file_name in os.listdir(main_data_path):
    # Anything that is not a parquet file is ignored.
    if not file_name.endswith(".parquet"):
        continue

    file_path = os.path.join(main_data_path, file_name)
    try:
        print(f"Processing file: {file_name}")
        process_data(file_path, file_name)
    except Exception as err:
        # A failure on one file is reported and the loop moves on.
        print(f"Error processing file {file_name}: {err}")

print("Processing completed!")
