In [7]:
import pandas as pd
from sqlalchemy import create_engine, text

# --- Configuration (Ensure these match your previous setup) ---
DB_USER = "postgres"
DB_PASS = "etes1209111"      # Use your confirmed password
DB_HOST = "127.0.0.1"    
DB_PORT = "5432"
DB_NAME = "nyc_taxi"
TABLE_NAME = "taxi_trips"

# SQLAlchemy Connection String
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DATABASE_URL)

print("✅ Database engine created. Ready to execute SQL queries.")

✅ Database engine created. Ready to execute SQL queries.


In [3]:
# A. Handle NULLs in Financial Metrics (Imputation with 0)
sql_coalesce_financial = f"""
UPDATE {TABLE_NAME}
SET 
    tip_amount = COALESCE(tip_amount, 0),
    airport_fee = COALESCE(airport_fee, 0),
    congestion_surcharge = COALESCE(congestion_surcharge, 0),
    fare_amount = COALESCE(fare_amount, 0),
    total_amount = COALESCE(total_amount, 0);
"""
with engine.connect() as connection:
    result = connection.execute(text(sql_coalesce_financial))
    connection.commit()
    print(f"✅ Updated financial NULLs with 0. Rows affected: {result.rowcount}")

✅ Updated financial NULLs with 0. Rows affected: 8772953


In [4]:
# B. Handle NULLs in Categorical Field (payment_type)
sql_coalesce_payment = f"""
UPDATE {TABLE_NAME}
SET payment_type = 99 -- Use 99 for 'Unknown/Missing'
WHERE payment_type IS NULL;
"""
with engine.connect() as connection:
    result = connection.execute(text(sql_coalesce_payment))
    connection.commit()
    print(f"✅ Updated payment_type NULLs with 99. Rows affected: {result.rowcount}")

✅ Updated payment_type NULLs with 99. Rows affected: 0


In [5]:
# C. Outlier Check for Passenger Count (Capping at 6)
sql_outlier_passengers = f"""
UPDATE {TABLE_NAME}
SET passenger_count = 6
WHERE passenger_count > 6;
"""
with engine.connect() as connection:
    result = connection.execute(text(sql_outlier_passengers))
    connection.commit()
    print(f"✅ Passenger count outliers (set > 6 to 6). Rows affected: {result.rowcount}")

✅ Passenger count outliers (set > 6 to 6). Rows affected: 14


In [10]:
# --- Fix 1: Ensure rain_day has no NULLs ---
sql_fix_rain_day = f"""
UPDATE {TABLE_NAME}
SET rain_day = 0
WHERE rain_day IS NULL;
"""

# --- Fix 2: Delete out-of-scope dates ---
sql_fix_date_range = f"""
DELETE FROM {TABLE_NAME}
WHERE pickup_date < '2023-01-01' OR pickup_date >= '2023-04-01';
"""


with engine.connect() as connection:
    print("Executing Fix 1: Updating NULL rain_day...")
    result_rain = connection.execute(text(sql_fix_rain_day))
    print(f"✅ rain_day NULLs fixed. Rows affected: {result_rain.rowcount}")

    print("Executing Fix 2: Deleting pre-2023 and post-Q1 data...")
    result_date = connection.execute(text(sql_fix_date_range))
    print(f"✅ Out-of-scope dates deleted. Rows affected: {result_date.rowcount}")
    
    connection.commit()
    print("\nSource taxi_trips table is now fully cleansed for modeling.")

Executing Fix 1: Updating NULL rain_day...
✅ rain_day NULLs fixed. Rows affected: 119
Executing Fix 2: Deleting pre-2023 and post-Q1 data...
✅ Out-of-scope dates deleted. Rows affected: 119

Source taxi_trips table is now fully cleansed for modeling.


In [6]:
# D. Handle NULLs in Weather Dimensions (Imputation with average)
# We assume max_temp, min_temp, and precipitation might be NULL if weather join failed for some records.
# For simplicity, we'll replace with the *overall average* of each column.
# Note: A more rigorous approach would use the average for that time of day/year.

# First, calculate averages
avg_temp_query = f"""
SELECT AVG(max_temp), AVG(min_temp), AVG(precipitation) FROM {TABLE_NAME}
WHERE max_temp IS NOT NULL;
"""
with engine.connect() as connection:
    avg_results = connection.execute(text(avg_temp_query)).fetchone()
    avg_max_temp = round(avg_results[0], 2)
    avg_min_temp = round(avg_results[1], 2)
    avg_precip = round(avg_results[2], 2)

    sql_coalesce_weather = f"""
    UPDATE {TABLE_NAME}
    SET
        max_temp = COALESCE(max_temp, {avg_max_temp}),
        min_temp = COALESCE(min_temp, {avg_min_temp}),
        precipitation = COALESCE(precipitation, {avg_precip})
    WHERE max_temp IS NULL OR min_temp IS NULL OR precipitation IS NULL;
    """
    result = connection.execute(text(sql_coalesce_weather))
    connection.commit()
    print(f"✅ Updated weather NULLs with overall average (Max:{avg_max_temp}, Min:{avg_min_temp}, Precip:{avg_precip}). Rows affected: {result.rowcount}")


print("\n🎉 Final cleaning and standardization complete on the PostgreSQL server.")

✅ Updated weather NULLs with overall average (Max:9.77, Min:2.54, Precip:2.58). Rows affected: 119

🎉 Final cleaning and standardization complete on the PostgreSQL server.
