# ============================================
# 02_clean_transform.ipynb
# Step 2: Data Cleaning and Transformation
# ============================================

In [7]:
import pandas as pd
import numpy as np
import os
from datetime import timedelta

In [8]:
# ✅ 1. Define input/output paths
RAW_DIR = "../data_raw"
CLEAN_DIR = "../data_clean"
os.makedirs(CLEAN_DIR, exist_ok=True)

In [9]:
# # -----------------------------
# 1️⃣ Load Data
# -----------------------------
print("Loading raw data...")

# Load Taxi Data (combining 3 months)
taxi_files = [f for f in os.listdir(RAW_DIR) if f.startswith('yellow_tripdata') and f.endswith('.csv')]
taxi_dfs = []
for file in taxi_files:
    # Use selected dtypes to reduce memory usage and ensure correct parsing
    dtypes = {
        'VendorID': 'Int64', 'PULocationID': 'Int64', 'DOLocationID': 'Int64',
        'passenger_count': 'Int64', 'fare_amount': 'float64',
        'extra': 'float64', 'mta_tax': 'float64', 'tip_amount': 'float64',
        'tolls_amount': 'float64', 'improvement_surcharge': 'float64',
        'total_amount': 'float64', 'payment_type': 'Int64',
        'congestion_surcharge': 'float64'
    }
    df = pd.read_csv(os.path.join(RAW_DIR, file), dtype=dtypes)
    taxi_dfs.append(df)

taxi_df = pd.concat(taxi_dfs, ignore_index=True)
print(f"Loaded {len(taxi_df):,} taxi trips from {len(taxi_files)} files.")

# Load Weather Data
weather_df = pd.read_csv(os.path.join(RAW_DIR, "weather_2023_Q1.csv"))
print(f"Loaded {len(weather_df)} weather records.")

# OSM Data (For now, we'll skip the complex Geo-Join. It's often slow for full data.
# We'll rely on the existing Location IDs for zone/borough info later.)
# osm_gdf = gpd.read_file(os.path.join(RAW_DIR, "nyc_osm.geojson"))

Loading raw data...


  df = pd.read_csv(os.path.join(RAW_DIR, file), dtype=dtypes)
  df = pd.read_csv(os.path.join(RAW_DIR, file), dtype=dtypes)
  df = pd.read_csv(os.path.join(RAW_DIR, file), dtype=dtypes)


Loaded 9,384,487 taxi trips from 3 files.
Loaded 90 weather records.


In [10]:


# -----------------------------
# 2️⃣ Clean Taxi Data
# -----------------------------
print("Cleaning and transforming taxi data...")

# 2.1 Convert Timestamps
# Ensure correct datetime parsing
taxi_df['tpep_pickup_datetime'] = pd.to_datetime(taxi_df['tpep_pickup_datetime'])
taxi_df['tpep_dropoff_datetime'] = pd.to_datetime(taxi_df['tpep_dropoff_datetime'])

# Extract Date for joining with weather
taxi_df['pickup_date'] = taxi_df['tpep_pickup_datetime'].dt.date
taxi_df['pickup_hour'] = taxi_df['tpep_pickup_datetime'].dt.hour
taxi_df['pickup_dayofweek'] = taxi_df['tpep_pickup_datetime'].dt.dayofweek # Monday=0, Sunday=6

# 2.2 Feature Engineering (Trip Duration)
taxi_df['trip_duration'] = (taxi_df['tpep_dropoff_datetime'] - taxi_df['tpep_pickup_datetime']).dt.total_seconds() / 60 # Duration in minutes

# 2.3 Filter Invalid/Outlier Trips (Critical Cleaning)
initial_count = len(taxi_df)

# a) Remove records with zero or negative duration (impossible trips)
taxi_df = taxi_df[taxi_df['trip_duration'] > 0]
# b) Remove trips lasting over 3 hours (180 mins) or under 1 minute (likely errors)
taxi_df = taxi_df[(taxi_df['trip_duration'] <= 180) & (taxi_df['trip_duration'] >= 1)]
# c) Remove trips with zero or negative fares (impossible business logic)
taxi_df = taxi_df[taxi_df['total_amount'] > 0]
# d) Remove trips with zero passenger count
taxi_df = taxi_df[taxi_df['passenger_count'] > 0]
# e) Remove trips with zero or negative distance
taxi_df = taxi_df[taxi_df['trip_distance'] > 0] 

# Convert 'PULocationID' and 'DOLocationID' to string for consistency in the DB
taxi_df['PULocationID'] = taxi_df['PULocationID'].astype(str)
taxi_df['DOLocationID'] = taxi_df['DOLocationID'].astype(str)

print(f"Filtered out {initial_count - len(taxi_df):,} invalid trips. Remaining: {len(taxi_df):,}")

# Drop unnecessary/redundant columns before merge
cols_to_drop = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'extra',
                'mta_tax', 'tolls_amount', 'improvement_surcharge','Airport_fee']
taxi_df = taxi_df.drop(columns=[col for col in cols_to_drop if col in taxi_df.columns])

Cleaning and transforming taxi data...
Filtered out 611,534 invalid trips. Remaining: 8,772,953


In [11]:
# # -----------------------------
# 3️⃣ Clean Weather Data
# -----------------------------
print("Cleaning and transforming weather data...")

# Select relevant columns: DATE, TMAX (Max Temp), TMIN (Min Temp), PRCP (Precipitation)
weather_cols = ['DATE', 'TMAX', 'TMIN', 'PRCP']
weather_df = weather_df[weather_cols]

# 3.1 Convert Date to correct format for joining
weather_df['DATE'] = pd.to_datetime(weather_df['DATE']).dt.date

# 3.2 Convert units (NOAA data is often in tenths of units)
# Temperature is in tenths of a degree C/F. Assuming F in NYC for simplicity, divide by 10.
# We'll keep it general (Tenths of Units -> Units)
weather_df['TMAX'] = weather_df['TMAX'] / 10
weather_df['TMIN'] = weather_df['TMIN'] / 10

# Precipitation (PRCP) is in tenths of mm/inches. We'll divide by 10.
# Note: For NYC Central Park, it's usually in tenths of mm.
weather_df['PRCP'] = weather_df['PRCP'] / 10

# 3.3 Create a 'Rain_Day' flag
# Assuming a 'rain day' has measurable precipitation (> 0 units)
weather_df['Rain_Day'] = np.where(weather_df['PRCP'] > 0, 1, 0)

# Rename DATE for merge consistency
weather_df = weather_df.rename(columns={'DATE': 'pickup_date',
                                        'TMAX': 'max_temp',
                                        'TMIN': 'min_temp',
                                        'PRCP': 'precipitation'})
print('done')

Cleaning and transforming weather data...
done


In [12]:


# -----------------------------
# 4️⃣ Merge Taxi and Weather Data
# -----------------------------
print("Merging taxi and weather data...")

# Perform an inner join to only keep taxi trips that have matching weather data
final_df = pd.merge(
    taxi_df,
    weather_df,
    on='pickup_date',
    how='left'
)

# Convert pickup_date back to datetime for PostgreSQL consistency and easy querying
final_df['pickup_date'] = pd.to_datetime(final_df['pickup_date'])

print(f"Final merged dataset size: {len(final_df):,}")
print("Final columns:", final_df.columns.tolist())

Merging taxi and weather data...
Final merged dataset size: 8,772,953
Final columns: ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'tip_amount', 'total_amount', 'congestion_surcharge', 'airport_fee', 'pickup_date', 'pickup_hour', 'pickup_dayofweek', 'trip_duration', 'max_temp', 'min_temp', 'precipitation', 'Rain_Day']


In [13]:
# # -----------------------------
# 5️⃣ Save Final Clean Data
# -----------------------------
output_path = os.path.join(CLEAN_DIR, "cleaned_nyc_trips_q1.csv")
final_df.to_csv(output_path, index=False)

print(f"\n✅ Cleaned and transformed data saved to: {output_path}")
print("Ready for Step 7: Load Clean Data to PostgreSQL.")


✅ Cleaned and transformed data saved to: ../data_clean\cleaned_nyc_trips_q1.csv
Ready for Step 7: Load Clean Data to PostgreSQL.


✅ Merged dataset shape: (9177624, 17)
