In [1]:

import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, text

# --- Configuration ---
CLEAN_DIR = "../data_clean"
DATA_FILE = "taxi_zone_lookup.csv"
TABLE_NAME = "taxi_trips"

# PostgreSQL connection details.
# Each value can be overridden through an environment variable of the same
# name; the defaults below match a local development database.
DB_USER = os.environ.get("DB_USER", "postgres")
DB_HOST = os.environ.get("DB_HOST", "127.0.0.1")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_NAME = os.environ.get("DB_NAME", "nyc_taxi")

# The password deliberately has NO default: a secret must never be stored in
# the notebook source (it would end up in version control and shared copies).
DB_PASS = os.environ.get("DB_PASS")
if DB_PASS is None:
    raise RuntimeError(
        "❌ ERROR: Set the DB_PASS environment variable to your PostgreSQL "
        "password before running this notebook."
    )

# SQLAlchemy connection string.
# User and password are percent-encoded so that characters such as '@', '/',
# ':' or spaces inside the credentials cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote(DB_USER, safe='')}:{quote(DB_PASS, safe='')}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

# Create the SQLAlchemy engine.
# create_engine() loads the database driver here, so a missing driver surfaces
# as ImportError. Re-raise with an actionable hint instead of calling exit(),
# which would shut down the Jupyter kernel.
try:
    engine = create_engine(DATABASE_URL)
except ImportError as err:
    raise ImportError(
        "❌ ERROR: Ensure psycopg2-binary is installed: pip install psycopg2-binary"
    ) from err

print("✅ Successfully created database engine.")

✅ Successfully created database engine.


In [2]:
# --- File Path ---
file_path = os.path.join(CLEAN_DIR, DATA_FILE)
if not os.path.exists(file_path):
    # Raise instead of exit(): exit() would shut down the Jupyter kernel, and
    # an exception reliably stops a "Run All" before the later cells execute.
    raise FileNotFoundError(f"❌ ERROR: Clean data file not found at: {file_path}")

# --- 1. Read the CSV ---
print(f"Reading data from: {file_path}")
try:
    location_df = pd.read_csv(file_path)
    print(f"Read {len(location_df)} rows from {DATA_FILE}")

    # Standardize column names (all lowercase for PostgreSQL).
    # The names are assigned by POSITION, so verify the column count first to
    # fail with a clear message rather than a generic length-mismatch error.
    expected_columns = [
        'locationid',
        'borough',
        'zone',
        'service_zone'
    ]
    if len(location_df.columns) != len(expected_columns):
        raise ValueError(
            f"Expected {len(expected_columns)} columns {expected_columns}, "
            f"found {len(location_df.columns)}: {list(location_df.columns)}"
        )
    location_df.columns = expected_columns

    # 'locationid' is later used as a primary key, so it must be present and
    # unique on every row; check before the staging table is replaced.
    if location_df['locationid'].isna().any() or not location_df['locationid'].is_unique:
        raise ValueError("Column 'locationid' must be non-null and unique.")

    # --- 2. Load to Staging Table in PostgreSQL ---
    staging_table_name = "raw_location_lookup"
    print(f"Loading data into staging table: {staging_table_name}")

    # Use to_sql to directly push the DataFrame to PostgreSQL
    location_df.to_sql(
        name=staging_table_name,
        con=engine,
        if_exists='replace', # Replace the table if it exists
        index=False,         # Don't save the Pandas index as a column
        method='multi'       # Multi-row INSERT statements; fine for a small table
    )

    print(f"✅ Data successfully loaded into '{staging_table_name}'.")

except FileNotFoundError:
    # The file can still vanish between the existence check and the read.
    print(f"❌ ERROR: File not found at {file_path}. Check CLEAN_DIR and DATA_FILE constants.")
    raise
except Exception as e:
    # Report, then re-raise so that downstream cells never run against a
    # missing or stale staging table.
    print(f"❌ An error occurred during load: {e}")
    raise

Reading data from: ../data_clean\taxi_zone_lookup.csv
Read 265 rows from taxi_zone_lookup.csv
Loading data into staging table: raw_location_lookup
✅ Data successfully loaded into 'raw_location_lookup'.


In [3]:
# --- 3. Create the final Dim_Location table ---
# The dimension is rebuilt from scratch out of the staging table on every run.
dim_location_table = "dim_location"

# One script: drop any previous copy, recreate it from staging with
# report-friendly column names, then add the primary key.
sql_create_dim_location = f"""
DROP TABLE IF EXISTS {dim_location_table};
CREATE TABLE {dim_location_table} AS
SELECT
    locationid AS location_key,  -- Renaming the key for clarity
    zone AS location_name,       -- Descriptive name for reports
    borough,
    service_zone                 -- Keeping this for potential future analysis
FROM
    raw_location_lookup;         -- Source is the staging table

-- Set the primary key for optimal join performance in Power BI
ALTER TABLE {dim_location_table} ADD PRIMARY KEY (location_key);
"""

# --- 4. Validation Check ---
# Row count of the freshly built dimension, read back after the commit.
validation_query = f"SELECT COUNT(*) FROM {dim_location_table};"

# Prepare both statements up front; the connection block below only runs them.
create_stmt = text(sql_create_dim_location)
count_stmt = text(validation_query)

with engine.connect() as connection:
    print(f"Executing SQL to create final dimension table: {dim_location_table}")
    connection.execute(create_stmt)
    connection.commit()  # make the new table durable before counting its rows

    count = connection.scalar(count_stmt)
    print(f"✅ Dim_Location created with {count} rows.")

print("\n🎉 The Star Schema is now fully complete (Fact, Dim_Date, Dim_Weather, Dim_Location).")

Executing SQL to create final dimension table: dim_location
✅ Dim_Location created with 265 rows.

🎉 The Star Schema is now fully complete (Fact, Dim_Date, Dim_Weather, Dim_Location).
