In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

# --- Configuration ---
# Connection settings come from environment variables so that credentials are
# never stored in (or committed with) the notebook. Non-secret settings fall
# back to the local-development defaults; the password has no default and is
# prompted for interactively when DB_PASS is not set.
# SECURITY: a literal password was previously committed in this cell; treat it
# as compromised and rotate it.
DB_USER = os.environ.get("DB_USER", "postgres")
DB_PASS = os.environ.get("DB_PASS") or getpass("PostgreSQL password: ")
DB_HOST = os.environ.get("DB_HOST", "127.0.0.1")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_NAME = os.environ.get("DB_NAME", "nyc_taxi")
TABLE_NAME = "taxi_trips"

# SQLAlchemy connection string. User and password are percent-encoded so that
# characters such as '@', ':' or '/' in a credential cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)
# NOTE: create_engine() only builds the engine object; no connection is opened
# here, so a wrong host or password surfaces on the first query instead.
engine = create_engine(DATABASE_URL)

print("✅ Database engine created. Ready to execute SQL queries.")

✅ Database engine created. Ready to execute SQL queries.


In [2]:
def run_query(sql_query, params=None):
    """Execute a SQL statement and return its result set as a pandas DataFrame.

    Args:
        sql_query: The SQL to run. A plain string is wrapped in
            ``sqlalchemy.text`` so it is executed as a SQLAlchemy statement
            and may contain named bind parameters (``:name``). Any other
            object (e.g. an already-built SQLAlchemy statement) is passed to
            pandas unchanged.
        params: Optional bind-parameter values forwarded to ``pd.read_sql``.
            Use this instead of formatting values into the SQL string.

    Returns:
        The query result as a DataFrame, or ``None`` if execution failed.
        Failures are printed rather than raised, so callers must check for
        ``None`` before using the result.
    """
    try:
        # Wrap raw strings in text(); note that text() interprets ":name" as a
        # bind parameter, while PostgreSQL "::type" casts are left alone.
        statement = text(sql_query) if isinstance(sql_query, str) else sql_query
        # The context manager returns the connection to the pool even when
        # the query raises.
        with engine.connect() as connection:
            df = pd.read_sql(statement, connection, params=params)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the notebook
        # continue; callers guard on the None return value.
        print(f"❌ ERROR executing query:\n{e}")
        return None
    print(f"Query executed successfully. Rows returned: {len(df):,}")
    return df

In [3]:
# Q1: Monthly Trip Volume
# Number of trips and mean total_amount per pickup month ('YYYY-MM'),
# ordered chronologically. The table name comes from the TABLE_NAME config
# constant (a trusted module-level value, not user input).
# NOTE(review): the recorded result contains a few months with only a handful
# of trips (e.g. 2002-12, 2008-12, 2023-04) next to months with millions;
# these look like bad pickup timestamps — consider filtering to the expected
# date range before interpreting the totals.
monthly_volume_query = f"""
SELECT
    TO_CHAR(tpep_pickup_datetime, 'YYYY-MM') AS trip_month,
    COUNT(*) AS total_trips,
    ROUND(AVG(total_amount)::numeric, 2) AS avg_fare
FROM
    {TABLE_NAME}
GROUP BY
    trip_month
ORDER BY
    trip_month;
"""

df_monthly_volume = run_query(monthly_volume_query)
# run_query returns None on failure (it prints the error itself).
if df_monthly_volume is not None:
    print("\n--- Monthly Trip Volume ---")
    print(df_monthly_volume)

Query executed successfully. Rows returned: 10

--- Monthly Trip Volume ---
  trip_month  total_trips  avg_fare
0    2002-12            2     58.63
1    2003-01            2     95.40
2    2008-12            4     35.44
3    2009-01            1     16.40
4    2022-10            3     12.97
5    2022-12           24     26.99
6    2023-01      2873090     27.33
7    2023-02      2722079     27.27
8    2023-03      3177665     28.19
9    2023-04           83     25.84


In [4]:
# Q2: Peak Hour Analysis
# The five pickup_hour values with the most trips, busiest first.
# pickup_hour is used as a secondary sort key so that hours with equal trip
# counts are ordered (and cut off by LIMIT) deterministically.
# The table name comes from the TABLE_NAME config constant (a trusted
# module-level value, not user input).
peak_hour_query = f"""
SELECT
    pickup_hour,
    COUNT(*) AS total_trips
FROM
    {TABLE_NAME}
GROUP BY
    pickup_hour
ORDER BY
    total_trips DESC,
    pickup_hour ASC
LIMIT 5;
"""

df_peak_hours = run_query(peak_hour_query)
# run_query returns None on failure (it prints the error itself).
if df_peak_hours is not None:
    print("\n--- Top 5 Peak Pickup Hours ---")
    print(df_peak_hours)

Query executed successfully. Rows returned: 5

--- Top 5 Peak Pickup Hours ---
   pickup_hour  total_trips
0           18       629354
1           17       601376
2           19       564204
3           16       552451
4           15       550848


In [5]:
# Q3: Weather Impact Analysis
# Trip count, mean trip_duration and mean total_amount, split by whether
# rain_day = 1 ('Rainy Day') or not ('Dry Day').
# NOTE: rows where rain_day is NULL fall through to the ELSE branch and are
# counted as 'Dry Day'.
# An explicit ORDER BY keeps the row order stable between runs; without it
# the database may return the two groups in either order.
# The table name comes from the TABLE_NAME config constant (a trusted
# module-level value, not user input).
weather_impact_query = f"""
SELECT
    CASE 
        WHEN rain_day = 1 THEN 'Rainy Day'
        ELSE 'Dry Day'
    END AS weather_condition,
    COUNT(*) AS total_trips,
    ROUND(AVG(trip_duration)::numeric, 2) AS avg_trip_duration_mins,
    ROUND(AVG(total_amount)::numeric, 2) AS avg_total_fare
FROM
    {TABLE_NAME}
GROUP BY
    weather_condition
ORDER BY
    weather_condition;
"""

df_weather_impact = run_query(weather_impact_query)
# run_query returns None on failure (it prints the error itself).
if df_weather_impact is not None:
    print("\n--- Weather Impact on Trips ---")
    print(df_weather_impact)

Query executed successfully. Rows returned: 2

--- Weather Impact on Trips ---
  weather_condition  total_trips  avg_trip_duration_mins  avg_total_fare
0           Dry Day      5140542                   15.03           27.71
1         Rainy Day      3632411                   15.06           27.50
