In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("Trips_2018.csv")

#Remove unwanted index column if it exists
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

#Convert start and stop times to proper datetime format
df["starttime"] = pd.to_datetime(df["starttime"], errors="coerce")
df["stoptime"]  = pd.to_datetime(df.get("stoptime"), errors="coerce")


#Create a new column 'start_hour' — the trip’s start time rounded down to the hour (used later to join with hourly weather data)
df["start_hour"] = df["starttime"].dt.floor("H")

#Compute trip duration in minutes
df["trip_duration_min"] = (
    df["tripduration"]/60.0
    if "tripduration" in df.columns
    else (df["stoptime"] - df["starttime"]).dt.total_seconds()/60
)

#Find a representative latitude and longitude for weather data Using median coordinates ensures one central location (e.g., NYC center)
LAT, LON = df["start_station_latitude"].median(), df["start_station_longitude"].median()

#Find the overall date range of the dataset for API query
START = df["start_hour"].min().date().isoformat()
END   = df["start_hour"].max().date().isoformat()

#Print a quick summary
print(f"Trips: {len(df)}  |  Date span: {START} → {END}")

  df["start_hour"] = df["starttime"].dt.floor("H")


Trips: 17548339  |  Date span: 2018-01-01 → 2018-12-31


In [3]:
#  === Fetch historical hourly weather data using Open-Meteo API ===

# Base API endpoint (ERA5 dataset provides global historical weather)
url = "https://archive-api.open-meteo.com/v1/era5"

# Define query parameters:
params = {
    "latitude": LAT,
    "longitude": LON,
    "start_date": START,
    "end_date": END,
    "timezone": "America/New_York",
    "hourly": [
        "temperature_2m",         # Air temperature (°C)
        "apparent_temperature",   # Feels-like temperature (°C)
        "rain",                   # Rainfall (mm)
        "snowfall",               # Snowfall (mm)
        "wind_speed_10m",         # Wind speed at 10m height (m/s)
        "relative_humidity_2m",   # Humidity (%)
        "cloud_cover",            # Cloud cover (%)
        "visibility"              # Horizontal visibility (m)
    ]
}

#Send GET request to the API and convert JSON response to DataFrame
wx = pd.DataFrame(requests.get(url, params=params, timeout=30).json()["hourly"])

# Convert time column to datetime format and rename it to 'start_hour'. This aligns weather timestamps with the hourly trip data
wx["start_hour"] = pd.to_datetime(wx["time"])

#  Remove the old 'time' column (since it's now renamed)
wx = wx.drop(columns=["time"])

# Convert all numeric columns to float32 for memory efficiency
wx = wx.astype({c: "float32" for c in wx.columns if c != "start_hour"})

#Print how many hourly weather records were retrieved
print("Weather rows:", len(wx))


Weather rows: 8760


In [4]:
# === Merge trip data with hourly weather data ===

# Set 'start_hour' as index in weather DataFrame (for faster joins). Each weather record represents one hour of conditions at the selected location.
wx = wx.set_index("start_hour")

#Join weather information to each bike trip based on matching 'start_hour'
dfm = df.join(wx, on="start_hour", how="left")



In [5]:
# === Feature Engineering: Add new weather-based features to the original dataset ===

# Convert raw temperature (°C)
dfm["temp_celsius"] = dfm["temperature_2m"]

#Group temperature into descriptive categories
dfm["temp_category"] = pd.cut(
    dfm["temp_celsius"],
    bins=[-99, 0, 10, 20, 25, 30, 99],
    labels=["freezing", "cold", "cool", "comfortable", "warm", "hot"]
)

# Flag dry hours (1 = dry, 0 = raining/snowing)
dfm["is_dry"] = ((dfm["rain"].fillna(0) + dfm["snowfall"].fillna(0)) == 0).astype("int8")

#Convert wind speed (m/s → km/h)
dfm["wind_kmh"] = dfm["wind_speed_10m"] * 3.6

#Categorize sky conditions based on % cloud cover
dfm["sky_condition"] = pd.cut(
    dfm["cloud_cover"],
    bins=[-1, 25, 50, 75, 100.1],
    labels=["clear", "partly_cloudy", "cloudy", "overcast"]
)

#Convert visibility from meters → kilometers
dfm["visibility_km"] = dfm["visibility"] / 1000.0

#Calculate "Cycling Score" (0–100)
#Higher score = better weather for cycling
dfm["cycling_score"] = (
    np.exp(-((dfm["temp_celsius"] - 20) / 10) ** 2) * 40   # ideal temperature comfort
    + dfm["is_dry"] * 30                                   # dry weather adds points
    + (dfm["wind_kmh"] < 20).astype("int8") * 20           # calm wind conditions
    + (dfm["visibility_km"] > 5).astype("int8") * 10       # clear visibility
)

print(f"Total columns now: {dfm.shape[1]}")

dfm.head()


Total columns now: 30


Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_latitude,start_station_longitude,end_station_id,end_station_latitude,end_station_longitude,bikeid,...,relative_humidity_2m,cloud_cover,visibility,temp_celsius,temp_category,is_dry,wind_kmh,sky_condition,visibility_km,cycling_score
0,970,2018-01-01 13:50:57.434,2018-01-01 14:07:08.186,72.0,40.767272,-73.993929,505.0,40.749013,-73.988484,31956,...,30.0,0.0,,-7.8,freezing,1,73.439995,clear,,30.017607
1,723,2018-01-01 15:33:30.182,2018-01-01 15:45:33.341,72.0,40.767272,-73.993929,3255.0,40.750585,-73.994685,32536,...,24.0,55.0,,-6.6,freezing,1,70.919998,cloudy,,30.033819
2,496,2018-01-01 15:39:18.337,2018-01-01 15:47:35.172,72.0,40.767272,-73.993929,525.0,40.755942,-74.002116,16069,...,24.0,55.0,,-6.6,freezing,1,70.919998,cloudy,,30.033819
3,306,2018-01-01 15:40:13.372,2018-01-01 15:45:20.191,72.0,40.767272,-73.993929,447.0,40.763707,-73.985162,31781,...,24.0,55.0,,-6.6,freezing,1,70.919998,cloudy,,30.033819
4,306,2018-01-01 18:14:51.568,2018-01-01 18:19:57.642,72.0,40.767272,-73.993929,3356.0,40.774667,-73.984706,30319,...,39.0,23.0,,-9.8,freezing,1,26.639999,clear,,30.005564
