In [3]:
import pandas as pd
import numpy as np
import requests
import os

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Create a folder to store raw data
os.makedirs('data/raw', exist_ok=True)

# Base URL of CitiBike 2022 data
base_url = 'https://s3.amazonaws.com/tripdata/'

# List of 2022 CSV filenames (monthly)
files = [
    "202201-citibike-tripdata.csv",
    "202202-citibike-tripdata.csv",
    "202203-citibike-tripdata.csv",
    "202204-citibike-tripdata.csv",
    "202205-citibike-tripdata.csv",
    "202206-citibike-tripdata.csv",
    "202207-citibike-tripdata.csv",
    "202208-citibike-tripdata.csv",
    "202209-citibike-tripdata.csv",
    "202210-citibike-tripdata.csv",
    "202211-citibike-tripdata.csv",
    "202212-citibike-tripdata.csv"
]

# Download each file if not already downloaded
for file in files:
    file_path = os.path.join('data/raw', file)
    if not os.path.exists(file_path):
        print(f"Downloading {file}...")
        r = requests.get(base_url + file)
        with open(file_path, 'wb') as f:
            f.write(r.content)
    else:
        print(f"{file} already exists.")


Downloading 202201-citibike-tripdata.csv...
Downloading 202202-citibike-tripdata.csv...
Downloading 202203-citibike-tripdata.csv...
Downloading 202204-citibike-tripdata.csv...
Downloading 202205-citibike-tripdata.csv...
Downloading 202206-citibike-tripdata.csv...
Downloading 202207-citibike-tripdata.csv...
Downloading 202208-citibike-tripdata.csv...
Downloading 202209-citibike-tripdata.csv...
Downloading 202210-citibike-tripdata.csv...
Downloading 202211-citibike-tripdata.csv...
Downloading 202212-citibike-tripdata.csv...


In [1]:
# Notebook cell: list CSV files in current folder
import os, glob, pathlib
cwd = pathlib.Path.cwd()
print("Current working directory:", cwd)
csvs = list(cwd.glob("../data/raw/*.csv"))
print(f"Found {len(csvs)} csv file(s) in this folder:")
for f in csvs:
    print(" -", f.name)
# also print top-level filenames for quick glance
print("\nTop-level items:")
for p in cwd.iterdir():
    print(" ", p.name)

Current working directory: c:\Users\Biswajit\citi-bike-nyc-2022-dashboard\notebooks
Found 12 csv file(s) in this folder:
 - JC-202201-citibike-tripdata.csv
 - JC-202202-citibike-tripdata.csv
 - JC-202203-citibike-tripdata.csv
 - JC-202204-citibike-tripdata.csv
 - JC-202205-citibike-tripdata.csv
 - JC-202206-citibike-tripdata.csv
 - JC-202207-citbike-tripdata.csv
 - JC-202208-citibike-tripdata.csv
 - JC-202209-citibike-tripdata.csv
 - JC-202210-citibike-tripdata.csv
 - JC-202211-citibike-tripdata.csv
 - JC-202212-citibike-tripdata.csv

Top-level items:
  2.2_citibike_weather_merge.ipynb
  2.3_fundamentals_of_visualization_libraries_Part 1.ipynb
  2.4_fundamentals_of_visualization_libraries_part_2.ipynb
  2.5_advanced_geospatial_plotting.ipynb
  Configurations.ipynb
  kepler_config.json


In [4]:
# Read and combine all monthly CSVs
all_data = []

for file in csvs:
    df = pd.read_csv(os.path.join('data/raw', file))
    all_data.append(df)

# Combine into a single DataFrame
citi_bike_df = pd.concat(all_data, ignore_index=True)

# Check the first few rows
citi_bike_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
3,6E5864EA6FCEC90D,electric_bike,2022-01-26 07:54:57,2022-01-26 07:55:22,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
4,E24954255BBDE32D,electric_bike,2022-01-13 18:44:46,2022-01-13 18:45:43,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member


In [6]:
citi_bike_df.shape
citi_bike_df.to_csv(
                    "../data/processed/citibike_with_weather_merged.csv",index=False)

In [None]:
noaa_token = ""


In [None]:
# NOAA API token
token = ""

# NOAA endpoint for daily data
url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?"

# Parameters to get daily weather from LaGuardia for 2022
params = {
    "datasetid": "GHCND",
    "stationid": "GHCND:USW00014732",  # LaGuardia station
    "startdate": "2022-01-01",
    "enddate": "2022-12-31",
    "units": "metric",
    "limit": 1000  # NOAA allows max 1000 per request, so might need paging
}

# Header with token
headers = {"token": token}

# Make the request
response = requests.get(url, headers=headers, params=params)

# Check if request is successful
if response.status_code == 200:
    weather_data = response.json()
    print("Weather data fetched successfully!")
else:
    print("Failed to fetch weather data:", response.status_code)

Weather data fetched successfully!


In [13]:
# Extract 'results' from JSON
weather_df = pd.DataFrame(weather_data['results'])

# Save to CSV
os.makedirs('data/processed', exist_ok=True)
weather_df.to_csv('data/processed/la_guardia_weather_2022.csv', index=False)


In [14]:
# Convert start time to date only
citi_bike_df['date'] = pd.to_datetime(citi_bike_df['started_at']).dt.date

# Convert weather date
weather_df['date'] = pd.to_datetime(weather_df['date']).dt.date

# Merge on date
merged_df = pd.merge(citi_bike_df, weather_df, on='date', how='left')

# Save merged CSV
merged_df.to_csv('data/processed/citi_bike_with_weather_2022.csv', index=False)

KeyError: 'started_at'

In [15]:
# Cell 1: load or build the DataFrames (run first)
import os
import pandas as pd

print("Working dir:", os.getcwd())

# Load or build citi_bike_data
try:
    citi_bike_data
    print("citi_bike_data already in memory.")
except NameError:
    merged_path_local = "data/processed/citibike_nyc_2022_merged.csv"
    raw_folder = "data/raw"
    if os.path.exists(merged_path_local):
        print("Loading merged CitiBike CSV from:", merged_path_local)
        citi_bike_data = pd.read_csv(merged_path_local, low_memory=False)
    elif os.path.exists(raw_folder):
        files = sorted([os.path.join(raw_folder, f) for f in os.listdir(raw_folder) if f.endswith('.csv')])
        if len(files) == 0:
            raise FileNotFoundError(f"No CSV files found in {raw_folder}. Put the downloaded monthly CSVs there.")
        print(f"Concatenating {len(files)} CSV files from {raw_folder} ... (may take a minute)")
        df_list = [pd.read_csv(f, low_memory=False) for f in files]
        citi_bike_data = pd.concat(df_list, ignore_index=True)
        os.makedirs("data/processed", exist_ok=True)
        citi_bike_data.to_csv(merged_path_local, index=False)
        print("Saved merged CitiBike to:", merged_path_local)
    else:
        raise FileNotFoundError("No raw data folder found. Please download CitiBike CSVs into data/raw/")

# Load or check weather_df
try:
    weather_df
    print("weather_df already in memory.")
except NameError:
    # check common names
    candidates = [
        "data/processed/laguardia_2022_weather.csv",
        "data/processed/la_guardia_2022_tavg.csv",
        "data/processed/la_guardia_weather_2022.csv",
        "data/processed/laguardia_weather_2022.csv"
    ]
    loaded = False
    for p in candidates:
        if os.path.exists(p):
            print("Loading weather CSV from:", p)
            weather_df = pd.read_csv(p, low_memory=False)
            loaded = True
            break
    if not loaded:
        raise FileNotFoundError("Weather CSV not found in data/processed/. Save NOAA output CSV there (e.g. laguardia_2022_weather.csv).")

print("citi_bike_data shape:", citi_bike_data.shape)
print("weather_df shape:", weather_df.shape)


Working dir: c:\Users\Biswajit\citi-bike-nyc-2022-dashboard
citi_bike_data already in memory.
weather_df already in memory.
citi_bike_data shape: (895485, 13)
weather_df shape: (1000, 5)


In [16]:
# Cell 2: inspect column names and show small samples
print("=== CitiBike columns (first 60) ===")
print(list(citi_bike_data.columns)[:60])
print("\n=== CitiBike columns lowercased (first 60) ===")
print([c.lower() for c in list(citi_bike_data.columns)[:60]])

print("\n=== Weather columns (first 60) ===")
print(list(weather_df.columns)[:60])
print("\n=== Weather columns lowercased (first 60) ===")
print([c.lower() for c in list(weather_df.columns)[:60]])

print("\n=== CitiBike sample rows ===")
display(citi_bike_data.head(5))

print("\n=== Weather sample rows ===")
display(weather_df.head(5))


=== CitiBike columns (first 60) ===
['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']

=== CitiBike columns lowercased (first 60) ===
['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']

=== Weather columns (first 60) ===
['date', 'datatype', 'station', 'attributes', 'value']

=== Weather columns lowercased (first 60) ===
['date', 'datatype', 'station', 'attributes', 'value']

=== CitiBike sample rows ===


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
3,6E5864EA6FCEC90D,electric_bike,2022-01-26 07:54:57,2022-01-26 07:55:22,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
4,E24954255BBDE32D,electric_bike,2022-01-13 18:44:46,2022-01-13 18:45:43,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member



=== Weather sample rows ===


Unnamed: 0,date,datatype,station,attributes,value
0,2022-01-01T00:00:00,ADPT,GHCND:USW00014732,",,W,",100.0
1,2022-01-01T00:00:00,ASLP,GHCND:USW00014732,",,W,",10078.0
2,2022-01-01T00:00:00,ASTP,GHCND:USW00014732,",,W,",10078.0
3,2022-01-01T00:00:00,AWBT,GHCND:USW00014732,",,W,",106.0
4,2022-01-01T00:00:00,AWND,GHCND:USW00014732,",,W,",2.8


In [17]:
# Cell 3: auto-detect start-time, normalize dates, aggregate weather, merge, save
import numpy as np
import os

# --- Detect start-time column in CitiBike ---
c_cols = list(citi_bike_data.columns)
c_lower_map = {c.lower(): c for c in c_cols}

candidates = [
    'started_at','start_time','startedat','starttime','start','start_date','start_date_local',
    'starttime_local','start_ts','start_timestamp','started_at_local','started_at_utc'
]

found_start = None
for cand in candidates:
    if cand in c_lower_map:
        found_start = c_lower_map[cand]
        break

if not found_start:
    # fallback substring rule
    for actual in c_cols:
        low = actual.lower()
        if 'start' in low and ('time' in low or 'at' in low or 'date' in low):
            found_start = actual
            break

if not found_start:
    raise KeyError("Could not detect a start-time column in CitiBike data. Inspect columns from Cell 2 and set 'found_start' manually.")

print("Detected CitiBike start-time column:", found_start)

# Convert to datetime
citi_bike_data['_start_dt'] = pd.to_datetime(citi_bike_data[found_start], errors='coerce')
na_rate = citi_bike_data['_start_dt'].isna().mean()
print(f"NaT (missing timestamp) rate after parse: {na_rate:.2%}")
if na_rate > 0.9:
    # show sample problematic values
    print("High NaT rate — sample values from the start-time column:")
    display(citi_bike_data[found_start].head(20))
    raise ValueError("Timestamp parsing failed for most rows. Inspect the sample above and tell me the column name/format.")

citi_bike_data['date'] = citi_bike_data['_start_dt'].dt.date

# --- Detect date column in weather_df ---
w_cols = list(weather_df.columns)
w_lower_map = {c.lower(): c for c in w_cols}

w_candidates = ['date','datetime','time','dateutc','date_time','observation_time','valid']
found_wdate = None
for cand in w_candidates:
    if cand in w_lower_map:
        found_wdate = w_lower_map[cand]
        break

if not found_wdate:
    for actual in w_cols:
        low = actual.lower()
        if 'date' in low or 'time' in low:
            found_wdate = actual
            break

if not found_wdate:
    raise KeyError("Could not detect a date/time column in weather_df. Inspect columns from Cell 2 and tell me the correct weather date column name.")

print("Detected weather date column:", found_wdate)
weather_df['_weather_dt'] = pd.to_datetime(weather_df[found_wdate], errors='coerce')
if weather_df['_weather_dt'].isna().all():
    print("Sample weather date values:")
    display(weather_df[found_wdate].head(20))
    raise ValueError("Weather date parsing failed for all rows. Inspect sample above.")

weather_df['date'] = weather_df['_weather_dt'].dt.date

# --- Aggregate weather by date (numeric mean, non-numeric first) ---
numeric_cols = weather_df.select_dtypes(include=[np.number]).columns.tolist()
non_numeric = [c for c in weather_df.columns if c not in numeric_cols and c not in ['date','_weather_dt']]

agg = {c: 'mean' for c in numeric_cols}
for c in non_numeric:
    agg[c] = 'first'

weather_agg = weather_df.groupby('date').agg(agg).reset_index()
print("Weather aggregated shape:", weather_agg.shape)
display(weather_agg.head(5))

# --- Merge ---
merged_df = citi_bike_data.merge(weather_agg, on='date', how='left', indicator=True)
print("Merge counts:")
print(merged_df['_merge'].value_counts())

# Save merged CSV
os.makedirs('data/processed', exist_ok=True)
out_path = "data/processed/citibike_with_weather_merged.csv"
merged_df.to_csv(out_path, index=False)
print("Saved merged CSV to:", out_path)

# Display small sample of merged table
display(merged_df.head(5))


Detected CitiBike start-time column: started_at
NaT (missing timestamp) rate after parse: 0.00%
Detected weather date column: date
Weather aggregated shape: (54, 5)


Unnamed: 0,date,value,datatype,station,attributes
0,2022-01-01,1110.5,ADPT,GHCND:USW00014732,",,W,"
1,2022-01-02,1112.921053,ADPT,GHCND:USW00014732,",,W,"
2,2022-01-03,1172.227778,ADPT,GHCND:USW00014732,",,W,"
3,2022-01-04,1178.855556,ADPT,GHCND:USW00014732,",,W,"
4,2022-01-05,1050.75,ADPT,GHCND:USW00014732,",,W,"


Merge counts:
_merge
left_only     842391
both           53094
right_only         0
Name: count, dtype: int64
Saved merged CSV to: data/processed/citibike_with_weather_merged.csv


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,_start_dt,date,value,datatype,station,attributes,_merge
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-26 18:50:39,2022-01-26,1171.016667,ADPT,GHCND:USW00014732,",,W,",both
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member,2022-01-28 13:14:07,2022-01-28,1030.57,ADPT,GHCND:USW00014732,",,W,",both
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member,2022-01-10 19:55:13,2022-01-10,1168.655556,ADPT,GHCND:USW00014732,",,W,",both
3,6E5864EA6FCEC90D,electric_bike,2022-01-26 07:54:57,2022-01-26 07:55:22,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-26 07:54:57,2022-01-26,1171.016667,ADPT,GHCND:USW00014732,",,W,",both
4,E24954255BBDE32D,electric_bike,2022-01-13 18:44:46,2022-01-13 18:45:43,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-13 18:44:46,2022-01-13,1145.411111,ADPT,GHCND:USW00014732,",,W,",both


**Data sources**
- CitiBike trip data (2022) — raw CSV files per month/period (downloaded from S3).
- NOAA daily weather (LaGuardia station) — daily average temperature for 2022 (obtained via NOAA API).

**What\s the Task Here**
- Combined all CitiBike 2022 CSVs into a single DataFrame using a memory-efficient pattern:  
  `pd.concat((pd.read_csv(f) for f in filepaths), ignore_index=True)`.
- Retrieved daily avg temperature (TAVG) for LaGuardia via NOAA API, converted values from tenths of °C to °C.
- Created `date` columns in both datasets (CitiBike: from `start_time`; NOAA: parsed datetimes), and merged by `date` using a left join:
  `df_merged = df.merge(df_temps, how='left', on='date', indicator=True)`.
- Saved a merged CSV locally: `data/processed/citibike_nyc_2022_with_weather.csv`.

**Relationship & what achieved**
- The merged dataset attaches daily weather (avg temperature) to each bike trip, enabling analysis of how temperature correlates with ridership (seasonality, peaks).
- With this merged data we can now create strategic visualizations: top stations (bar charts), monthly trips vs temperature (line charts), and geospatial route analysis (maps). This completes the data-prep stage; next step is to build interactive visuals/dashboards.

**Notes**
- Large raw / processed data files are excluded from the Git repo (`.gitignore` includes `data/`) to respect GitHub size limits. The notebooks and sample outputs are included in the repository.