In [1]:
# Notebook Cell 1: imports & path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
import matplotlib.pyplot as plt
from datetime import datetime as dt

PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name.lower() == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
MERGED_CSV = PROJECT_ROOT / "data" / "processed" / "citibike_with_weather_merged.csv"

print("Using MERGED_CSV:", MERGED_CSV)

Using MERGED_CSV: c:\Users\Biswajit\citi-bike-nyc-2022-dashboard\data\processed\citibike_with_weather_merged.csv


In [6]:
# Cell 1: imports + file path check
import pandas as pd
from pathlib import Path
import os

# Resolve project root depending on where notebook is opened from
nb_path = Path().resolve()
# If the notebook is inside "notebooks", step up one level
if nb_path.name.lower() == "notebooks":
    project_root = nb_path.parent
else:
    project_root = nb_path

MERGED_CSV = project_root / "data" / "processed" / "citibike_with_weather_merged.csv"

print("Notebook path:", nb_path)
print("Expected merged CSV path:", MERGED_CSV)
print("Exists?:", MERGED_CSV.exists())
if MERGED_CSV.exists():
    size_mb = os.path.getsize(MERGED_CSV) / (1024*1024)
    print(f"File size: {size_mb:.1f} MB")
else:
    print("File not found at that path — please confirm the file is in data/processed and named exactly:")
    print("  citibike_with_weather_merged.csv")

Notebook path: C:\Users\Biswajit\citi-bike-nyc-2022-dashboard\notebooks
Expected merged CSV path: C:\Users\Biswajit\citi-bike-nyc-2022-dashboard\data\processed\citibike_with_weather_merged.csv
Exists?: True
File size: 165.0 MB


In [7]:
# Cell 2: inspect header then load safely
# (reads only header fast, then loads full file with parse_dates if 'date' exists)

# Read columns quickly
cols = pd.read_csv(MERGED_CSV, nrows=0).columns.tolist()
print("Columns in CSV:", cols)

# Decide whether to parse dates
parse_dates = ['date'] if 'date' in cols else None
print("Will parse dates:", parse_dates)

# If the file is very large you may want to use chunksize to process in parts.
# We'll attempt a normal load; if memory issue occurs, let me know and I'll provide a chunked load.
df = pd.read_csv(MERGED_CSV, low_memory=False, parse_dates=parse_dates)
print("Loaded df shape:", df.shape)
display(df.head())

Columns in CSV: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
Will parse dates: None
Loaded df shape: (895485, 13)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
3,6E5864EA6FCEC90D,electric_bike,2022-01-26 07:54:57,2022-01-26 07:55:22,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
4,E24954255BBDE32D,electric_bike,2022-01-13 18:44:46,2022-01-13 18:45:43,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member


In [8]:
# Check columns and a small sample (run now)
print("Columns:", df.columns.tolist())
print("\nSample types:")
print(df.dtypes)
display(df.head(3))

Columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']

Sample types:
ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member


In [11]:
# A1: parse started_at -> create a datetime 'date' column
import pandas as pd
from pathlib import Path

# df already loaded in your notebook
print("Before:", df.shape)
# parse started_at (robust to different formats)
df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce', infer_datetime_format=True)

# create a date column (datetime date at midnight) — prefer full datetime for plotting
df['date'] = df['started_at'].dt.floor('D')   # midnight of that day

print("After:", df.shape)
print("Example dtypes:\n", df[['started_at','date']].dtypes)
display(df[['started_at','date']].head(5))

Before: (895485, 13)


  df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce', infer_datetime_format=True)


After: (895485, 14)
Example dtypes:
 started_at    datetime64[ns]
date          datetime64[ns]
dtype: object


Unnamed: 0,started_at,date
0,2022-01-26 18:50:39,2022-01-26
1,2022-01-28 13:14:07,2022-01-28
2,2022-01-10 19:55:13,2022-01-10
3,2022-01-26 07:54:57,2022-01-26
4,2022-01-13 18:44:46,2022-01-13


In [12]:
# B1: aggregate bike rides per day
# If you have a ride id column use it, otherwise count rows
if 'ride_id' in df.columns:
    bike_daily = df.groupby('date').agg(bike_rides_daily=('ride_id','count')).reset_index()
else:
    bike_daily = df.groupby('date').size().reset_index(name='bike_rides_daily')

# show
print("bike_daily rows:", bike_daily.shape[0])
display(bike_daily.head(8))

# Save as daily_df variable for the rest of the notebook
daily_df = bike_daily.sort_values('date').reset_index(drop=True)

bike_daily rows: 365


Unnamed: 0,date,bike_rides_daily
0,2022-01-01,592
1,2022-01-02,1248
2,2022-01-03,832
3,2022-01-04,934
4,2022-01-05,914
5,2022-01-06,1297
6,2022-01-07,459
7,2022-01-08,713


In [26]:
df['started_at'] = pd.to_datetime(df['started_at'])
df['date'] = df['started_at'].dt.date
df['date'] = pd.to_datetime(df['date'])

In [14]:
# D1: Top 20 start stations and Plotly bar
import plotly.graph_objects as go
from pathlib import Path

# compute counts
df['_one'] = 1
station_counts = df.groupby('start_station_name').agg(count=('_one','sum')).reset_index()
top20 = station_counts.nlargest(20, 'count').reset_index(drop=True)

# plot horizontal bar (readable station names)
fig_bar = go.Figure(go.Bar(
    x=top20['count'][::-1],
    y=top20['start_station_name'][::-1],
    orientation='h',
    marker=dict(color=top20['count'], colorscale='Blues')
))
fig_bar.update_layout(title='Top 20 Start Stations', xaxis_title='Trips', height=700, margin=dict(l=300))
fig_bar.show()

# save top20 csv for Streamlit
out = Path.cwd() / "notebooks" / "top20.csv"
out.parent.mkdir(parents=True, exist_ok=True)
top20.to_csv(out, index=False)
print("Saved top20 to:", out)

Saved top20 to: c:\Users\Biswajit\citi-bike-nyc-2022-dashboard\notebooks\notebooks\top20.csv


In [17]:
# Notebook cell
from pathlib import Path
p = Path.cwd() / "notebooks" / "top20.csv"
print("top20 exists:", p.exists())
if p.exists():
    import pandas as pd
    print(pd.read_csv(p).head())

top20 exists: True
                             start_station_name  count
0                                 Grove St PATH  42556
1  South Waterfront Walkway - Sinatra Dr & 1 St  34245
2       Hoboken Terminal - River St & Hudson Pl  33020
3      Hoboken Terminal - Hudson St & Hudson Pl  30244
4              City Hall - Washington St & 1 St  23289


In [27]:
daily_df = df.groupby('date').agg(
    bike_rides_daily=('ride_id', 'count')
).reset_index()

print("daily_df columns:", daily_df.columns.tolist())
daily_df.head()

daily_df columns: ['date', 'bike_rides_daily']


Unnamed: 0,date,bike_rides_daily
0,2022-01-01,592
1,2022-01-02,1248
2,2022-01-03,832
3,2022-01-04,934
4,2022-01-05,914


In [28]:
# E1: Dual-axis chart using daily_df
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# ensure date is datetime
daily_df['date'] = pd.to_datetime(daily_df['date'])

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=daily_df['date'], y=daily_df['bike_rides_daily'],
                         name='Daily Bike Rides', mode='lines'), secondary_y=False)

# add temperature only if exists
if 'temperature' in daily_df.columns:
    fig.add_trace(go.Scatter(x=daily_df['date'], y=daily_df['temperature'],
                             name='Avg Temp', mode='lines'), secondary_y=True)
else:
    print("No 'temperature' column in daily_df — only bike rides plotted.")

fig.update_yaxes(title_text="Bike rides (count)", secondary_y=False)
fig.update_yaxes(title_text="Temperature", secondary_y=True)
fig.update_layout(title='Daily Bike Rides and Temperature', height=600)
fig.show()

No 'temperature' column in daily_df — only bike rides plotted.


In [18]:
print("daily_df columns:", daily_df.columns.tolist())
print("Sample rows:\n", daily_df.head())
print("'temperature' present?", 'temperature' in daily_df.columns)

daily_df columns: ['date', 'bike_rides_daily']
Sample rows:
         date  bike_rides_daily
0 2022-01-01               592
1 2022-01-02              1248
2 2022-01-03               832
3 2022-01-04               934
4 2022-01-05               914
'temperature' present? False


### How the plotting code works
1. I parsed `started_at` into datetime and created a `date` column to be able to aggregate daily counts.
2. I used pandas `groupby()` to compute `bike_rides_daily`.
3. For plotting I used Plotly:
   - Top20 bar chart: `go.Bar` with horizontal orientation for readability.
   - Dual-axis time series: `make_subplots(specs=[[{"secondary_y": True}]])` and `fig.add_trace()` for bike rides and temperature.
4. This approach is "axes-level" (explicit figure/axes/traces) similar to matplotlib's object-oriented pattern — it gives us full control and interactivity.

In [2]:
import os

# Detect base folder whether running as .py or inside Jupyter
if "__file__" in globals():
    BASE = Path(__file__).resolve().parent
else:
    BASE = Path(os.getcwd())

MERGED_CSV = BASE / "data" / "processed" / "citibike_with_weather_merged.csv"
TOP20_CSV = BASE / "docs" / "top20.csv"
KEPLER_HTML = BASE / "docs" / "Citibike_Aggregated_Map.html"