# Preparations

In [None]:
import os
import requests
import zipfile
import pandas as pd
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
from prophet.make_holidays import make_holidays_df

# ✅ Mount Google Drive
# Later cells read the ride and weather data from, and write the result to,
# paths under /content/drive, so the mount has to succeed before they run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Load Dataset

In [None]:
# Load the expanded Divvy ride table from the mounted shared Drive.
data_raw = pd.read_parquet("/content/drive/Shared drives/Time Series/divvy_data/archive/full expanded/divvy_data_expanded.parquet")
# Report (rows, columns), then show the first four rows as the cell output.
print(data_raw.shape)
data_raw.head(4)

(21846092, 12)


Unnamed: 0,rideable_type,date,start_station_name,end_station_name,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,rides
0,classic_bike,2020-12-02,900 W Harrison St,Wolcott Ave & Polk St,13028,TA1309000064,41.874754,-87.649807,41.871262,-87.673688,member,1
1,classic_bike,2020-12-02,Aberdeen St & Jackson Blvd,Clinton St & Roosevelt Rd,13157,WL-008,41.877726,-87.654787,41.867118,-87.641088,member,1
2,classic_bike,2020-12-02,Albany Ave & Montrose Ave,Kedzie Ave & Milwaukee Ave,15621,13085,41.961041,-87.705866,41.929567,-87.707857,member,1
3,classic_bike,2020-12-02,Ashland Ave & 13th St,Blue Island Ave & 18th St,13354,13135,41.865234,-87.666507,41.857556,-87.661535,member,1


# Preprocess Citywide data

In [None]:
def add_unique_station_count(df, window_size=365):
  data_raw = df.copy()
  all_start_stations = data_raw["start_station_name"].unique()
  all_end_stations = data_raw["end_station_name"].unique()
  all_stations = list(set(all_start_stations) | set(all_end_stations))
  start_station_bool = data_raw.groupby(["date", "start_station_name"])["start_station_name"].count().unstack(fill_value=0) > 0
  end_station_bool = data_raw.groupby(["date", "end_station_name"])["end_station_name"].count().unstack(fill_value=0) > 0


  station_count_bool = start_station_bool | end_station_bool
  station_count_bool.fillna(0,inplace=True)
  station_count_bool = station_count_bool.astype(int)
  rolling_station_count = station_count_bool.rolling(window=365, min_periods=1).sum()
  unique_stations_per_window = rolling_station_count.apply(lambda x: len(x[x > 0]), axis=1)
  station_count_bool[f'unique_stations_{window_size}d'] = unique_stations_per_window
  station_count_bool[f'unique_stations_{window_size}d'] = station_count_bool[f'unique_stations_{window_size}d'].shift(1) # shift because we can't use today's value
  return station_count_bool[[f'unique_stations_{window_size}d']].reset_index()[window_size:]

def add_ebike_proportion(df, window_size=365):
  """Compute the trailing share of rows whose bike type is ``electric_bike``.

  Args:
    df: DataFrame with ``date`` and ``rideable_type`` columns.
    window_size: Number of consecutive dates (rows of the per-date table)
      summed in each trailing window.

  Returns:
    DataFrame with columns ``date`` and ``ebike_proportion_{window_size}d``,
    with the first ``window_size`` rows dropped.
  """
  share_col = f'ebike_proportion_{window_size}d'
  rides = df.copy()

  # Per-date totals: every non-null rideable_type row, and the electric subset.
  is_ebike = rides['rideable_type'] == 'electric_bike'
  per_day = pd.DataFrame({
      'bike_count': rides.groupby('date')['rideable_type'].count(),
      'ebike_count': is_ebike.groupby(rides['date']).sum(),
  })

  # NOTE(review): unlike the station-count feature, this window includes the
  # current date (no shift(1)) — confirm that is intended for forecasting.
  windowed = per_day.rolling(window=window_size, min_periods=1).sum()
  windowed[share_col] = windowed['ebike_count'] / windowed['bike_count']

  return windowed[[share_col]].reset_index()[window_size:]

def add_date_features(df):
  """Return a date-sorted copy of ``df`` with calendar columns appended.

  The ``date`` column is parsed with ``pd.to_datetime``; the rows are sorted
  by it, and ``month``, ``dayofweek`` and ``year`` are derived from it. The
  input frame is left untouched.
  """
  dated = df.assign(date=pd.to_datetime(df["date"])).sort_values("date")
  stamp = dated["date"].dt
  return dated.assign(
      month=stamp.month,
      dayofweek=stamp.dayofweek,
      year=stamp.year,
  )

def add_weather_features(df, weather_path):
  """Left-join daily weather columns onto ``df`` by date.

  Args:
    df: DataFrame with a ``date`` column to join on.
    weather_path: Path of a CSV containing ``time``, ``temp_min_c``,
      ``rain_sum_mm`` and ``snowfall_sum_cm`` columns.

  Returns:
    A new DataFrame with every row of ``df`` plus the three weather columns;
    dates with no matching ``time`` get NaN weather values.
  """
  weather_cols = ["time", "temp_min_c", "rain_sum_mm", "snowfall_sum_cm"]

  # Load the weather table and order it chronologically.
  forecast = pd.read_csv(weather_path)
  forecast["time"] = pd.to_datetime(forecast["time"])
  forecast = forecast.sort_values("time")

  # Join on df.date == weather.time, then discard the redundant key column.
  joined = df.copy().merge(
      forecast[weather_cols],
      left_on="date",
      right_on="time",
      how="left",
  )
  return joined.drop(columns=["time"])

def preprocess_citywide(data_raw, window_size=364,
                        weather_path="/content/drive/Shared drives/Time Series/weather_data/daily_weather_forecast_chicago.csv"):
  """Build the citywide daily modelling table from the raw ride data.

  Steps: sum ``rides`` per ``date``, add calendar features, left-join the
  trailing-window station-count and e-bike-share features, then left-join
  the daily weather columns.

  Args:
    data_raw: Ride-level DataFrame with at least ``date``, ``rides``,
      ``rideable_type``, ``start_station_name`` and ``end_station_name``.
    window_size: Trailing window length passed to both rolling feature
      builders. Defaults to 364, the value previously hard-coded here
      (presumably 52 whole weeks — confirm).
    weather_path: CSV of daily weather; defaults to the shared-Drive file
      this function previously hard-coded.

  Returns:
    One row per date with ``total_rides``, the calendar columns, the two
    ``*_{window_size}d`` features and the weather columns. The rolling
    features are NaN for the earliest dates, because the feature builders
    drop their first ``window_size`` rows before the left join.
  """
  daily = (
      data_raw.groupby("date")["rides"].sum()
      .reset_index()
      .rename(columns={"rides": "total_rides"})
  )
  # Parses ``date`` to datetime, sorts by it and adds month/dayofweek/year.
  daily = add_date_features(daily)

  # Station availability, then e-bike availability.
  for build_feature in (add_unique_station_count, add_ebike_proportion):
    feature = build_feature(data_raw, window_size)
    # The builders return ``date`` in the raw dtype; align it with the parsed
    # key in ``daily`` so the merge does not depend on how the raw column is
    # stored (a no-op when it is already datetime).
    feature = feature.assign(date=pd.to_datetime(feature["date"]))
    daily = daily.merge(feature, on="date", how="left")

  return add_weather_features(daily, weather_path=weather_path)




In [None]:
# Aggregate the raw rides to citywide daily totals and attach the engineered features.
# The rolling-window columns are NaN for the earliest dates because the
# feature helpers drop their first window_size rows before the left join.
rides_citywide = preprocess_citywide(data_raw)
rides_citywide.head()

Unnamed: 0,date,total_rides,month,dayofweek,year,unique_stations_364d,ebike_proportion_364d,temp_min_c,rain_sum_mm,snowfall_sum_cm
0,2020-01-01,2141,1,2,2020,,,-5.5,0.0,0.0
1,2020-01-02,6479,1,3,2020,,,2.0,0.0,0.0
2,2020-01-03,5890,1,4,2020,,,0.0,0.0,0.0
3,2020-01-04,3187,1,5,2020,,,-3.5,0.2,0.42
4,2020-01-05,3035,1,6,2020,,,-4.1,0.0,0.0


In [None]:
# Persist the citywide feature table to the shared Drive.
# NOTE(review): no index=False is passed, so the DataFrame index is written as
# an extra unnamed first column — confirm downstream readers expect that.
rides_citywide.to_csv("/content/drive/Shared drives/Time Series/Notebooks/Modeling/modeling_andy/rides_citywide.csv")