In [None]:
import pandas as pd
import os
from google.colab import drive
# Mount Google Drive at /content/drive so files under that path can be read
drive.mount('/content/drive')

# Location of the expanded Divvy ride dataset on the mounted shared drive
parquet_path = "/content/drive/Shared drives/Time Series/divvy_data/prod/divvy_data_expanded.parquet"

df = pd.read_parquet(parquet_path)

# Parse 'date' as datetime so the grouping and merge keys below share one dtype
df["date"] = pd.to_datetime(df["date"])

# Daily ride totals across all end stations
total_rides_per_day = (
    df.groupby("date")["rides"]
    .sum()
    .reset_index()
    .rename(columns={"rides": "total_rides"})
)

# Daily ride totals restricted to trips whose end station is "Outside of Dock"
outside_dock_rides_per_day = (
    df.loc[df["end_station_name"] == "Outside of Dock"]
    .groupby("date")["rides"]
    .sum()
    .reset_index()
    .rename(columns={"rides": "outside_dock_rides"})
)

# Left-join so every date with rides is kept; dates without any "Outside of Dock"
# rides come back as NaN and are set to 0 before the share is computed.
daily_proportion = (
    total_rides_per_day
    .merge(outside_dock_rides_per_day, on="date", how="left")
    .assign(outside_dock_rides=lambda daily: daily["outside_dock_rides"].fillna(0))
    .assign(
        proportion_outside_dock=lambda daily: daily["outside_dock_rides"] / daily["total_rides"]
    )
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show the first 500 rows (one row per date); the rendered table elides the middle rows
daily_proportion.head(500)

Unnamed: 0,date,total_rides,outside_dock_rides,proportion_outside_dock
0,2020-01-01,2141,0.0,0.000000
1,2020-01-02,6479,0.0,0.000000
2,2020-01-03,5890,0.0,0.000000
3,2020-01-04,3187,0.0,0.000000
4,2020-01-05,3035,0.0,0.000000
...,...,...,...,...
495,2021-05-12,13406,1407.0,0.104953
496,2021-05-13,16566,1789.0,0.107992
497,2021-05-14,20460,2265.0,0.110704
498,2021-05-15,15502,1762.0,0.113663


In [None]:
# Divvy's pricing history: one row per price change, keyed by the date it took effect.
# Rows are listed in chronological order; None marks a fee with no value recorded
# for that period. "source" holds the article/page each row was taken from.
divvy_pricing_history = pd.DataFrame({
    "date_effective": ["2013-06-01", "2022-06-10", "2023-03-29", "2024-02-01"],
    "annual_membership_price": [75, 119, 130.90, 143.90],
    "day_pass_price": [None, 15, 16.50, 18.10],
    "classic_bike_fee_per_minute": [None, 0.16, 0.17, 0.18],
    "e_bike_fee_per_minute_member": [None, 0.16, 0.17, 0.18],
    "e_bike_fee_per_minute_non_member": [None, 0.39, 0.42, 0.44],
    "scooter_fee_per_minute_member": [None, 0.25, 0.27, 0.29],
    "scooter_fee_per_minute_non_member": [None, 0.39, 0.42, 0.44],
    "source": [
        "https://abc7chicago.com/divvy-bikes-prices-chicago-pay/14388759/",
        "https://blockclubchicago.org/2022/06/10/divvys-new-e-bike-fees-pricing-out-some-riders-users-say/",
        "https://chi.streetsblog.org/2023/03/29/sad-trombone-another-divvy-price-hike-quietly-kicked-in-this-month",
        "https://divvybikes.com/pricechange"
    ]
})

# Convert "date_effective" to datetime format
divvy_pricing_history["date_effective"] = pd.to_datetime(divvy_pricing_history["date_effective"])

# Each pricing period ends on the day the next one takes effect.
# shift(-1) leaves NaT in the last row, and that NaT is the marker for the
# still-ongoing period. Writing a string placeholder such as "Ongoing" into this
# datetime column would force an incompatible-dtype upcast (a FutureWarning today,
# an error in newer pandas) only to be coerced straight back to NaT.
divvy_pricing_history["end_date"] = divvy_pricing_history["date_effective"].shift(-1)
# Join each ride row to the pricing period that was in effect on its date
def join_pricing_to_rides(rides_df, pricing_df):
    """Attach to every ride row the pricing that was in effect on its date.

    A ride is matched to the pricing row with the latest ``date_effective``
    that is on or before the ride's ``date`` (a backward as-of join). An
    exact-equality merge on the two columns would only price rides that
    happen to fall on a price-change day and leave every other row empty.

    Parameters
    ----------
    rides_df : pandas.DataFrame
        Must contain a ``date`` column parseable by ``pd.to_datetime``.
        The caller's frame is not modified.
    pricing_df : pandas.DataFrame
        Must contain a ``date_effective`` column with no missing values.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``rides_df``, in the same order, with a fresh
        0..n-1 index and the columns of ``pricing_df`` appended. Rides dated
        before the first pricing period, or with a missing date, get
        NaN/NaT in the pricing columns.
    """
    # assign() builds a new frame, so the caller's "date" column is left untouched
    rides = rides_df.assign(date=pd.to_datetime(rides_df["date"])).reset_index(drop=True)

    pricing = pricing_df.sort_values("date_effective")
    # merge_asof needs both keys to share one dtype (e.g. the same datetime unit)
    pricing = pricing.assign(
        date_effective=pricing["date_effective"].astype(rides["date"].dtype)
    )

    # merge_asof rejects null keys and requires the left key to be sorted
    has_date = rides["date"].notna()
    by_date = rides[has_date].sort_values("date")
    joined = pd.merge_asof(
        by_date,
        pricing,
        left_on="date",
        right_on="date_effective",
        direction="backward",
    )

    # merge_asof returns rows in by_date order with a fresh index; put the
    # original row positions back so the caller's ordering can be restored
    joined.index = by_date.index
    if not has_date.all():
        # Undated rides are carried through with no pricing attached
        joined = pd.concat([joined, rides[~has_date]])
    return joined.sort_index()

# Preview the pricing table; end_date is NaT on the last row, which has no later price change
divvy_pricing_history.head()

  divvy_pricing_history.loc[divvy_pricing_history.index[-1], "end_date"] = "Ongoing"


Unnamed: 0,date_effective,annual_membership_price,day_pass_price,classic_bike_fee_per_minute,e_bike_fee_per_minute_member,e_bike_fee_per_minute_non_member,scooter_fee_per_minute_member,scooter_fee_per_minute_non_member,source,end_date
0,2013-06-01,75.0,,,,,,,https://abc7chicago.com/divvy-bikes-prices-chi...,2022-06-10
1,2022-06-10,119.0,15.0,0.16,0.16,0.39,0.25,0.39,https://blockclubchicago.org/2022/06/10/divvys...,2023-03-29
2,2023-03-29,130.9,16.5,0.17,0.17,0.42,0.27,0.42,https://chi.streetsblog.org/2023/03/29/sad-tro...,2024-02-01
3,2024-02-01,143.9,18.1,0.18,0.18,0.44,0.29,0.44,https://divvybikes.com/pricechange,NaT


In [None]:
# Dang you even gave sources in the dataset