## Using best hyperparameters for XGBoost model:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# ✅ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

from prophet.make_holidays import make_holidays_df

# Load training and test data
train = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_train.csv")
test = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_test.csv")

# Ensure dates are in datetime format
train['ds'] = pd.to_datetime(train['date'])
test['ds'] = pd.to_datetime(test['date'])
train.drop(columns=['date'], inplace=True)
test.drop(columns=['date'], inplace=True)

# Create a combined dataframe with a marker column
train['is_train'] = True
test['is_train'] = False
combined = pd.concat([train, test])

# Feature engineering for XGBoost
for df in [combined]:
    df['day'] = df['ds'].dt.day
    df['month'] = df['ds'].dt.month
    df['year'] = df['ds'].dt.year
    df['weekday'] = df['ds'].dt.weekday
    df['week_of_year'] = df['ds'].dt.isocalendar().week
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['rain_intensity'] = pd.cut(df['rain_sum_mm'], bins=[-1, 0, 5, 20, np.inf], labels=[0, 1, 2, 3])
    df['snow_flag'] = (df['snowfall_sum_cm'] > 0).astype(int)
    df['rain_intensity'] = df['rain_intensity'].astype(float)
    df['weekend_rain'] = df['is_weekend'] * df['rain_intensity']
    df['cold_no_snow'] = (df['temp_min_c'] < 5) & (df['snow_flag'] == 0)

# Split back into train and test
train = combined[combined['is_train']].drop('is_train', axis=1)
test = combined[~combined['is_train']].drop('is_train', axis=1)

# Drop the datetime column, start_station_id
train.drop(columns=['ds', 'start_station_id'], inplace=True)
test.drop(columns=['ds', 'start_station_id'], inplace=True)

# Ensure only common stations exist in test before encoding
test = test[test['start_station_name'].isin(train['start_station_name'].unique())]

# One-hot encoding
train = pd.get_dummies(train, columns=['start_station_name'], drop_first=True)
test = pd.get_dummies(test, columns=['start_station_name'], drop_first=True)

# Align test columns with train columns
test = test.reindex(columns=train.columns, fill_value=0)

# Prepare data for XGBoost
X_train = train.drop(columns=['total_rides'])
y_train = train['total_rides']
X_test = test.drop(columns=['total_rides'])
y_test = test['total_rides']

best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 1000,
    'subsample': 0.8
}

best_xgb_model = XGBRegressor(**best_params, random_state=42)
best_xgb_model.fit(X_train, y_train)

# Ensure test features match train
X_test = X_test[X_train.columns]

# Make predictions
y_pred = best_xgb_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# R-squared score
r2 = best_xgb_model.score(X_test, y_test)
print(f'R-squared: {r2}')

# Calculate potential for each station
potential = {}
stations = train.columns[train.columns.str.startswith('start_station_name_')]
for station in stations:
    if station in test.columns:
        train_mean = train[train[station] == 1]['total_rides'].mean()
        test_mean = test[test[station] == 1]['total_rides'].mean()

        if not np.isnan(test_mean):
            potential[station] = {
                'train_mean': train_mean,
                'test_mean': test_mean,
                'potential_increase': test_mean - train_mean,
                'percentage_increase': (test_mean - train_mean) / train_mean * 100
            }

# Sort by potential increase and get top 3 without NaNs
top_3_stations = sorted(potential.items(), key=lambda x: x[1]['potential_increase'], reverse=True)[:3]
top_3_stations = [station_data for station_data in top_3_stations if not np.isnan(station_data[1]['test_mean'])]
top_3_stations = sorted(top_3_stations, key=lambda x: x[1]['potential_increase'], reverse=True)[:3]

# Plotting and R^2 calculation for top 3 stations
for station, data in top_3_stations:
    station_name = station[len('start_station_name_'):]

    # Get data for plotting
    df_train_station = train[train[station] == 1][['ds', 'total_rides']].copy()
    df_test_station = test[test[station] == 1][['ds', 'total_rides']].copy()

    X_test_station = df_test_station[X_train.columns.intersection(df_test_station.columns)]
    y_pred_station = best_xgb_model.predict(X_test_station)

    df_train_station.rename(columns={'ds': 'date'}, inplace=True)
    df_test_station.rename(columns={'ds': 'date'}, inplace=True)
    df_test_station['predictions'] = y_pred_station

    # Calculate R² score
    r2 = r2_score(df_test_station['total_rides'], y_pred_station)
    print(f"Station: {station_name}, R²: {r2:.2f}")

    # Time series plot
    plt.figure(figsize=(12, 6))
    plt.plot(df_train_station['date'], df_train_station['total_rides'], label='Train')
    plt.plot(df_test_station['date'], df_test_station['total_rides'], label='Test')
    plt.plot(df_test_station['date'], df_test_station['predictions'], label='Predicted', linestyle='--')  # Using df_test_station['predictions']
    plt.title(f'Predictions for {station_name} (R²: {r2:.2f})')
    plt.legend()

    # Format the x-axis
    plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.gcf().autofmt_xdate()

    plt.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_train.csv")


In [None]:
from sklearn.metrics import r2_score
for station, data in top_3_stations:
    station_name = station[len('start_station_name_'):]
    station_mask = test[station] == 1
    station_r2 = r2_score(y_test[station_mask], y_pred[station_mask])
    print(f"Station: {station_name}")
    print(f"  Train Mean: {data['train_mean']:.2f}")
    print(f"  Test Mean: {data['test_mean']:.2f}")
    print(f"  Potential Increase: {data['potential_increase']:.2f}")
    print(f"  Percentage Increase: {data['percentage_increase']:.2f}%")
    print(f"  R^2 Score: {station_r2:.2f}")

# Plotting the top 3 stations
station_names = [station[len('start_station_name_'):] for station, _ in top_3_stations]
train_means = [data['train_mean'] for _, data in top_3_stations]
test_means = [data['test_mean'] for _, data in top_3_stations]

plt.figure(figsize=(10, 6))
plt.bar(station_names, train_means, label='Train Mean', alpha=0.6)
plt.bar(station_names, test_means, label='Test Mean', alpha=0.6)
plt.xlabel('Station')
plt.ylabel('Mean Total Rides')
plt.title('Top 3 Stations by Potential Increase in Rides')
plt.legend()
plt.show()

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [7]:
test

Unnamed: 0,total_rides,temp_min_c,rain_sum_mm,snowfall_sum_cm,month,dayofweek,year,day,weekday,week_of_year,...,start_station_name_Wood St & Taylor St,start_station_name_Wood St & Taylor St (Temp),start_station_name_Wood St & Webster Ave,start_station_name_Woodlawn & 103rd - Olive Harvey Vaccination Site,start_station_name_Woodlawn Ave & 55th St,start_station_name_Woodlawn Ave & 58th St,start_station_name_Woodlawn Ave & 75th St,start_station_name_Woodlawn Ave & Lake Park Ave,start_station_name_Yates Blvd & 75th St,start_station_name_Yates Blvd & 93rd St
301680,1,16.9,0.0,0.00,8,0,2021,16,0,33,...,0,False,0,0,False,0,False,False,False,0
747251,32,18.9,0.0,0.00,7,3,2023,20,3,29,...,0,False,0,0,False,0,False,False,False,0
359415,13,-0.5,0.0,0.00,11,3,2021,18,3,46,...,0,False,0,0,False,0,False,False,False,0
214861,3,1.9,3.9,0.07,3,3,2021,18,3,11,...,0,False,0,0,False,0,False,False,False,0
753793,123,19.1,0.0,0.00,7,5,2023,29,5,30,...,0,False,0,0,False,0,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409607,6,-2.5,0.0,0.00,2,6,2022,27,6,8,...,0,False,0,0,False,0,False,False,False,0
43473,10,10.0,11.5,0.00,4,1,2020,7,1,15,...,0,False,0,0,False,0,False,False,False,0
825589,10,7.7,0.5,0.00,11,2,2023,8,2,45,...,0,False,0,0,False,0,False,False,False,0
622828,11,-3.1,0.0,0.00,1,0,2023,9,0,2,...,0,False,0,0,False,0,False,False,False,0


In [None]:
station_mask = test[station] == 1

In [5]:
y_test

Unnamed: 0,total_rides
301680,1
747251,32
359415,13
214861,3
753793,123
...,...
409607,6
43473,10
825589,10
622828,11


In [6]:
y_pred

array([14.784644 , 17.382305 ,  8.68138  , ..., 21.714663 ,  7.2776504,
       22.390568 ], dtype=float32)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ✅ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

from prophet.make_holidays import make_holidays_df

# Load training and test data
train = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_train.csv")
# Sample 1% of the training data
sampled_train = train.sample(frac=0.01, random_state=42)

# Split the sampled data into train and test
train, test = train_test_split(sampled_train, test_size=0.2, random_state=42)  # Adjust test_size as needed

# Ensure dates are in datetime format
train['ds'] = pd.to_datetime(train['date'])
test['ds'] = pd.to_datetime(test['date'])
train.drop(columns=['date'], inplace=True)
test.drop(columns=['date'], inplace=True)

# Create a combined dataframe with a marker column
train['is_train'] = True
test['is_train'] = False
combined = pd.concat([train, test])

# Feature engineering for XGBoost
for df in [combined]:
    df['day'] = df['ds'].dt.day
    df['month'] = df['ds'].dt.month
    df['year'] = df['ds'].dt.year
    df['weekday'] = df['ds'].dt.weekday
    df['week_of_year'] = df['ds'].dt.isocalendar().week
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['rain_intensity'] = pd.cut(df['rain_sum_mm'], bins=[-1, 0, 5, 20, np.inf], labels=[0, 1, 2, 3])
    df['snow_flag'] = (df['snowfall_sum_cm'] > 0).astype(int)
    df['rain_intensity'] = df['rain_intensity'].astype(float)
    df['weekend_rain'] = df['is_weekend'] * df['rain_intensity']
    df['cold_no_snow'] = (df['temp_min_c'] < 5) & (df['snow_flag'] == 0)

# Split back into train and test
train = combined[combined['is_train']].drop('is_train', axis=1)
test = combined[~combined['is_train']].drop('is_train', axis=1)

# Drop the datetime column, start_station_id
train.drop(columns=['ds', 'start_station_id'], inplace=True)
test.drop(columns=['ds', 'start_station_id'], inplace=True)

# Ensure only common stations exist in test before encoding
test = test[test['start_station_name'].isin(train['start_station_name'].unique())]

# One-hot encoding
train = pd.get_dummies(train, columns=['start_station_name'], drop_first=True)
test = pd.get_dummies(test, columns=['start_station_name'], drop_first=True)

# Align test columns with train columns
test = test.reindex(columns=train.columns, fill_value=0)

# Prepare data for XGBoost
X_train = train.drop(columns=['total_rides'])
y_train = train['total_rides']
X_test = test.drop(columns=['total_rides'])
y_test = test['total_rides']

best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 1000,
    'subsample': 0.8
}

best_xgb_model = XGBRegressor(**best_params, random_state=42)
best_xgb_model.fit(X_train, y_train)

# Ensure test features match train
X_test = X_test[X_train.columns]

# Make predictions
y_pred = best_xgb_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# R-squared score
r2 = best_xgb_model.score(X_test, y_test)
print(f'R-squared: {r2}')

# Calculate potential for each station
potential = {}
stations = train.columns[train.columns.str.startswith('start_station_name_')]
for station in stations:
    if station in test.columns:
        train_mean = train[train[station] == 1]['total_rides'].mean()
        test_mean = test[test[station] == 1]['total_rides'].mean()

        if not np.isnan(test_mean):
            potential[station] = {
                'train_mean': train_mean,
                'test_mean': test_mean,
                'potential_increase': test_mean - train_mean,
                'percentage_increase': (test_mean - train_mean) / train_mean * 100
            }

# Sort by potential increase and get top 3 without NaNs
top_3_stations = sorted(potential.items(), key=lambda x: x[1]['potential_increase'], reverse=True)[:3]
top_3_stations = [station_data for station_data in top_3_stations if not np.isnan(station_data[1]['test_mean'])]
top_3_stations = sorted(top_3_stations, key=lambda x: x[1]['potential_increase'], reverse=True)[:3]

# Plotting and R^2 calculation for top 3 stations
for station, data in top_3_stations:
    station_name = station[len('start_station_name_'):]

    # Get data for plotting
    df_train_station = train[train[station] == 1][['ds', 'total_rides']].copy()
    df_test_station = test[test[station] == 1][['ds', 'total_rides']].copy()

    X_test_station = df_test_station[X_train.columns.intersection(df_test_station.columns)]
    y_pred_station = best_xgb_model.predict(X_test_station)

    df_train_station.rename(columns={'ds': 'date'}, inplace=True)
    df_test_station.rename(columns={'ds': 'date'}, inplace=True)
    df_test_station['predictions'] = y_pred_station

    # Calculate R² score
    r2 = r2_score(df_test_station['total_rides'], y_pred_station)
    print(f"Station: {station_name}, R²: {r2:.2f}")

    # Time series plot
    plt.figure(figsize=(12, 6))
    plt.plot(df_train_station['date'], df_train_station['total_rides'], label='Train')
    plt.plot(df_test_station['date'], df_test_station['total_rides'], label='Test')
    plt.plot(df_test_station['date'], df_test_station['predictions'], label='Predicted', linestyle='--')  # Using df_test_station['predictions']
    plt.title(f'Predictions for {station_name} (R²: {r2:.2f})')
    plt.legend()

    # Format the x-axis
    plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.gcf().autofmt_xdate()

    plt.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_train.csv")


Mean Squared Error: 405.98046875
R-squared: 0.5762161016464233


KeyError: "['ds'] not in index"

In [2]:
for station, data in top_3_stations:
    station_name = station[len('start_station_name_'):]

    # Get data for plotting - using 'date' column instead of 'ds'
    df_train_station = train[train[station] == 1][['date', 'total_rides']].copy()
    df_test_station = test[test[station] == 1][['date', 'total_rides']].copy()

    # Ensure columns are aligned before prediction
    X_test_station = df_test_station[X_train.columns.intersection(df_test_station.columns)]
    y_pred_station = best_xgb_model.predict(X_test_station)

    df_train_station['date'] = pd.to_datetime(df_train_station['date']) # Convert 'date' to datetime
    df_test_station['date'] = pd.to_datetime(df_test_station['date'])   # Convert 'date' to datetime
    df_test_station['predictions'] = y_pred_station

    # Calculate R² score
    r2 = r2_score(df_test_station['total_rides'], y_pred_station)
    print(f"Station: {station_name}, R²: {r2:.2f}")

    # Time series plot
    plt.figure(figsize=(12, 6))
    plt.plot(df_train_station['date'], df_train_station['total_rides'], label='Train')
    plt.plot(df_test_station['date'], df_test_station['total_rides'], label='Test')
    plt.plot(df_test_station['date'], df_test_station['predictions'], label='Predicted', linestyle='--')
    plt.title(f'Predictions for {station_name} (R²: {r2:.2f})')
    plt.legend()

    # Format the x-axis
    plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.gcf().autofmt_xdate()

    plt.show()

KeyError: "['date'] not in index"