## Using best hyperparameters for XGBoost model:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# XGBoost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Mount Google Drive so the shared-drive CSV files are reachable from Colab.
from google.colab import drive

drive.mount('/content/drive')

# Read the station-level training and test tables.
train = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_train.csv")
test = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_test.csv")

# Replace the raw 'date' column with a parsed datetime column named 'ds'.
# pop() removes 'date' and hands its values to to_datetime in one step.
for frame in (train, test):
    frame['ds'] = pd.to_datetime(frame.pop('date'))

# Create lag features of the target for BOTH train and test.
#
# The lags used to be built on train only, so test had no y_lag* columns and
# the later `reindex(..., fill_value=0)` silently filled them with zeros,
# i.e. the model was evaluated on lag inputs it never saw during training.
# Here the lags are computed per station over the chronologically ordered
# train+test history, so the first test rows look back into the end of train.
# Only strictly earlier rows are used (shift >= 1).
lag_steps = [1, 2, 7, 30, 60, 90, 365]
lag_source_columns = ['start_station_name', 'ds', 'total_rides']

# keys=... tags every row with its origin so the result can be split again;
# the stable sort by date makes the shift independent of the file's row order.
ride_history = pd.concat(
    [train[lag_source_columns], test[lag_source_columns]],
    keys=['train', 'test'],
).sort_values('ds', kind='stable')
rides_by_station = ride_history.groupby('start_station_name')['total_rides']

for lag in lag_steps:
    # A shift of `lag` rows equals `lag` days only if each station has one row
    # per day - the same assumption the original train-only shift made.
    lagged = rides_by_station.shift(lag)
    # xs() strips the origin tag; assignment then aligns on the original
    # (unique) row index of each frame, so row order in train/test is kept.
    train[f'y_lag{lag}'] = lagged.xs('train')
    test[f'y_lag{lag}'] = lagged.xs('test')

# Feature engineering for XGBoost (applied in place to train and test).
for df in [train, test]:
    # --- Calendar features derived from the date column ---
    df['day'] = df['ds'].dt.day
    df['month'] = df['ds'].dt.month
    df['year'] = df['ds'].dt.year
    df['weekday'] = df['ds'].dt.weekday
    # isocalendar() returns a nullable UInt32 column; cast to a plain integer
    # so every feature column has an ordinary numeric dtype.
    df['week_of_year'] = df['ds'].dt.isocalendar().week.astype(int)
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

    # --- Weather features ---
    # Bins: (-1, 0] -> 0, (0, 5] -> 1, (5, 20] -> 2, (20, inf) -> 3.
    df['rain_intensity'] = pd.cut(df['rain_sum_mm'], bins=[-1, 0, 5, 20, np.inf], labels=[0, 1, 2, 3])
    df['snow_flag'] = (df['snowfall_sum_cm'] > 0).astype(int)

    # --- Demand-history features ---
    # These are built from PAST rides only (shift(1) before aggregating).
    # The previous version included the current row's total_rides in the
    # rolling means and in the growth ratio, leaking the target into the
    # features. Rows are ordered by date first so "past" is chronological;
    # assignment back to df aligns on the (unique) row index.
    ordered_rides = df.sort_values('ds', kind='stable').groupby('start_station_name')['total_rides']
    df['rolling_avg_7'] = ordered_rides.transform(
        lambda x: x.shift(1).rolling(window=7, min_periods=1).mean()
    )
    df['rolling_avg_30'] = ordered_rides.transform(
        lambda x: x.shift(1).rolling(window=30, min_periods=1).mean()
    )
    # 7-day relative change of the previous day's rides. A zero denominator
    # yields +/-inf (or NaN for 0/0); both are mapped to 0 so no infinite
    # value ends up in the feature matrix.
    weekly_growth = ordered_rides.transform(lambda x: x.shift(1) / x.shift(8) - 1)
    df['ride_growth'] = weekly_growth.replace([np.inf, -np.inf], np.nan).fillna(0)

    # --- Interaction features ---
    df['rain_intensity'] = df['rain_intensity'].astype(float)
    df['weekend_rain'] = df['is_weekend'] * df['rain_intensity']
    # Stored as 0/1 like the other flag columns.
    df['cold_no_snow'] = ((df['temp_min_c'] < 5) & (df['snow_flag'] == 0)).astype(int)

# Drop the datetime column and the station id; neither is a model feature.
# (Year/month/day columns created earlier still carry the calendar date.)
train.drop(columns=['ds', 'start_station_id'], inplace=True)
test.drop(columns=['ds', 'start_station_id'], inplace=True)

# Keep only test rows whose station also occurs in train. copy() makes the
# filtered frame independent so the column assignment below is unambiguous.
test = test[test['start_station_name'].isin(train['start_station_name'].unique())].copy()

# One-hot encode with ONE shared category list taken from train.
# Encoding train and test independently with drop_first=True drops each
# frame's own first station; if test lacks train's first station, a different
# station loses its column in test and is later zero-filled by reindex, which
# makes its rows indistinguishable from the reference station.
station_levels = pd.Categorical(train['start_station_name']).categories
train['start_station_name'] = pd.Categorical(train['start_station_name'], categories=station_levels)
test['start_station_name'] = pd.Categorical(test['start_station_name'], categories=station_levels)

train = pd.get_dummies(train, columns=['start_station_name'], drop_first=True)
test = pd.get_dummies(test, columns=['start_station_name'], drop_first=True)

# Align test columns with train columns. Columns that exist only in train
# are created in test and filled with 0.
test = test.reindex(columns=train.columns, fill_value=0)

# Split both frames into the feature matrix and the 'total_rides' target.
X_train, y_train = train.drop(columns=['total_rides']), train['total_rides']
X_test, y_test = test.drop(columns=['total_rides']), test['total_rides']

# Hyperparameters selected by an earlier tuning run.
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=1000,
    subsample=0.8,
)

# Fit the regressor with a fixed seed so the row/column subsampling is repeatable.
best_xgb_model = XGBRegressor(random_state=42, **best_params)
best_xgb_model.fit(X_train, y_train)

# Present the test features in exactly the training column order.
X_test = X_test.loc[:, X_train.columns]

# Predict on the held-out period.
y_pred = best_xgb_model.predict(X_test)

# Report the root mean squared error on the test set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

# Recursive multi-step forecast for the three stations with the largest
# increase in mean rides from train to test, followed by a plot.
#
# Fixes relative to the previous version:
#   * 'ds' no longer exists in train/test at this point (it is dropped before
#     training), so dates are rebuilt from the year/month/day feature columns.
#   * The forecast starts the day AFTER the last observed test date.
#   * Each station gets its own forecast with exactly one station dummy set,
#     instead of setting all top-3 dummies on the same row.
#   * Lag features are seeded from that station's own chronological history
#     (train + test) and then from earlier predictions.
#   * weekday / is_weekend are real values (the method was never called before).
#   * 'ds' and 'predicted_rides' are kept in future_df for printing/plotting.

# Forecast horizon (number of days to forecast into the future)
forecast_horizon = 365
lag_steps = [1, 2, 7, 30, 60, 90, 365]
station_prefix = 'start_station_name_'

# Rebuild calendar dates of the test rows from their year/month/day columns.
test_dates = pd.to_datetime(test[['year', 'month', 'day']])

# Future dates begin one day after the last observed date.
future_dates = pd.date_range(
    start=test_dates.max() + pd.Timedelta(days=1),
    periods=forecast_horizon,
    freq='D',
)

# Potential of a station = mean rides in test minus mean rides in train.
# The reference station removed by drop_first has no dummy column and is
# therefore not ranked.
potential = {}
stations = train.columns[train.columns.str.startswith(station_prefix)]
for station in stations:
    train_mean = train.loc[train[station] == 1, 'total_rides'].mean()
    test_mean = test.loc[test[station] == 1, 'total_rides'].mean()
    potential[station] = test_mean - train_mean

# Rank by potential and keep the top 3. Stations missing from either period
# have a NaN potential; they are removed because NaN cannot be ordered.
ranked_potential = pd.Series(potential, dtype=float).dropna().sort_values(ascending=False, kind='stable')
top_3_stations = list(ranked_potential.head(3).items())
top_station_columns = [name for name, _ in top_3_stations]

# Chronological ride history (train followed by test) used to seed the lags.
history_columns = ['year', 'month', 'day', 'total_rides'] + top_station_columns
observed = pd.concat([train[history_columns], test[history_columns]], ignore_index=True)
observed['ds'] = pd.to_datetime(observed[['year', 'month', 'day']])
observed = observed.sort_values('ds', kind='stable')

# Feature template: one row per future date, all features 0 by default.
# NOTE(review): features whose future values are unknown here (weather,
# rolling averages, ride_growth, ...) stay at 0, as in the previous version;
# supply forecasts for them if available.
feature_columns = list(X_train.columns)
feature_template = pd.DataFrame(0.0, index=range(forecast_horizon), columns=feature_columns)
calendar_features = {
    'day': future_dates.day,
    'month': future_dates.month,
    'year': future_dates.year,
    'weekday': future_dates.weekday,
    'week_of_year': future_dates.isocalendar().week,
    'is_weekend': future_dates.weekday >= 5,
}
for feature_name, feature_values in calendar_features.items():
    # Only fill columns the model was trained on; never add new ones.
    if feature_name in feature_template.columns:
        feature_template[feature_name] = np.asarray(feature_values, dtype=float)

# Lag columns that are actually part of the model's feature set.
lag_columns = {lag: f'y_lag{lag}' for lag in lag_steps if f'y_lag{lag}' in feature_template.columns}

# Recursive forecasting, one independent pass per station.
station_forecasts = []
for station in top_station_columns:
    station_name = station[len(station_prefix):]

    # Observed rides of this station in date order; predictions are appended
    # as they are made, so rides[-lag] is always "lag steps before the step
    # being predicted", whether that value is observed or predicted.
    rides = observed.loc[observed[station] == 1, 'total_rides'].tolist()

    station_features = feature_template.copy()
    station_features[station] = 1.0  # one-hot: only this station is active

    station_predictions = []
    for step in range(forecast_horizon):
        for lag, lag_column in lag_columns.items():
            # NaN (treated as missing by XGBoost) when history is too short.
            station_features.at[step, lag_column] = rides[-lag] if len(rides) >= lag else np.nan

        current_prediction = float(best_xgb_model.predict(station_features.iloc[[step]])[0])
        rides.append(current_prediction)
        station_predictions.append(current_prediction)

    # Attach the non-feature columns only after all predictions are made so
    # the rows passed to predict() contain exactly the training features.
    station_features['ds'] = future_dates
    station_features['station'] = station_name
    station_features['predicted_rides'] = station_predictions
    station_forecasts.append(station_features)

# Combine the per-station forecasts (feature columns + ds/station/prediction).
if station_forecasts:
    future_df = pd.concat(station_forecasts, ignore_index=True)
else:
    future_df = pd.DataFrame(columns=feature_columns + ['ds', 'station', 'predicted_rides'])
future_predictions = future_df['predicted_rides'].tolist()

# Print the predictions
print(future_df[['ds', 'station', 'predicted_rides']])

# Basic visualization using matplotlib
plt.figure(figsize=(12, 6))
# Actual rides of every test row (all stations), drawn as a faint background.
plt.plot(test_dates, test['total_rides'], label='Actual (Test Set - All Stations)', color='gray', alpha=0.5)

for station in top_station_columns:
    station_name = station[len(station_prefix):]

    # Rows of this station's forecast; x and y come from the same rows so
    # their lengths always match.
    station_rows = future_df[future_df[station] == 1]
    plt.plot(station_rows['ds'], station_rows['predicted_rides'], label=f'Predicted ({station_name})')

plt.xlabel('Date')
plt.ylabel('Total Rides')
plt.title('Bike Rides Forecast (Top 3 Stations)')
plt.legend()
plt.grid(True)
plt.show()

Mounted at /content/drive


  train = pd.read_csv("/content/drive/Shared drives/Time Series/divvy_data/prod/station/divvy_station_train.csv")


KeyboardInterrupt: 