In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score




# Read the filtered rentals CSV and convert the rental start column from
# text to pandas datetimes so the .dt accessor can be used on it later.
raw_csv = "/Users/daisy/Desktop/Purdue/590 Computing for Analytics/Final Project/filtered_data 1.csv"
start_col = 'start_rental_date_time'
df = pd.read_csv(raw_csv)
df[start_col] = pd.to_datetime(df[start_col])

### Filter to Top 5 Stations

In [3]:
# Restrict the data to rentals that start at one of the five stations with
# the most rows. The .copy() gives an independent frame so columns can be
# added to it without touching `df`.
station_counts = df['start_station_name'].value_counts()
top_5_stations = station_counts.nlargest(5).index
starts_at_top5 = df['start_station_name'].isin(top_5_stations)
df_top5 = df.loc[starts_at_top5].copy()

### Extract Time Features

In [4]:
# Derive calendar features from the rental start timestamp.
start_ts = df_top5['start_rental_date_time']
df_top5['hour'] = start_ts.dt.hour                # 0-23
df_top5['date'] = start_ts.dt.date                # calendar date, time of day dropped
df_top5['day_of_week'] = start_ts.dt.dayofweek    # 0 = Monday ... 6 = Sunday
# Weekend flag: 1 for Saturday/Sunday (day_of_week >= 5), else 0.
# Vectorised comparison instead of a per-row lambda; same 0/1 integers.
df_top5['is_weekend'] = (df_top5['day_of_week'] >= 5).astype(int)

### Aggregate Trips

In [5]:
# Count rentals for each observed station / date / hour combination.
# day_of_week and is_weekend are included in the keys so they are carried
# through into the aggregated table.
group_keys = ['start_station_name', 'date', 'hour', 'day_of_week', 'is_weekend']
trips_per_slot = df_top5.groupby(group_keys).size()
grouped = trips_per_slot.reset_index(name='trip_count')

# Replace the station name with one indicator column per station
# (columns are named start_station_name_<station>).
encoded = pd.get_dummies(grouped, columns=['start_station_name'])


### Prepare Features

- hour : Hour of the day (0 to 23)

- day_of_week : Day of the week (0 = Monday, ..., 6 = Sunday)

- is_weekend : 1 if Saturday or Sunday, 0 otherwise

- start_station_name_* : One-hot encoded station indicators, one column per station (only the top 5 most frequent stations are included)

In [6]:
# Target is the hourly trip count; the features are every remaining column
# except the raw calendar date.
y = encoded['trip_count']
X = encoded.drop(['trip_count', 'date'], axis=1)


### Train-Test Split

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts



### Train Random Forest

In [8]:
# Fit a random forest regressor on the training rows:
# 100 trees, depth capped at 10, seeded for reproducibility.
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)


### Model Evaluation

In [9]:
# Score the model on the held-out rows.
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Collect both metrics into a small two-column table for display.
metric_rows = [
    ("Mean Absolute Error", mae),
    ("R^2 Score", r2),
]
results_df = pd.DataFrame(metric_rows, columns=["Metric", "Value"])

print(results_df)

                Metric     Value
0  Mean Absolute Error  4.009338
1            R^2 Score  0.890836


In [10]:
# Pick the row of the aggregated table with the largest hourly trip count
# (idxmax returns the index label of the first maximum).
busiest_idx = grouped['trip_count'].idxmax()
max_row = grouped.loc[busiest_idx]

print("Station with highest hourly traffic:", max_row, sep="\n")

Station with highest hourly traffic:
start_station_name    Belgrove Street , King's Cross
date                                      2017-01-09
hour                                               8
day_of_week                                        0
is_weekend                                         0
trip_count                                       208
Name: 122, dtype: object


### Test Example

In [14]:
# Predict demand for one station over a full week (every day-of-week / hour
# combination) and report the slot with the highest predicted trip count.

# Step 1: Specify the station you want to test.
# The name must match the raw `start_station_name` value exactly (this one
# has a space before the comma). The feature column and the printed label
# are both derived from it so they cannot drift apart.
station_name = "Belgrove Street , King's Cross"
station_col = f"start_station_name_{station_name}"

# Step 2: Check if the station was used in training
if station_col in X_train.columns:
    # Step 3: Create a grid for all hours (0-23) and days of the week (0-6):
    # 7 * 24 = 168 rows.
    test_grid = pd.DataFrame(
        [(h, d) for d in range(7) for h in range(24)],
        columns=['hour', 'day_of_week'],
    )
    # Same weekend rule as the training features: Saturday/Sunday -> 1.
    test_grid['is_weekend'] = (test_grid['day_of_week'] >= 5).astype(int)

    # Steps 4 + 6: add every remaining model feature (the station indicator
    # columns) filled with 0 and put the columns in training order, in one go.
    test_grid = test_grid.reindex(columns=X_train.columns, fill_value=0)

    # Step 5: Mark the target station column as 1
    test_grid[station_col] = 1

    # Step 7: Predict trip count. The right-hand side is evaluated before the
    # new column is attached, so the model only sees the training features.
    test_grid['predicted_trips'] = rf_model.predict(test_grid)

    # Step 8: Find the hour/day with the highest predicted demand
    max_pred = test_grid.loc[test_grid['predicted_trips'].idxmax()]
    print(f"Most active time for {station_name}:")
    print(f"Day of Week: {int(max_pred['day_of_week'])} (0=Mon, 6=Sun)")
    print(f"Hour: {int(max_pred['hour'])}:00")
    print(f"Predicted Trips: {max_pred['predicted_trips']:.2f}")

else:
    print("Station not found in training data.")


Most active time for Belgrove Street, King's Cross:
Day of Week: 3 (0=Mon, 6=Sun)
Hour: 8:00
Predicted Trips: 146.70
