In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Load the weather data
weather_data = pd.read_csv('/Users/lijunyi/Downloads/cs506/bus performance/weather/boston_weather_data_2022.csv')

# Define the months to be included in the analysis
months = [1,2,3,4,5,6,7,8,9,10,11,12]

# Collect one DataFrame per month and concatenate once after the loop.
# Calling pd.concat inside the loop would re-copy every previously loaded
# row on each iteration.
monthly_frames = []

# Iterate over the selected months
for month in months:
    month_str = str(month).zfill(2)  # Ensure month is two digits
    # File name is relative, so it is resolved against the current working directory
    file_name = f'MBTA-Bus-Arrival-Departure-Times_2022-{month_str}.csv'
    bus_data_month = pd.read_csv(file_name)

    # Convert scheduled and actual times to datetime
    bus_data_month['scheduled'] = pd.to_datetime(bus_data_month['scheduled'], format='%Y-%m-%d %H:%M:%S.%f')
    bus_data_month['actual'] = pd.to_datetime(bus_data_month['actual'], format='%Y-%m-%d %H:%M:%S.%f')

    # Delay in minutes (actual - scheduled). Rows where either timestamp is
    # missing yield NaN here; they are dropped after the loop.
    bus_data_month['delay'] = (bus_data_month['actual'] - bus_data_month['scheduled']).dt.total_seconds() / 60

    monthly_frames.append(bus_data_month)

# Single concatenation of all monthly frames
selected_bus_data = pd.concat(monthly_frames)

# Drop rows where 'delay' is NaN
selected_bus_data = selected_bus_data.dropna(subset=['delay'])

# Merge weather data with selected bus data based on the service_date.
# pd.merge defaults to an inner join, so bus rows without a matching weather
# row are dropped.
# NOTE(review): assumes 'service_date' and 'time' share the same dtype/format — confirm.
merged_data = pd.merge(selected_bus_data, weather_data, left_on='service_date', right_on='time')

# One-hot encode the 'route_id' column as a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name; try the new keyword first and fall back for older
# installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
route_id_encoded = encoder.fit_transform(merged_data[['route_id']])

# Prepare the data for training the model
X_numeric = merged_data[['tavg']]  # Numeric features
# Reuse merged_data's index so the column-wise concat below aligns row for row
# regardless of what index merged_data carries.
X_encoded = pd.DataFrame(
    route_id_encoded,
    columns=encoder.get_feature_names_out(['route_id']),
    index=merged_data.index,
)  # Encoded features
# Column order is 'tavg' first, then the one-hot route_id columns; prediction
# inputs must use the same order.
X = pd.concat([X_numeric, X_encoded], axis=1)
y = merged_data['delay']  # label: delay time

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the linear regression model and fit it on the training portion
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows and report the mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# The prediction function needs to use the same one-hot encoding as the model
def predict_delay(route_id, temperature):
    """Predict the delay for one route at a given average temperature.

    Uses the notebook-level ``encoder`` (fitted OneHotEncoder) and ``model``
    (fitted LinearRegression).

    Parameters
    ----------
    route_id : value convertible to str
        Route identifier; it must be a category the encoder saw during
        fitting, otherwise ``encoder.transform`` raises.
    temperature : number
        Value for the 'tavg' feature.

    Returns
    -------
    The first (only) element of ``model.predict`` for the single-row input.
    """
    # NOTE(review): assumes route ids were read as strings in the training
    # data — confirm against the CSV dtype.
    route_id = str(route_id)

    # Encode the route_id with the already-fitted encoder
    route_id_encoded = encoder.transform([[route_id]])
    encoded_features = pd.DataFrame(route_id_encoded, columns=encoder.get_feature_names_out(['route_id']))

    # The model was fitted on columns ordered as 'tavg' followed by the one-hot
    # route_id columns. Build the row in that same order: appending 'tavg'
    # last would mismatch the fitted feature order.
    features = pd.concat([pd.DataFrame({'tavg': [temperature]}), encoded_features], axis=1)

    # Predict delay
    predicted_delay = model.predict(features)
    return predicted_delay[0]



Mean Squared Error: 31.08085015949543


In [17]:
# Assuming 'model' and 'encoder' have been previously fitted with the appropriate training data

# The prediction function needs to use the same one-hot encoding as the model
def predict_delay(route_id, temperature):
    """Return the model's predicted delay for a single route/temperature pair.

    Relies on the notebook-level ``encoder`` and ``model`` having been fitted
    already. ``route_id`` is converted to ``str`` before encoding;
    ``temperature`` becomes the 'tavg' feature. The result is the first
    element of ``model.predict`` for the one-row input.
    """
    route_key = str(route_id)

    # One-hot row for the requested route, labelled with the encoder's feature names
    onehot_row = encoder.transform([[route_key]])
    onehot_frame = pd.DataFrame(
        onehot_row,
        columns=encoder.get_feature_names_out(['route_id']),
    )

    # 'tavg' goes first, followed by the route_id indicator columns
    temperature_frame = pd.DataFrame({'tavg': [temperature]})
    model_input = pd.concat(
        [temperature_frame.reset_index(drop=True), onehot_frame.reset_index(drop=True)],
        axis=1,
    )

    return model.predict(model_input)[0]

# Example usage: route_id given as a string, temperature in Celsius
route_id_input, temperature_input = '57', -10

# Run the prediction for that pair and report it rounded to two decimals
predicted_delay = predict_delay(route_id_input, temperature_input)
print(f"The predicted delay for route {route_id_input} at {temperature_input}°C is approximately {predicted_delay:.2f} minutes.")


The predicted delay for route 57 at -10°C is approximately 5.46 minutes.


