In [50]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [51]:
# Load the cleaned dataset from disk and preview its first five rows
data_path = "../data/cleaned_data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,date,location_key,new_persons_vaccinated,cumulative_persons_vaccinated,new_confirmed,mobility_workplaces
0,2022-05-09,US_VA,930.0,7326007.0,0.0,-13.0
1,2021-11-16,US_IA,11191.0,1934013.0,0.0,-11.0
2,2022-02-03,US_GA,0.0,6766353.0,485.0,23.0
3,2021-10-15,US_KY,2794.0,2767917.0,0.0,-21.0
4,2021-04-29,US_MN,20930.0,2621229.0,205.0,-14.0


In [52]:
# Regress new confirmed cases on the vaccination and workplace-mobility columns
feature_names = ['new_persons_vaccinated', 'cumulative_persons_vaccinated', 'mobility_workplaces']
X = df.loc[:, feature_names]
y = df.loc[:, 'new_confirmed']

# fit() returns the estimator itself, so construction and fitting can be chained
model = BayesianRidge().fit(X, y)
model

0,1,2
,max_iter,300
,tol,0.001
,alpha_1,1e-06
,alpha_2,1e-06
,lambda_1,1e-06
,lambda_2,1e-06
,alpha_init,
,lambda_init,
,compute_score,False
,fit_intercept,True


In [53]:
# Show the fitted parameters: one coefficient per feature (same order as the
# columns of X), followed by the intercept term
for label, value in (("Coefficients:", model.coef_), ("Intercept:", model.intercept_)):
    print(label, value)

Coefficients: [-6.47411227e-05  1.15050359e-05 -5.55077174e-06]
Intercept: 10.178419526062541


In [54]:
# Score the model's predictions for X against y
y_pred = model.predict(X)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

# Adjusted R² corrects R² for the number of predictors relative to sample size
n, p = X.shape  # (number of samples, number of predictors)
unexplained = 1 - r2
adjusted_r2 = 1 - unexplained * (n - 1) / (n - p - 1)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
print("Adjusted R²:", adjusted_r2)

Mean Squared Error: 1335089.2112302792
R^2 Score: 0.003749731339316109
Adjusted R²: 0.003675005096542394


In [55]:
# Peek at the first row only, as a reminder of the column layout
df.iloc[:1]

Unnamed: 0,date,location_key,new_persons_vaccinated,cumulative_persons_vaccinated,new_confirmed,mobility_workplaces
0,2022-05-09,US_VA,930.0,7326007.0,0.0,-13.0


In [56]:
# Keep an independent snapshot of df so it can be restored after df is modified
base_df = df.copy(deep=True)

In [61]:
# Expanding-window comparison of OLS vs. Bayesian ridge regression.
#
# Both models are refit on every prefix of the training period (all rows dated
# on or before each training date) and scored by mean squared error on one
# fixed, later test period. The resulting curves show how test error changes as
# more history is added to the training set.
#
# Relies on the notebook's import cell for pd, go, Path, LinearRegression,
# BayesianRidge and mean_squared_error, and on `base_df` from the cell above.

# --- Configuration -----------------------------------------------------------
FEATURE_COLS = ['new_persons_vaccinated', 'cumulative_persons_vaccinated', 'mobility_workplaces']
TARGET_COL = 'new_confirmed'
EARLY_CUTOFF = "2021-03-01"   # only rows dated strictly before this are kept
TRAIN_QUANTILE = 0.7          # quantile of the date column used as train/test boundary
N_XTICKS = 8                  # approximate number of labelled x-axis ticks
PLOT_PATH = Path("../plots/covid_regression.html")

# --- Prepare data ------------------------------------------------------------
# Start from the untouched snapshot so re-running this cell gives the same result
df = base_df.copy()

# Sort by date
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date').reset_index(drop=True)
df = df[df['date'] < EARLY_CUTOFF]  # Subset to focus on early info

# Features and target for the whole early subset. Not used further in this
# cell; kept because the original cell rebinds X and y here.
X = df[FEATURE_COLS]
y = df[TARGET_COL]

# --- Time-based train/test split ---------------------------------------------
split_date = df['date'].quantile(TRAIN_QUANTILE)
train_df = df[df['date'] <= split_date]
test_df = df[df['date'] > split_date]

# Fail early with a clear message instead of an obscure error further down
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Time-based split at {split_date} produced an empty set "
        f"(train={len(train_df)} rows, test={len(test_df)} rows)"
    )

X_test = test_df[FEATURE_COLS]
y_test = test_df[TARGET_COL]

# --- Expanding training windows ----------------------------------------------
# train_df is already ordered by date, so unique() yields ascending dates
unique_dates = train_df['date'].unique()
results = []

for d in unique_dates:
    # Everything observed up to and including date d
    subset = train_df[train_df['date'] <= d]
    X_sub = subset[FEATURE_COLS]
    y_sub = subset[TARGET_COL]

    # Fit models
    ols = LinearRegression().fit(X_sub, y_sub)
    bayes = BayesianRidge().fit(X_sub, y_sub)

    # Evaluate both on the same held-out test period
    results.append({
        'date': d,
        'OLS': mean_squared_error(y_test, ols.predict(X_test)),
        'Bayes': mean_squared_error(y_test, bayes.predict(X_test)),
        'n_samples': len(X_sub)
    })

# Results DataFrame: one row per training end date
mle = pd.DataFrame(results)

# --- X-axis labels of the form "YYYY-MM-DD (n=...)" --------------------------
step = max(1, len(mle) // N_XTICKS)
tick_rows = mle.iloc[::step]  # take date and sample count from the same rows
xticks = tick_rows['date']
xtick_labels = [
    f"{tick_date.strftime('%Y-%m-%d')} (n={n_obs})"
    for tick_date, n_obs in zip(tick_rows['date'], tick_rows['n_samples'])
]

# --- Build Plotly figure -----------------------------------------------------
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=mle['date'],
    y=mle['OLS'],
    mode='lines+markers',
    name='OLS',
    line=dict(color='black'),
    marker=dict(symbol='circle', size=6)
))

fig.add_trace(go.Scatter(
    x=mle['date'],
    y=mle['Bayes'],
    mode='lines+markers',
    name='Bayesian Ridge',
    line=dict(color='royalblue'),
    marker=dict(symbol='square', size=6)
))

# Customize layout
fig.update_layout(
    title=f'Model Performance Over Time tested against data after {str(split_date)[:10]}',
    xaxis_title='Training End Date',
    yaxis_title='Mean Squared Error (on Test Set)',
    template='plotly_white',
    legend=dict(x=0.95, y=0.02, borderwidth=0),
    width=900,
    height=550
)

# Apply custom x-axis ticks
fig.update_xaxes(
    tickvals=xticks,
    ticktext=xtick_labels,
    tickangle=45
)

# Make sure the output directory exists before writing the HTML export
PLOT_PATH.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(PLOT_PATH), include_plotlyjs='cdn')
fig.show()
