Build a regression model.

In [None]:
!pip install statsmodels;

In [4]:
import pandas as pd
import statsmodels.api as sm

# Read the per-station summary and the combined table from disk.
station_summary = pd.read_csv("../data/station_summary.csv")
full_combined_clean = pd.read_csv("../data/combined_all.csv")

# Per-station aggregates used as additional regressors:
#   avg_rating       -> mean of "rating" within each station_name
#   review_count_sum -> sum of "review_count" within each station_name
by_station = full_combined_clean.groupby("station_name")
avg_rating = by_station["rating"].mean().rename("avg_rating").reset_index()
total_reviews = by_station["review_count"].sum().rename("review_count_sum").reset_index()

# Attach both aggregates to the station table; a left join keeps every
# station, leaving NaN where no matching station_name exists.
station_summary = (
    station_summary
    .merge(avg_rating, on="station_name", how="left")
    .merge(total_reviews, on="station_name", how="left")
)

# Design matrix (with an intercept column) and response for the full model.
predictor_cols = ["poi_count", "avg_rating", "review_count_sum"]
X = sm.add_constant(station_summary[predictor_cols])
y = station_summary["num_bikes"]

Provide model output and an interpretation of the results. 

In [6]:
# Place predictors and response side by side, then keep complete rows only.
regression_df = pd.concat([X, y], axis="columns").dropna(how="any")

# Split the complete-case frame back into design matrix and response.
X_clean = regression_df.loc[:, X.columns]
y_clean = regression_df.loc[:, y.name]

# Fit ordinary least squares on the complete cases and print the summary table.
model = sm.OLS(endog=y_clean, exog=X_clean).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              num_bikes   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     2.624
Date:                Mon, 28 Jul 2025   Prob (F-statistic):             0.0529
Time:                        16:35:14   Log-Likelihood:                -425.79
No. Observations:                 146   AIC:                             859.6
Df Residuals:                     142   BIC:                             871.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               13.4957      7.489  

In [7]:
# Refit without avg_rating: it had a 0.351 p-value in the full model and may be
# one of the culprits in the multicollinearity.
reduced_cols = ["poi_count", "review_count_sum"]
X_reduced = sm.add_constant(station_summary[reduced_cols])
y = station_summary["num_bikes"]

# Keep complete rows only.
# NOTE(review): avg_rating is no longer part of the frame passed to dropna(),
# so this model may be fitted on a different set of rows than the full model.
regression_df_reduced = pd.concat([X_reduced, y], axis="columns").dropna(how="any")
X_clean = regression_df_reduced.loc[:, X_reduced.columns]
y_clean = regression_df_reduced.loc[:, y.name]

# Fit ordinary least squares and print the summary table.
model_reduced = sm.OLS(endog=y_clean, exog=X_clean).fit()
print(model_reduced.summary())

                            OLS Regression Results                            
Dep. Variable:              num_bikes   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     4.627
Date:                Mon, 28 Jul 2025   Prob (F-statistic):             0.0108
Time:                        16:41:38   Log-Likelihood:                -614.90
No. Observations:                 207   AIC:                             1236.
Df Residuals:                     204   BIC:                             1246.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                4.7513      0.684  

In [8]:
# Final model: poi_count is the only predictor, plus an intercept column.
X_final = sm.add_constant(station_summary[["poi_count"]])

# Pair the predictor with the response (y comes from an earlier cell) and
# keep complete rows only.
regression_df_final = pd.concat([X_final, y], axis="columns").dropna(how="any")
X_clean = regression_df_final.loc[:, X_final.columns]
y_clean = regression_df_final.loc[:, y.name]

# Fit ordinary least squares and print the summary table.
model_final = sm.OLS(endog=y_clean, exog=X_clean).fit()
print(model_final.summary())

                            OLS Regression Results                            
Dep. Variable:              num_bikes   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     9.284
Date:                Mon, 28 Jul 2025   Prob (F-statistic):            0.00262
Time:                        16:43:36   Log-Likelihood:                -614.91
No. Observations:                 207   AIC:                             1234.
Df Residuals:                     205   BIC:                             1240.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.7218      0.637      7.416      0.0

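### Across all three models, the number of nearby POIs (`poi_count`) was the only consistent and "significant" predictor of bike availability (`num_bikes`), suggesting that stations in denser areas are more likely to be well-stocked with bikes. However, each model explains only about 4–5% of the variance in `num_bikes` (R² between 0.043 and 0.053), so the relationship is weak and POI count alone is not a reliable predictor of bike availability.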