Build a regression model.

In [27]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model, datasets
from sklearn.datasets import fetch_california_housing

Provide model output and an interpretation of the results. 

In [16]:
# Load the joined station / point-of-interest table.
# 'Unnamed: 0' is presumably an index column written by an earlier to_csv
# call (TODO confirm); it carries no information, so it is dropped on load.
stations_joined = pd.read_csv('stations_joined.csv').drop(columns=['Unnamed: 0'])
stations_joined.head(3)

Unnamed: 0,avg_poi_rating,avg_poi_review_count,avg_poi_price,avg_poi_distance,avg_poi_popularity,poi_popularity_sum,id,latitude,longitude,name,total_slots
0,3.8,204.0,1.9,386.198279,767.3,15346.0,7a19c49f486d7c0c02b3685d7b240448,49.262487,-123.114397,10th & Cambie,35
1,3.928571,80.428571,1.714286,156.174324,302.357143,2116.5,32603a87cfca71d0f7dfa3513bad69d5,49.274566,-123.121817,Yaletown-Roundhouse Station,16
2,3.03125,86.25,1.375,120.42524,307.78125,4924.5,6d42fa40360f9a6b2bf641c7b8bb2862,49.279764,-123.110154,Dunsmuir & Beatty,26


In [22]:
# List the column labels of the loaded frame, to pick features and target from.
stations_joined.columns

Index(['avg_poi_rating', 'avg_poi_review_count', 'avg_poi_price',
       'avg_poi_distance', 'avg_poi_popularity', 'poi_popularity_sum', 'id',
       'latitude', 'longitude', 'name', 'total_slots'],
      dtype='object')

In [23]:
# Define variables.
# Features: every column except the station identifiers, the coordinates and
# the target itself. errors='ignore' keeps the original semantics, where an
# excluded label that is absent from the frame is simply skipped.
X = stations_joined.drop(
    columns=['id', 'latitude', 'longitude', 'name', 'total_slots'],
    errors='ignore',
)
# Target: number of slots at each station.
y = stations_joined['total_slots']

In [24]:
# Check data: (rows, columns) of the feature matrix
X.shape

(206, 6)

In [25]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,avg_poi_rating,avg_poi_review_count,avg_poi_price,avg_poi_distance,avg_poi_popularity,poi_popularity_sum
0,3.8,204.0,1.9,386.198279,767.3,15346.0
1,3.928571,80.428571,1.714286,156.174324,302.357143,2116.5
2,3.03125,86.25,1.375,120.42524,307.78125,4924.5
3,3.55,65.8,1.5,160.393656,259.85,2598.5
4,3.275,117.6,1.6,152.997636,411.525,8230.5


In [26]:
# Preview the first values of the target (total_slots)
y.head()

0    35
1    16
2    26
3    16
4    16
Name: total_slots, dtype: int64

In [28]:
# statsmodels' OLS does not add an intercept on its own, so give the design
# matrix an explicit constant column first.
X = sm.add_constant(X)
# Unfitted OLS specification: y (total_slots) regressed on the columns of X.
lin_reg = sm.OLS(endog=y, exog=X)

In [29]:
# Estimate the coefficients and show the plain-text regression report.
model = lin_reg.fit()
print_model = model.summary()
print(print_model.as_text())

                            OLS Regression Results                            
Dep. Variable:            total_slots   R-squared:                       0.019
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.6365
Date:                Mon, 27 Feb 2023   Prob (F-statistic):              0.701
Time:                        21:12:29   Log-Likelihood:                -638.32
No. Observations:                 206   AIC:                             1291.
Df Residuals:                     199   BIC:                             1314.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   20.6618 

In [33]:
# Backward elimination, fast-forwarded: predictors were removed one at a time
# over three steps (intermediate fits are not shown), leaving the three below.
# Selecting with an explicit label list, rather than filtering the columns
# through isin, makes a misspelled or missing feature raise KeyError instead
# of being silently left out of the model.
X = stations_joined[['avg_poi_rating', 'avg_poi_price', 'poi_popularity_sum']]
# Target is unchanged: number of slots at each station.
y = stations_joined['total_slots']

In [34]:
# statsmodels' OLS does not add an intercept on its own, so give the reduced
# design matrix an explicit constant column first.
X = sm.add_constant(X)
# Unfitted OLS specification: y (total_slots) regressed on the columns of X.
lin_reg = sm.OLS(endog=y, exog=X)

In [35]:
# Estimate the reduced model and show the plain-text regression report.
model = lin_reg.fit()
print_model = model.summary()
print(print_model.as_text())

                            OLS Regression Results                            
Dep. Variable:            total_slots   R-squared:                       0.019
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.271
Date:                Mon, 27 Feb 2023   Prob (F-statistic):              0.285
Time:                        21:18:57   Log-Likelihood:                -638.35
No. Observations:                 206   AIC:                             1285.
Df Residuals:                     202   BIC:                             1298.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 20.3934      4

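### Multiple linear regression does not appear effective.

Both the full six-feature model and the reduced three-feature model explain only about 2% of the variance in `total_slots` (R² = 0.019 in each case), and neither overall F-test is significant (p = 0.701 and p = 0.285, respectively). The POI features therefore give no evidence of a linear relationship with station size.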

# Stretch

How can you turn the regression model into a classification model?

# Stretch Answer

Two meaningful opportunities for classification are:
1. {high, medium, low} classification of total_bikes by station
2. {overserved, good, underserved} from total_bikes/popularity by station