In [43]:
# libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [29]:
# Read the combined per-station dataset from disk.
station_pois = pd.read_csv('../data/collective_stations_data.csv')

# Add an aggregate column: the row-wise total of the four per-category POI counts.
# skipna=False keeps the semantics of plain `+` (a missing value in any category
# makes the total missing rather than being treated as zero).
poi_count_columns = ['park_poi_counts', 'bar_poi_counts', 'food_poi_counts', 'grocer_poi_counts']
station_pois['fsq_poi_counts'] = station_pois[poi_count_columns].sum(axis=1, skipna=False)

# Show the resulting schema (column names, non-null counts, dtypes).
# Nothing is written back to disk in this cell.
station_pois.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 144 non-null    object 
 1   latitude           144 non-null    float64
 2   longitude          144 non-null    float64
 3   name               144 non-null    object 
 4   est_bike_slots     144 non-null    int64  
 5   park_poi_counts    144 non-null    int64  
 6   bar_poi_counts     144 non-null    int64  
 7   food_poi_counts    144 non-null    int64  
 8   grocer_poi_counts  144 non-null    int64  
 9   yelp_counts        144 non-null    int64  
 10  fsq_poi_counts     144 non-null    int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 12.5+ KB


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): in the recorded output food_poi_counts equals 30 at the 25th, 50th and
# 75th percentiles and at the max, so it barely varies across stations; bar_poi_counts
# (75% = max = 30) and yelp_counts (75% = max = 50) also pile up at their maximum.
# Possibly a result-count cap in the upstream data collection -- TODO confirm.
station_pois.describe()

Unnamed: 0,latitude,longitude,est_bike_slots,park_poi_counts,bar_poi_counts,food_poi_counts,grocer_poi_counts,yelp_counts,fsq_poi_counts
count,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0
mean,43.257392,-79.875175,12.055556,6.055556,15.763889,27.916667,8.631944,39.736111,58.368056
std,0.00742,0.032363,5.515113,4.384936,11.358116,6.101129,7.708633,12.613671,24.70858
min,43.242163,-79.953754,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,43.252003,-79.895209,9.0,2.0,5.0,30.0,3.0,32.0,39.0
50%,43.256647,-79.869532,11.0,6.0,13.0,30.0,6.0,44.0,53.0
75%,43.262017,-79.855642,13.0,9.0,30.0,30.0,13.0,50.0,82.25
max,43.289479,-79.766388,34.0,15.0,30.0,30.0,26.0,50.0,98.0


#### Notes: `est_bike_slots` is my single dependent variable; the POI count columns are the candidate independent variables I can regress it against.

Build a regression model.

In [53]:
# Response variable: est_bike_slots.
y = station_pois['est_bike_slots']

# Candidate predictors: everything left after removing the response itself, the
# identifiers (id, name), the coordinates (latitude, longitude) and the two
# aggregate count columns (yelp_counts, fsq_poi_counts).
excluded_columns = ['est_bike_slots', 'latitude', 'longitude', 'name', 'id', 'yelp_counts', 'fsq_poi_counts']
indep = station_pois.drop(columns=excluded_columns)

# One design matrix per predictor: a 'const' intercept column plus that single
# predictor, so each predictor can be fitted in its own simple regression.
X = [sm.add_constant(series) for _, series in indep.items()]

# Preview the predictor columns as a DataFrame, then display the design matrices.
df_X = pd.DataFrame(indep, columns=indep.columns)
print(df_X.head())
X

   park_poi_counts  bar_poi_counts  food_poi_counts  grocer_poi_counts
0                8              30               30                 24
1                5              11               30                  5
2                6              17               30                 12
3                8              30               30                 20
4                8              30               30                 24


[     const  park_poi_counts
 0      1.0                8
 1      1.0                5
 2      1.0                6
 3      1.0                8
 4      1.0                8
 ..     ...              ...
 139    1.0                7
 140    1.0                8
 141    1.0                1
 142    1.0                1
 143    1.0               10
 
 [144 rows x 2 columns],
      const  bar_poi_counts
 0      1.0              30
 1      1.0              11
 2      1.0              17
 3      1.0              30
 4      1.0              30
 ..     ...             ...
 139    1.0              19
 140    1.0              30
 141    1.0               3
 142    1.0               1
 143    1.0              30
 
 [144 rows x 2 columns],
      const  food_poi_counts
 0      1.0               30
 1      1.0               30
 2      1.0               30
 3      1.0               30
 4      1.0               30
 ..     ...              ...
 139    1.0               30
 140    1.0               30
 

In [59]:
# Fit one simple OLS regression of y (est_bike_slots) per design matrix in X.
# Every list below is in the same order as X, i.e. one entry per predictor.
Models = [sm.OLS(y, x) for x in X]
Results = [model.fit() for model in Models]

# Per-model statistics: adjusted R-squared, p-values and fitted coefficients.
Adj_Rsquared = [results.rsquared_adj for results in Results]
Pval = [results.pvalues for results in Results]
Params = [results.params for results in Results]

# Pick the model to display by predictor name instead of by a hard-coded position,
# so the cell keeps showing the intended model if the predictor set changes.
# The predictor is the last column of each design matrix ('const' is prepended).
predictor_names = [x.columns[-1] for x in X]

# Interpretation of the food_poi_counts summary (values taken from the recorded output):
# - R-squared is 0.008 (adjusted 0.001): food_poi_counts explains under 1% of the
#   variance in est_bike_slots. R-squared is the share of variance explained by the
#   model, not a prediction accuracy.
# - The slope (-0.0801, p = 0.291, 95% CI -0.229 to 0.069) is not statistically
#   distinguishable from zero at the 5% level.
# - Skew (1.855), kurtosis (7.214) and the Omnibus / Jarque-Bera tests point to
#   non-normally distributed residuals, so the p-values should be read with caution.
# - Cond. No. is 134. With one predictor and an intercept this cannot signal
#   collinearity among predictors; it likely reflects the predictor's scale and its
#   low variation (most stations have the value 30) -- TODO confirm.
Results[predictor_names.index('food_poi_counts')].summary()

0,1,2,3
Dep. Variable:,est_bike_slots,R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.124
Date:,"Mon, 05 Jun 2023",Prob (F-statistic):,0.291
Time:,03:42:58,Log-Likelihood:,-449.14
No. Observations:,144,AIC:,902.3
Df Residuals:,142,BIC:,908.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.2915,2.159,6.620,0.000,10.024,18.559
food_poi_counts,-0.0801,0.076,-1.060,0.291,-0.229,0.069

0,1,2,3
Omnibus:,65.841,Durbin-Watson:,1.785
Prob(Omnibus):,0.0,Jarque-Bera (JB):,189.153
Skew:,1.855,Prob(JB):,8.429999999999999e-42
Kurtosis:,7.214,Cond. No.,134.0


In [57]:
# Print the list of p-value Series, one per fitted model; each Series holds the
# p-value of the intercept ('const') and of that model's single predictor.
# NOTE(review): in the recorded output the slope p-values for park_poi_counts (0.0074),
# bar_poi_counts (0.0083) and grocer_poi_counts (0.026) are below 0.05, while
# food_poi_counts (0.29) is not.
print(Pval)

[const              3.333134e-38
park_poi_counts    7.432858e-03
dtype: float64, const             5.545314e-38
bar_poi_counts    8.346937e-03
dtype: float64, const              6.846666e-10
food_poi_counts    2.909461e-01
dtype: float64, const                1.115738e-41
grocer_poi_counts    2.597570e-02
dtype: float64]


Provide model output and an interpretation of the results. 

# Stretch

How can you turn the regression model into a classification model?