In [155]:
# libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [175]:
# Load CSV file:
station_pois = pd.read_csv('../data/collective_stations_data.csv')

# Generate one more column for analysis:
station_pois['fsq_poi_counts'] = station_pois['park_poi_counts'] + station_pois['bar_poi_counts'] + station_pois['food_poi_counts'] + station_pois['grocer_poi_counts']

# Create .csv and show final DataFrame:

station_pois.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 144 non-null    object 
 1   latitude           144 non-null    float64
 2   longitude          144 non-null    float64
 3   name               144 non-null    object 
 4   est_bike_slots     144 non-null    int64  
 5   park_poi_counts    144 non-null    int64  
 6   bar_poi_counts     144 non-null    int64  
 7   food_poi_counts    144 non-null    int64  
 8   grocer_poi_counts  144 non-null    int64  
 9   yelp_counts        144 non-null    int64  
 10  fsq_poi_counts     144 non-null    int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 12.5+ KB


In [176]:
station_pois.describe()

Unnamed: 0,latitude,longitude,est_bike_slots,park_poi_counts,bar_poi_counts,food_poi_counts,grocer_poi_counts,yelp_counts,fsq_poi_counts
count,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0,144.0
mean,43.257392,-79.875175,12.055556,6.055556,15.763889,27.916667,8.631944,39.736111,58.368056
std,0.00742,0.032363,5.515113,4.384936,11.358116,6.101129,7.708633,12.613671,24.70858
min,43.242163,-79.953754,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,43.252003,-79.895209,9.0,2.0,5.0,30.0,3.0,32.0,39.0
50%,43.256647,-79.869532,11.0,6.0,13.0,30.0,6.0,44.0,53.0
75%,43.262017,-79.855642,13.0,9.0,30.0,30.0,13.0,50.0,82.25
max,43.289479,-79.766388,34.0,15.0,30.0,30.0,26.0,50.0,98.0


#### Notes: 'est_bike_slots' is my singular dependent variable; all others are potential independent variables I can test it against.

Build a regression model.

I decided to employ backward selection method for refining my model:

In [178]:
# Dependent Variable = est_bike_slots; also dropping coordinates, id, and names from test
y = station_pois['est_bike_slots']
indep = station_pois.drop(columns = ['latitude', 'longitude', 'name', 'id', 'yelp_counts', 'fsq_poi_counts', 'est_bike_slots'])

X = [sm.add_constant(indep[column]) for column in indep.columns] 
#Print all the X columns in a formatted manner as pandas dataframe
df_X = pd.DataFrame(indep, columns=indep.columns)
X

[     const  park_poi_counts
 0      1.0                8
 1      1.0                5
 2      1.0                6
 3      1.0                8
 4      1.0                8
 ..     ...              ...
 139    1.0                7
 140    1.0                8
 141    1.0                1
 142    1.0                1
 143    1.0               10
 
 [144 rows x 2 columns],
      const  bar_poi_counts
 0      1.0              30
 1      1.0              11
 2      1.0              17
 3      1.0              30
 4      1.0              30
 ..     ...             ...
 139    1.0              19
 140    1.0              30
 141    1.0               3
 142    1.0               1
 143    1.0              30
 
 [144 rows x 2 columns],
      const  food_poi_counts
 0      1.0               30
 1      1.0               30
 2      1.0               30
 3      1.0               30
 4      1.0               30
 ..     ...              ...
 139    1.0               30
 140    1.0               30
 

In [133]:
# Models:
models = [sm.OLS(y,x) for x in X]
results = [model.fit() for model in models]

# Significance Values:
adj_rsquared = [results.rsquared_adj for results in results]
pval = [results.pvalues for results in results]

Params = [results.params for results in results]
print(results[0].summary())

                            OLS Regression Results                            
Dep. Variable:         est_bike_slots   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     7.376
Date:                Mon, 05 Jun 2023   Prob (F-statistic):            0.00743
Time:                        04:58:33   Log-Likelihood:                -446.06
No. Observations:                 144   AIC:                             896.1
Df Residuals:                     142   BIC:                             902.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              13.7480      0.768     

In [136]:
#Model Output
for i in range(len(indep.columns)):
    print("Model for", indep.columns[i])
    print(Results[i].summary())
    print(" ")  #print a blank line

#Create a dataframe with the indep. variables, their adj. R-squared, p-values and params
df = pd.DataFrame({'Indep. Variables': indep.columns, 'Adj. R-squared': Adj_Rsquared, 'P-values': Pval, 'Params': Params})
df

#Sort the dataframe by the adj. R-squared
df.sort_values(by='Adj. R-squared', ascending=False)

Model for park_poi_counts
                            OLS Regression Results                            
Dep. Variable:         est_bike_slots   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     7.376
Date:                Mon, 05 Jun 2023   Prob (F-statistic):            0.00743
Time:                        04:59:21   Log-Likelihood:                -446.06
No. Observations:                 144   AIC:                             896.1
Df Residuals:                     142   BIC:                             902.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const           

Unnamed: 0,Indep. Variables,Adj. R-squared,P-values,Params
0,park_poi_counts,0.042682,const 3.333134e-38 park_poi_count...,const 13.747959 park_poi_counts ...
1,bar_poi_counts,0.041275,const 5.545314e-38 bar_poi_counts ...,const 13.732190 bar_poi_counts ...
3,grocer_poi_counts,0.027629,const 1.115738e-41 grocer_poi_c...,const 13.201460 grocer_poi_coun...
2,food_poi_counts,0.000864,const 6.846666e-10 food_poi_count...,const 14.291476 food_poi_counts ...


In [116]:
for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {indep.columns[i]}')

adj_R2: 0.043, P-values: (3.3331344037826663e-38, 0.007432857518816909), column: park_poi_counts
adj_R2: 0.041, P-values: (5.545314131761724e-38, 0.008346936600383714), column: bar_poi_counts
adj_R2: 0.001, P-values: (6.846665594717721e-10, 0.2909460691789762), column: food_poi_counts
adj_R2: 0.028, P-values: (1.1157382527248846e-41, 0.02597570031087397), column: grocer_poi_counts


In [142]:
# park_poi_counts has the highest adj_r2, so we can proceed with the model
remaining_var = indep.drop(['park_poi_counts'], axis=1)
remaining_var.head()

Unnamed: 0,bar_poi_counts,food_poi_counts,grocer_poi_counts
0,30,30,24
1,11,30,5
2,17,30,12
3,30,30,20
4,30,30,24


In [143]:
included_df = indep[['park_poi_counts']]
included_df

Unnamed: 0,park_poi_counts
0,8
1,5
2,6
3,8
4,8
...,...
139,7
140,8
141,1
142,1


In [144]:
X = [sm.add_constant(pd.merge(included_df,remaining_var[column], right_index = True, left_index = True)) for column in remaining_var.columns] 
X[0]

Unnamed: 0,const,park_poi_counts,bar_poi_counts
0,1.0,8,30
1,1.0,5,11
2,1.0,6,17
3,1.0,8,30
4,1.0,8,30
...,...,...,...
139,1.0,7,19
140,1.0,8,30
141,1.0,1,3
142,1.0,1,1


In [147]:
models = [sm.OLS(y,x) for x in X]
Results = [model.fit() for model in models]
Adj_Rsquared = [results.rsquared_adj for results in Results]
Pval = [results.pvalues for results in Results] 

for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {remaining_var.columns[i]}')

adj_R2: 0.041, P-values: (5.1404824511215365e-37, 0.33521789040586814, 0.39538276072492473), column: bar_poi_counts
adj_R2: 0.037, P-values: (1.4931262870336451e-10, 0.012525798976562669, 0.6332875975957615), column: food_poi_counts
adj_R2: 0.039, P-values: (1.2256882353321797e-37, 0.10318867531434955, 0.49596226900604046), column: grocer_poi_counts


Provide model output and an interpretation of the results. 

My hypothesis (that bike station placement correlates with specific POI density) is wrong, and my model is unable to disprove the null hypothesis that it is not. My P-values are not decreasing, my R-Squares are staying relatively consistent, and none of the other metrics for testing significance have turned any meaningful results. This at least suggests there is no positive correlation between bike station placement and what kinds of venues are in the immediate area. It is entirely possible that there is some other criteria I have overlooked, however.

My last output is shared below:

In [150]:
#Model Output
for i in range(len(remaining_var.columns)):
    print("Model for", remaining_var.columns[i])
    print(results[i].summary())
    print(" ")  #print a blank line

#Create a dataframe with the indep. variables, their adj. R-squared, p-values and params
df = pd.DataFrame({'Indep. Variables': remaining_var.columns, 'Adj. R-squared': Adj_Rsquared, 'P-values': Pval, 'Params': Params})
df

#Sort the dataframe by the adj. R-squared
df.sort_values(by='Adj. R-squared', ascending=False)

Model for bar_poi_counts
                            OLS Regression Results                            
Dep. Variable:         est_bike_slots   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     4.044
Date:                Mon, 05 Jun 2023   Prob (F-statistic):             0.0196
Time:                        05:20:14   Log-Likelihood:                -445.69
No. Observations:                 144   AIC:                             897.4
Df Residuals:                     141   BIC:                             906.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            

ValueError: All arrays must be of the same length

# Stretch

How can you turn the regression model into a classification model?