Build a regression model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Read in CityBike and Yelp data
citybike_df = pd.read_csv('citybike_data.csv')
yelp_parks_df = pd.read_csv('yelp_parks_data.csv')

EARTH_RADIUS_M = 6_371_000  # mean Earth radius, in metres
SEARCH_RADIUS_M = 1000      # a park counts as "near" a station within this many metres


def haversine_matrix_m(origins_deg, targets_deg):
    """Return pairwise great-circle distances in metres.

    Parameters
    ----------
    origins_deg : array-like of shape (n, 2)
        (latitude, longitude) pairs in decimal degrees.
    targets_deg : array-like of shape (m, 2)
        (latitude, longitude) pairs in decimal degrees.

    Returns
    -------
    numpy.ndarray of shape (n, m)
        Entry [i, j] is the haversine distance from origin i to target j.
    """
    origins = np.radians(np.asarray(origins_deg, dtype=float))
    targets = np.radians(np.asarray(targets_deg, dtype=float))
    # Broadcast an (n, 1) column against a (1, m) row to get every pair at once.
    lat1, lon1 = origins[:, 0][:, None], origins[:, 1][:, None]
    lat2, lon2 = targets[:, 0][None, :], targets[:, 1][None, :]
    a = (
        np.sin((lat2 - lat1) / 2) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    )
    # Clip guards against tiny floating-point excursions outside [0, 1].
    return 2 * EARTH_RADIUS_M * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


# Calculate distance (in metres) from each CityBike location to each Yelp park.
# Rows follow citybike_df, columns follow yelp_parks_df. Plain Euclidean distance
# on raw latitude/longitude is in degrees, so it cannot be compared to a metre radius.
distances = haversine_matrix_m(
    citybike_df[['Latitude', 'Longitude']].to_numpy(),
    yelp_parks_df[['Latitude', 'Longitude']].to_numpy(),
)
within_radius = distances <= SEARCH_RADIUS_M

# Keep a per-park 'Distance' column (distance to the nearest CityBike location)
# and filter Yelp parks to those within 1000m of at least one CityBike location.
yelp_parks_df['Distance'] = distances.min(axis=0)
yelp_parks_df = yelp_parks_df[yelp_parks_df['Distance'] <= SEARCH_RADIUS_M]

# Calculate number of parks within 1000m radius of each CityBike location.
# One row per CityBike location, on citybike_df's index, so it lines up row-for-row
# with citybike_df['Population'] in the regression. Locations with no nearby park
# are kept with a count of 0.
# NOTE(review): 'location' is labelled with the citybike_df row index because no
# station-name column is visible here; swap in the real name column if one exists.
num_parks_per_location_yelp = pd.DataFrame(
    {
        'location': citybike_df.index.astype(str),
        'num_parks': within_radius.sum(axis=1),
    },
    index=citybike_df.index,
)

# Plot bar chart
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(num_parks_per_location_yelp['location'], num_parks_per_location_yelp['num_parks'])
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Region')
ax.set_ylabel('Number of Parks')
ax.set_title('Number of Parks per Region (Yelp data)')
plt.show()

# Build regression model: number of nearby parks explained by population.
# X and y both have one row per CityBike location and share the same index.
X = citybike_df['Population']
y = num_parks_per_location_yelp['num_parks']
X = sm.add_constant(X)
# missing='drop' excludes rows with a missing value instead of yielding NaN estimates.
model = sm.OLS(y, X, missing='drop').fit()
print(model.summary())


Provide model output and an interpretation of the results. 

# Stretch

How can you turn the regression model into a classification model?