### Join the data from Part 1 with the data from Part 2 to create a new dataframe.

#### Import libraries

In [1]:
import pandas as pd
import sqlite3
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

#### Step 1: Import CSV files from CityBikes, Yelp, and FourSquare

In [2]:
# load CityBikes dataframe
city_bikes_df = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/data/CSV_files/London_CityBikes_Data.csv')

# load Yelp parks dataframe
yelp_parks_df = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/data/CSV_files/yelp_parks_df.csv')

# load Yelp restaurants dataframe
yelp_restaurants_df = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/data/CSV_files/yelp_restaurants_df.csv')

# load Foursquare parks dataframe
foursquare_parks_df = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/data/CSV_files/foursquare_parks_df.csv')

# load FourSquare restaurants dataframe
foursquare_restaurants_df = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/data/CSV_files/foursquare_restaurants_df.csv')




In [3]:
#Adding Type descriptive labels to differentiate the data when combined (as categories and poi type does not provide uniform type names)
yelp_parks_df['Type'] = 'Park'
yelp_restaurants_df['Type'] = 'Restaurant'
foursquare_restaurants_df['Type'] = 'Restaurant'
foursquare_parks_df['Type'] = 'Park'

### Step 2: Merge DataFrames Together

In [4]:
#rename CityBikes 'name' column to 'location' to be able to perform merge
city_bikes_df = city_bikes_df.rename(columns={'name': 'location'})

#rename FourSquare Parks 'name' column to 'location' to be able to perform merge
foursquare_parks_df = foursquare_parks_df.rename(columns={'location_name': 'location'})

#rename FourSquare Restaurants 'name' column to 'location' to be able to perform merge
foursquare_restaurants_df = foursquare_restaurants_df.rename(columns={'location_name': 'location'})

In [5]:
# merge Yelp DataFrames based on the location column
merged_yelp_df = pd.concat([yelp_parks_df, yelp_restaurants_df], axis=0)

# merge FourSquare DataFrames based on the location column
merged_foursquare_df = pd.concat([foursquare_parks_df, foursquare_restaurants_df], axis=0)

# merge Foursquare and CityBikes DataFrames based on the location column (left-join)
merged_yelp_citybikes = pd.merge(merged_yelp_df, city_bikes_df, on='location', how='left')

# merge Foursquare and CityBikes DataFrames based on the location column (left-join)
merged_foursquare_citybikes = pd.merge(merged_foursquare_df, city_bikes_df, on='location', how='left')

In [6]:
# #Removing id and fsq_id, as they do not add to the data
merged_yelp_citybikes = merged_yelp_citybikes.drop('id', axis=1)
merged_foursquare_citybikes = merged_foursquare_citybikes.drop('fsq_id', axis=1)


In [7]:
merged_yelp_citybikes.to_csv('merged_yelp_citybikes.csv', index=False)

# export foursquare/citybikes DataFrame to Csv file
merged_foursquare_citybikes.to_csv('merged_foursquare_citybikes.csv', index=False)

# load merged_yelp_citybikes dataframe
yelp_citybikes = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/notebooks/merged_yelp_citybikes.csv')

# load foursquare/citybikes DataFrame parks dataframe
foursquare_citybikes = pd.read_csv('/Users/brittanyharding/LHL-Projects/Statistical-Modelling-with-Python/notebooks/merged_foursquare_citybikes.csv')

Provide a visualization that you used as part of your EDA process. Explain the initial pattern or relationship you discoved through this visualization. 

### EDA Stats for Yelp CityBikes Data

In [8]:
# print first 5 rows of the data
yelp_citybikes.head(10)

Unnamed: 0,name,location,categories,rating,price,Type,latitude,longitude,free_bikes
0,Leicester Square Gardens,"001163 - Wardour Street, Soho","[{'alias': 'landmarks', 'title': 'Landmarks & ...",3.5,,Park,51.512515,-0.133202,16
1,Leicester Square,"001163 - Wardour Street, Soho","[{'alias': 'landmarks', 'title': 'Landmarks & ...",4.0,,Park,51.512515,-0.133202,16
2,Allen Gardens,"200003 - Cheshire Street, Bethnal Green","[{'alias': 'parks', 'title': 'Parks'}]",5.0,,Park,51.52388,-0.065076,18
3,Weavers Fields,"200003 - Cheshire Street, Bethnal Green","[{'alias': 'parks', 'title': 'Parks'}]",4.0,,Park,51.52388,-0.065076,18
4,Myddelton Square Park,"001104 - Claremont Square, Angel","[{'alias': 'parks', 'title': 'Parks'}]",3.0,,Park,51.531667,-0.109915,11
5,St George's Gardens,"001019 - Ampton Street , Clerkenwell","[{'alias': 'parks', 'title': 'Parks'}]",5.0,,Park,51.527281,-0.118295,6
6,Argyle Square,"001019 - Ampton Street , Clerkenwell","[{'alias': 'parks', 'title': 'Parks'}]",3.0,,Park,51.527281,-0.118295,6
7,Soho Square,"001052 - Soho Square , Soho","[{'alias': 'parks', 'title': 'Parks'}]",3.5,,Park,51.515631,-0.132329,54
8,St Martin's Gardens,"200026 - Greenland Road, Camden Town","[{'alias': 'parks', 'title': 'Parks'}]",5.0,,Park,51.539099,-0.141728,35
9,Argyle Square,"001013 - St. Chad's Street, King's Cross","[{'alias': 'parks', 'title': 'Parks'}]",3.0,,Park,51.530059,-0.120974,13


In [9]:
# print basic info about the data
print(yelp_citybikes.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3572 entries, 0 to 3571
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        3572 non-null   object 
 1   location    3572 non-null   object 
 2   categories  3572 non-null   object 
 3   rating      3572 non-null   float64
 4   price       2300 non-null   object 
 5   Type        3572 non-null   object 
 6   latitude    3572 non-null   float64
 7   longitude   3572 non-null   float64
 8   free_bikes  3572 non-null   int64  
dtypes: float64(3), int64(1), object(5)
memory usage: 251.3+ KB
None


In [10]:
# print statistical summary of the data
print(yelp_citybikes.describe())

            rating     latitude    longitude   free_bikes
count  3572.000000  3572.000000  3572.000000  3572.000000
mean      3.809490    51.507356    -0.136093    14.034155
std       0.984991     0.018562     0.049115     9.528037
min       0.000000    51.456821    -0.229117     0.000000
25%       3.500000    51.495593    -0.174411     6.000000
50%       4.000000    51.508103    -0.137044    12.000000
75%       4.500000    51.521588    -0.106824    20.000000
max       5.000000    51.546805    -0.006990    63.000000


### EDA Stats for Foursquare CityBikes Data

In [16]:
# print first 5 rows of the data
foursquare_citybikes.head(5)

Unnamed: 0,location,name,poi_type,rating,price,Type,latitude,longitude,free_bikes
0,"003486 - St. Luke's Church, Chelsea",St Luke's Garden,['Park'],,,Park,51.489717,-0.170194,17
1,"003486 - St. Luke's Church, Chelsea",Lennox Gardens,['Park'],,,Park,51.489717,-0.170194,17
2,"003486 - St. Luke's Church, Chelsea",Cadogan Square Small Garden Limited,"['Property Management Office', 'Real Estate Ag...",,,Park,51.489717,-0.170194,17
3,"003486 - St. Luke's Church, Chelsea",Markham Square,['Park'],,,Park,51.489717,-0.170194,17
4,"003486 - St. Luke's Church, Chelsea",Tedworth Square,['Park'],,,Park,51.489717,-0.170194,17


In [12]:
# print basic info about the data
print(foursquare_citybikes.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5938 entries, 0 to 5937
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    5938 non-null   object 
 1   name        5938 non-null   object 
 2   poi_type    5938 non-null   object 
 3   rating      0 non-null      float64
 4   price       0 non-null      float64
 5   Type        5938 non-null   object 
 6   latitude    5938 non-null   float64
 7   longitude   5938 non-null   float64
 8   free_bikes  5938 non-null   int64  
dtypes: float64(4), int64(1), object(4)
memory usage: 417.6+ KB
None


In [13]:
# print statistical summary of the data
print(foursquare_citybikes.describe())

       rating  price     latitude    longitude   free_bikes
count     0.0    0.0  5938.000000  5938.000000  5938.000000
mean      NaN    NaN    51.506871    -0.130733    13.509768
std       NaN    NaN     0.019537     0.053287     9.218018
min       NaN    NaN    51.454753    -0.229117     0.000000
25%       NaN    NaN    51.493985    -0.173797     6.000000
50%       NaN    NaN    51.508622    -0.132140    12.000000
75%       NaN    NaN    51.521564    -0.094475    19.000000
max       NaN    NaN    51.549369    -0.006990    63.000000


In [17]:
#At this time I made the decision to drop the FourSquare rating and price columns, as API retrieval had brought back no data, despite following Foursquare specific API rules.
# #Removing id and fsq_id, as they do not add to the data
foursquare_citybikes = foursquare_citybikes.drop('price', axis=1)
foursquare_citybikes = foursquare_citybikes.drop('rating', axis=1)

Unnamed: 0,location,name,poi_type,Type,latitude,longitude,free_bikes
0,"003486 - St. Luke's Church, Chelsea",St Luke's Garden,['Park'],Park,51.489717,-0.170194,17
1,"003486 - St. Luke's Church, Chelsea",Lennox Gardens,['Park'],Park,51.489717,-0.170194,17
2,"003486 - St. Luke's Church, Chelsea",Cadogan Square Small Garden Limited,"['Property Management Office', 'Real Estate Ag...",Park,51.489717,-0.170194,17
3,"003486 - St. Luke's Church, Chelsea",Markham Square,['Park'],Park,51.489717,-0.170194,17
4,"003486 - St. Luke's Church, Chelsea",Tedworth Square,['Park'],Park,51.489717,-0.170194,17
...,...,...,...,...,...,...,...
5933,"000979 - Wenlock Road , Hoxton",Shepherdess Walk Park,['Park'],Restaurant,51.530992,-0.093904,8
5934,"002677 - Natural History Museum, South Kensington",Natural History Museum,['Science Museum'],Restaurant,51.495593,-0.179078,39
5935,"200027 - Ford Road, Old Ford",St Stephens Green,['Park'],Restaurant,51.532513,-0.033085,4
5936,"200027 - Ford Road, Old Ford",Vicolo Romano,"['Pizzeria', 'Italian Restaurant']",Restaurant,51.532513,-0.033085,4


In [20]:
# Count to see how many parks and restaurants
count = foursquare_citybikes['poi_type'].str.count('Park').sum()
print('Number of Parks:', count)

count = foursquare_citybikes['poi_type'].str.count('Restaurant').sum()
print('Number of Restaurants:', count)


Number of Parks: 2150
Number of Restaurants: 2777


# Database

Put all your results in an SQLite3 database (remember, SQLite stores its databases as files in your local machine - make sure to create your database in your project's data/ directory!)

Look at the data before and after the join to validate your data.