# Foursquare

## Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

### Import Libraries

In [9]:
# Import required librairies and modules
import os   #used to access the values of environment variables
import requests #send api requests
import json # to read json files
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization

### Defining a function to pull all venues within 1km of a specified lat/long


In [10]:
def get_venues(ll, id):

    #Retrieving API key
    api_key=os.getenv('FOURSQUARE_API')
    
    url = "https://api.foursquare.com/v3/places/search?ll=" + ll + "&radius=1000&categories=13000%2C12072&fields=fsq_id%2Ccategories%2Cname%2Crating&limit=50"

    headers = {
        "accept": "application/json",
        "Authorization": api_key 
    }

    reponse = requests.get(url, headers=headers)

    venues_json = json.loads(reponse.text)

    
    # Parse thru the JSON to get the required info. Below are my parsing commands & process,
        # print(venues_json.keys()) #Returns two keys. The second one has no information of interest so focus on the first one
        # print(pd.json_normalize(venues_json['results'])) #Perfect. Categories has a dictionnary, let's parse it to see what it has.
        # print(pd.json_normalize(venues_json['results'], record_path=['categories'])) #Parse down the categories. We'll only be interested in counting the amount of results so this is too specific for us


    # Add to a df
    add_df = pd.json_normalize(venues_json['results'])
    
    #adding a column to the DF to refer to the ID of the bikes DF. Will be used later to join the data.
    add_df['bikestation_id'] = id
    
    #For this project, we'll consolidate all foursquare return to a df, but also store the number of results into the bikes df
    global fsq_df
    
    fsq_df = pd.concat([fsq_df, add_df], ignore_index=True)
    
    return len(add_df)

### Get the data for the bike stations

In [11]:
citybikes_df = pd.read_csv('Vancouver_BikeStation.csv') 

# Add a column to the df that will include the amount of POIs
citybikes_df['foursquare_venues'] = ""

fsq_df = pd.DataFrame() #Creating an empty df to store pulls from Foursquare. Will be used in the function below


#We need to loop thru all records of citybikes_df, but also return values of column 'latitude', 'Longitude' and 'id' as they are needed for the get_values function. This is the only working solution I found:
for i, j, k in zip(citybikes_df['id'].values, citybikes_df['latitude'].values, citybikes_df['longitude'].values):
    #Set ll to the lat and long of the station of this row
    lat= round(j, 2)
    long= round(k,2)

    ll = str(lat) + ',' + str(long)
        
    #modify value of a column where the value of another column is a certain value
    citybikes_df.loc[citybikes_df['id'] == i, 'foursquare_venues'] = get_venues(ll, i)    

### Checking the amount of results to confirm it worked.

In [12]:
fsq_df.shape

(11595, 5)

## Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### I chose which fields to include at the API request.So no need to remove unwanted columns.

In [13]:
fsq_df.head()

Unnamed: 0,fsq_id,categories,name,rating,bikestation_id
0,5674aec9498e6800815c3598,"[{'id': 13199, 'name': 'Indian Restaurant', 'i...",Vij's,9.0,7a19c49f486d7c0c02b3685d7b240448
1,4aec909ff964a520ccc821e3,"[{'id': 13035, 'name': 'Coffee Shop', 'icon': ...",Milano Coffee,8.7,7a19c49f486d7c0c02b3685d7b240448
2,51b201d27dd249ae714ba728,"[{'id': 13003, 'name': 'Bar', 'icon': {'prefix...",33 Acres Brewing Co,9.2,7a19c49f486d7c0c02b3685d7b240448
3,5177156c498ea4fd6b901dda,"[{'id': 13034, 'name': 'Café', 'icon': {'prefi...",Aperture Coffee Bar,8.4,7a19c49f486d7c0c02b3685d7b240448
4,4e223f06d4c0d32590f80ff4,"[{'id': 13306, 'name': 'Taco Restaurant', 'ico...",La Taqueria Pinche Taco Shop,8.7,7a19c49f486d7c0c02b3685d7b240448


Not too sure about categories yet. I'll leave it there just in case, but likely won't use it.

## Put your parsed results into a DataFrame

In [9]:
# Results are stored in fsq_df. However, I will save them to a csv as well.
fsq_df.to_csv('Foursquare_venues.csv')

# Yelp

## Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

### Defining a function to pull all venues within 1km of a specified lat/long

In [28]:
def get_business_yelp(lat, long, id):
    #Retrieving API key
    api_key=os.getenv('YELP_API')
    
    url = "https://api.yelp.com/v3/businesses/search?latitude=" + lat + "&longitude=" + long + "&radius=1000&categories=nightlife,bars,restaurants,policedepartments&limit=50"

    # headers contain the api key.
    headers = {'Authorization': 'Bearer {}'.format(api_key)}

    response = requests.get(url, headers=headers)
    
    json_response = json.loads(response.text)

    # Parse thru the JSON to get the required info. Below are my parsing commands & process,
    #print(json_response.keys()) #Returned 3 keys, will look at all 3 in details:
        ##### print(pd.json_normalize(json_response['businesses'])) # Jackpot!
        #print(pd.json_normalize(json_response['total'])) # This doesn't work.. trying this instead:
            ##### print(json_response['total']) # It seems to return the total number of results. So although we hit our limit of 50, we could get the total number of results there. Interesting! I will store this as well, it may come in handy
        #print(pd.json_normalize(json_response['region'])) #returning us our lat/long.. don't need this

    # Add to a df
    add_df = pd.json_normalize(json_response['businesses'])

    #adding a column to the DF to refer to the ID of the bikes DF. Will be used later to join the data.
    add_df['bikestation_id'] = id

    #For this project, we'll consolidate all yelp return to a df, but also store the number of results into the bikes df
    global yelp_df

    yelp_df = pd.concat([yelp_df, add_df], ignore_index=True)

    # In this case, instead of returning the len(add_df), I'll return the total. I'm taking note however that I don't have this ability with Foursquare and so when comparing the two number of returns I'll have to take into account that the number of results from Foursquare will be capped to 50 and yelp will not be.
    return json_response['total']

### Get the data for the bike stations

In [29]:
#citybikes_df = pd.read_csv('Vancouver_BikeStation.csv') 
## skipping for this instance as I added the Foursquare venues already.


# Add a column to the df that will include the amount of POIs
citybikes_df['yelp_venues'] = ""

yelp_df = pd.DataFrame() #Creating an empty df to store pulls from Foursquare. Will be used in the function below


#We need to loop thru all records of citybikes_df, but also return values of column 'latitude', 'Longitude' and 'id' as they are needed for the get_business_yelp function. This is the only working solution I found:
for i, j, k in zip(citybikes_df['id'].values, citybikes_df['latitude'].values, citybikes_df['longitude'].values):
    
    # Yelp API divides lat and long so we won't merge them in a formatted string
    lat= str(j)
    long= str(k)
       
    #modify value of a column where the value of another column is a certain value
    citybikes_df.loc[citybikes_df['id'] == i, 'yelp_venues'] = get_business_yelp(lat, long, i)    

## Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

### We didn't have the ability to return only the columns we wanted in this API. So we'll have to drop them here.

In [30]:
yelp_df.drop(['alias', 'image_url', 'is_closed', 'url', 'transactions', 'coordinates.longitude', 'location.address1', 'location.address2', 'location.address3', 'location.city', 'location.zip_code', 'location.country', 'location.state', 'location.display_address'], axis=1, inplace=True)

### Had to do it twice, some columns were hidden

In [31]:
yelp_df.drop(['price', 'phone', 'display_phone', 'distance', 'coordinates.latitude'], axis=1, inplace=True)
yelp_df.head()

Unnamed: 0,id,name,review_count,categories,rating,bikestation_id
0,6iOAgzJ0DRZNSKA3FSrrOg,La Taqueria Pinche Taco Shop,668,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.0,7a19c49f486d7c0c02b3685d7b240448
1,AEOyRbQtYD3bmX1qJWvt4g,Peaceful Restaurant,610,"[{'alias': 'chinese', 'title': 'Chinese'}, {'a...",3.5,7a19c49f486d7c0c02b3685d7b240448
2,NensKn1MSVU_rm-1Y6WlFA,Marulilu Cafe,281,"[{'alias': 'cafes', 'title': 'Cafes'}]",4.0,7a19c49f486d7c0c02b3685d7b240448
3,nkDZY5xqihF3XtZMzzfqqg,Hokkaido Ramen Santouka,218,"[{'alias': 'noodles', 'title': 'Noodles'}, {'a...",4.0,7a19c49f486d7c0c02b3685d7b240448
4,XAH2HpuUUtu7CUO26pbs4w,Saku,182,"[{'alias': 'japanese', 'title': 'Japanese'}]",4.0,7a19c49f486d7c0c02b3685d7b240448


## Put your parsed results into a DataFrame

In [32]:
# Results are stored in yelp_df. However, I will save them to a csv as well.
yelp_df.to_csv('Yelp_venues.csv')

In [33]:
# Same for the citybikes
citybikes_df.to_csv('Vancouver_BikeStation_Enriched.csv')

# Comparing Results

## Which API provided you with more complete data? Provide an explanation. 

When I was cleaning the data, Yelp had much more details on venues than Foursquare did.

However, in our case, we are also interested in the amount of venues returned by each, for each bike stations. Both queries have the same radius and we are pulling venues with equivalent categories. Therefore, looking at the amount of returns they provided for each bike stations could help us determine which provides more complete data.

Let's crunch the numbers:

In [34]:
## Count the amount of foursquare venues per values use groupby
print(citybikes_df.groupby('foursquare_venues').size())

foursquare_venues
2       2
3       1
4       1
7       1
18      2
33      3
38      2
44      2
45      9
46      3
47      3
48      2
49      2
50    208
dtype: int64


In [35]:
#For Yelp, our number of venues is not capped to 50, so we'll have to do it in two parts.

#Less than 50
print(citybikes_df[citybikes_df['yelp_venues'] < 50].groupby('yelp_venues').size())

print("-----------------------")

#Over 50
print("Bike stations with Yelp returning 50 and over POIs:", citybikes_df[citybikes_df['yelp_venues'] > 50].shape[0])

yelp_venues
1     2
2     1
23    1
26    1
28    1
39    1
42    1
49    2
dtype: int64
-----------------------
Bike stations with Yelp returning 50 and over POIs: 230


From this data, it appears Yelp is returning more data and so we could conclude Yelp is providing us with more complete data, both in amount of results and in details for these results.

However, I would add the following consideration: 
Are Yelp and Foursquare pulling exactly 1000m from the same point? Is Yelp pulling slightly over 1000m, or Foursquare slightly below? It should be noted Foursqure doesn't take the same level of precision in lat/long than Yelp is. It is possible that this is moving the point enough that the results are not the same and, consequently, could significantly disrupt the accuracy of our data.

Observing this data on a map would help to quickly see if the pulls are the same and if Yelp is indeed providing us additional results than Foursquare.

Nevertheless, Foursquare returns are capped to 50 (meaning even if there is 600 venues around a bikestation, Foursquare still returns us a value of 50). Yelp returns us 50 results as well, but it also includes a total number of returns, which will be more useful for us to use. 

## Get the top 10 restaurants according to their rating

In [36]:
# Yelp:

#The df would return duplicate data if we used it as is (as some restaurants got returned multiples times if close to more than one bikestation)
## We'll create one with no duplicate venues
yelp_unique_df = yelp_df
yelp_unique_df.drop_duplicates(subset=['id'], keep='first', inplace=True)


#There are more than 10 restaurants with a rating of 5. We'll sort by rating and then by review_count
yelp_unique_df.sort_values(by=['rating', 'review_count'], ascending=[False, False]).head(10)



Unnamed: 0,id,name,review_count,categories,rating,bikestation_id
58,K1nbiOrySlw_-XG-3NmErQ,Manoush'eh,220,"[{'alias': 'mediterranean', 'title': 'Mediterr...",5.0,32603a87cfca71d0f7dfa3513bad69d5
73,3ByjrLF8nyArUxvO6vqJCw,Incognito Coffee,135,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",5.0,32603a87cfca71d0f7dfa3513bad69d5
79,kDZTaeUaf78Jd25ktX8a8g,Number e food,134,"[{'alias': 'sandwiches', 'title': 'Sandwiches'...",5.0,32603a87cfca71d0f7dfa3513bad69d5
360,XFkhu9HNNx6fx1Iz6aRPeA,L'atelier Patisserie,91,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",5.0,95e624191c655f50e401d280cd39a9ad
282,pHFumFTvgu653EaRUHuh4w,Smithe Salad,62,"[{'alias': 'juicebars', 'title': 'Juice Bars &...",5.0,b07d513f87897cff7319a0e59d7e567c
5455,viUq9OyTgOGgcdyuNoey3w,The Garden Strathcona,35,"[{'alias': 'cafes', 'title': 'Cafes'}, {'alias...",5.0,91035fe1d70b65c9cb8ec4a007af7c7f
1880,ForB55b3mE5EEhC25sHMnQ,Mazahr Lebanese Kitchen,32,"[{'alias': 'lebanese', 'title': 'Lebanese'}]",5.0,71c064fa93cab5f0b7306e6c923b77c4
5468,XMCI9IrEi0WsBVzYFJ1TzA,Caffe La Tana,27,"[{'alias': 'cafes', 'title': 'Cafes'}]",5.0,91035fe1d70b65c9cb8ec4a007af7c7f
3086,qR4rQOC8ewW0PD5KK9Blng,Casereccio Foods,26,"[{'alias': 'italian', 'title': 'Italian'}]",5.0,51e4d3731aebbf9c3186f310a08de6c8
994,Q7d2jLllssfrP9u3JJGMIw,Arike Restaurant,25,"[{'alias': 'african', 'title': 'African'}, {'a...",5.0,1d7e73b3321ebb6713774d376247d9f9


In [37]:
#Foursquare:
#The df would return duplicate data if we used it as is (as some restaurants got returned multiples times if close to more than one bikestation)
## We'll create one with no duplicate venues
fsq_unique_df = fsq_df
fsq_unique_df.drop_duplicates(subset=['fsq_id'], keep='first', inplace=True)

#Foursqure has a more precise rating system than yelp. However, I've added a bit more rows than 10 as 10 & 11
fsq_unique_df.sort_values(by=['rating'], ascending=[False]).head(13)

Unnamed: 0,fsq_id,categories,name,rating,bikestation_id
124,5075f7b9f2e73b925f6431ef,"[{'id': 10001, 'name': 'Amusement Park', 'icon...",FlyOver Canada,9.3,6d42fa40360f9a6b2bf641c7b8bb2862
102,4d9b41359298b1f7635b5338,"[{'id': 13034, 'name': 'Café', 'icon': {'prefi...",Revolver,9.3,6d42fa40360f9a6b2bf641c7b8bb2862
350,54bea64f498ec3ed3dde9a10,"[{'id': 13046, 'name': 'Ice Cream Parlor', 'ic...",Earnest Ice Cream,9.3,95e624191c655f50e401d280cd39a9ad
2,51b201d27dd249ae714ba728,"[{'id': 13003, 'name': 'Bar', 'icon': {'prefix...",33 Acres Brewing Co,9.2,7a19c49f486d7c0c02b3685d7b240448
954,4aa80589f964a520ca4e20e3,"[{'id': 13263, 'name': 'Japanese Restaurant', ...",Guu with Garlic,9.1,1d7e73b3321ebb6713774d376247d9f9
105,4aa7f22bf964a520214e20e3,"[{'id': 13383, 'name': 'Steakhouse', 'icon': {...",Gotham Steakhouse & Cocktail Bar,9.1,6d42fa40360f9a6b2bf641c7b8bb2862
511,5646ab13498e460c732920e8,"[{'id': 13272, 'name': 'Ramen Restaurant', 'ic...",Ramen Danbo - Robson,9.1,bf8408067b0e0c963f3ff526977bcef3
1759,5780683c498e51523ab4851b,"[{'id': 13049, 'name': 'Diner', 'icon': {'pref...",Raisu,9.1,b9fd847d53fe45658fee4c46e7f562b8
1506,553ac7e1498e04e927f8762d,"[{'id': 13034, 'name': 'Café', 'icon': {'prefi...",Birds & the Beets,9.1,f1ca711137cde59e824167a4274df888
115,4d2cce46ae3a8cfa4067bf70,"[{'id': 13065, 'name': 'Restaurant', 'icon': {...",Hawksworth Restaurant,9.1,6d42fa40360f9a6b2bf641c7b8bb2862
