# Data Collection

## CITY BIKE API

In [1]:

# Import necessary libraries
import os
import re
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import ttest_ind
import sqlite3
import googlemaps

# Check for the installation of required packages and install if necessary
required_packages = ["requests", "pandas", "plotly", "seaborn", "scipy"]
for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        !pip install {package}

# City Bike API Call

In [None]:
# Test connection 
# Fetch the City Bikes API Data
request = requests.get('http://api.citybik.es/v2/networks')

# Check for success or failure
#print(request.status_code)

# Observe the records
#print(json.dumps(request.json(), sort_keys=True, indent=4))


# Ask the user to select a city of choice for analysis 

# Create a dictionary of href and city. City is required for user input and href is required for endpoint to fetch bike station data

dictionary_of_cities_href = {'list_of_cities': [],
                            'href': []}

for i in range(len(request.json()['networks'])):
    dictionary_of_cities_href['list_of_cities'].append(request.json()['networks'][i]['location']['city'])
    dictionary_of_cities_href['href'].append(request.json()['networks'][i]['href'])

# Print the list of cities for user to select 
user_choice = input(f"Select a city to fetch the data for:\n{', '.join(dictionary_of_cities_href['list_of_cities'])}\n")

pattern = re.compile(user_choice, re.IGNORECASE)

# Filter the list based on the regex match
matching_strings = [s for s in dictionary_of_cities_href['list_of_cities'] if pattern.search(s)]
    
print(f"You have selected - {matching_strings[0]}")

# Create and use the endpoint for the user input city to fetch the bike station data

endpoint = 'https://api.citybik.es/'+ dictionary_of_cities_href['href'][dictionary_of_cities_href['list_of_cities'].index(matching_strings[0])]

request_station_date = requests.get(endpoint)

#print(request_station_date.status_code)

#print(json.dumps(request_station_date.json(), sort_keys=True, indent=4))

Parse through the response to get the details you want for the bike stations in that city (latitude, longitude, number of bikes).

In [None]:
# Relevant Fields 
# Station_Id, Station_Name, Latitude, Longitude, Timestamp, City, Country, Total_Slots, Total Available_Bikes, Available Ebikes, 
# Available Normal Bikes

bike_dataframe_dictionary = {
    'Station_Id': [], # network (d) -> stations (ld) -> id (k)
    'Station_Name': [], # network (d) -> stations (ld) -> name (k)
    'Latitude': [], # network (d) -> stations (ld) -> latitude (k)
    'Longitude': [], # network (d) -> stations (ld) -> longitude (k) 
    'Timestamp': [], # network (d) -> stations (ld) -> timestamp (k)  
    'City': [], # network (d) -> location (d) -> city (k)
    'Country': [], # network (d) -> location (d) -> country (k)
    'Total_Available_Slots': [], # network (d) -> stations (ld) -> extra (d) -> slots (k)
    'Total_Available_Free_Bikes': [], # network (d) -> stations (ld) -> free_bikes (k)
    'Total_Available_EBikes': [], # network (d) -> stations (ld) -> extra (d) -> ebikes (k)
    #'Total_Available_Normal_Bikes': [], # network (d) -> stations (ld) -> extra (d) -> normal_bikes (k)
    'Total_Available_Empty_Slots': [] # network (d) -> stations (ld) -> empty_slots (k)
}



for i in request_station_date.json()['network'].keys():
    if 'stations' in i:
        for j in range(len(request_station_date.json()['network'][i])):
            for k in request_station_date.json()['network'][i][j].keys():
                if 'id' == k:
                    bike_dataframe_dictionary['Station_Id'].append(request_station_date.json()['network'][i][j][k])
                if 'name' == k:
                    bike_dataframe_dictionary['Station_Name'].append(request_station_date.json()['network'][i][j][k])
                if 'latitude' == k:
                    bike_dataframe_dictionary['Latitude'].append(request_station_date.json()['network'][i][j][k])
                if 'longitude' == k:
                    bike_dataframe_dictionary['Longitude'].append(request_station_date.json()['network'][i][j][k])
                if 'timestamp' == k:
                    bike_dataframe_dictionary['Timestamp'].append(request_station_date.json()['network'][i][j][k])
                if 'empty_slots' == k:
                    bike_dataframe_dictionary['Total_Available_Empty_Slots'].append(request_station_date.json()['network'][i][j][k])
                if 'free_bikes' == k:
                    bike_dataframe_dictionary['Total_Available_Free_Bikes'].append(request_station_date.json()['network'][i][j][k])
                if 'extra' == k:
                    for m in request_station_date.json()['network'][i][j][k].keys():
                        if 'slots' == m:
                            bike_dataframe_dictionary['Total_Available_Slots'].append(request_station_date.json()['network'][i][j][k][m])
                        if 'ebikes' == m:
                            bike_dataframe_dictionary['Total_Available_EBikes'].append(request_station_date.json()['network'][i][j][k][m])
                        # if 'normal_bikes' == m:
                        #     bike_dataframe_dictionary['Total_Available_Normal_Bikes'].append(request_station_date.json()['network'][i][j][k][m])
    if 'location' == i:
        for l in request_station_date.json()['network'][i].keys():
            if 'city' == l:
                bike_dataframe_dictionary['City'].append(request_station_date.json()['network'][i][l])
            if 'country' == l:
                bike_dataframe_dictionary['Country'].append(request_station_date.json()['network'][i][l])



Put your parsed results into a DataFrame.

In [None]:
#print(bike_dataframe_dictionary)
bike_dataframe_city_country = pd.DataFrame({'id': 1, 
                                           'City': bike_dataframe_dictionary['City'],
                                           'Country': bike_dataframe_dictionary['Country']})
bike_dataframe_station_details = pd.DataFrame({'id': 1, 
                                               'Station_Id': bike_dataframe_dictionary['Station_Id'],
                                                'Station_Name': bike_dataframe_dictionary['Station_Name'],
                                                'Latitude': bike_dataframe_dictionary['Latitude'],
                                                'Longitude': bike_dataframe_dictionary['Longitude'], 
                                                'Timestamp': bike_dataframe_dictionary['Timestamp'],  
                                                'Total_Available_Slots': bike_dataframe_dictionary['Total_Available_Slots'],
                                                'Total_Available_Free_Bikes': bike_dataframe_dictionary['Total_Available_Free_Bikes'],
                                                'Total_Available_EBikes': bike_dataframe_dictionary['Total_Available_EBikes'],
                                                #'Total_Available_Normal_Bikes': bike_dataframe_dictionary['Total_Available_Normal_Bikes'],
                                                'Total_Available_Empty_Slots': bike_dataframe_dictionary['Total_Available_Empty_Slots']})

bike_dataframe_city_country = pd.DataFrame({'id': 1, 
                                           'City': bike_dataframe_dictionary['City'],
                                           'Country': bike_dataframe_dictionary['Country']})
bike_dataframe_station_details = pd.DataFrame({'id': 1, 
                                               'Station_Id': bike_dataframe_dictionary['Station_Id'],
                                                'Station_Name': bike_dataframe_dictionary['Station_Name'],
                                                'Latitude': bike_dataframe_dictionary['Latitude'],
                                                'Longitude': bike_dataframe_dictionary['Longitude'], 
                                                'Timestamp': bike_dataframe_dictionary['Timestamp'],  
                                                'Total_Available_Slots': bike_dataframe_dictionary['Total_Available_Slots'],
                                                'Total_Available_Free_Bikes': bike_dataframe_dictionary['Total_Available_Free_Bikes'],
                                                'Total_Available_EBikes': bike_dataframe_dictionary['Total_Available_EBikes'],
                                                #'Total_Available_Normal_Bikes': bike_dataframe_dictionary['Total_Available_Normal_Bikes'],
                                                'Total_Available_Empty_Slots': bike_dataframe_dictionary['Total_Available_Empty_Slots']})
bike_dataframe = pd.merge(bike_dataframe_city_country, bike_dataframe_station_details, on = 'id', how = 'outer')
print(bike_dataframe.shape)
bike_dataframe 

In [None]:

bike_dataframe.to_csv('City_Bike_Paris_Data_9th.csv')


## Google MAPS API Call

In [None]:
#!pip install googlemaps
#import googlemaps

# Function to check the googlemaps version
def main():
    print(f"googlemaps version: {googlemaps.__version__}")

if __name__ == "__main__":
    main()

# Load the Google Maps Key
google_maps_key = os.getenv('Google_Places_API_Key')

# Initialize Google Maps Client with API Key
gmaps = googlemaps.Client(key=google_maps_key)

# Set Pandas display options (Use cautiously with large datasets)
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None) 

# Read in data
city_bike_df = pd.read_csv(r'City_Bike_Paris_Data_9th.csv')
sample_df = city_bike_df[['Latitude', 'Longitude']].drop_duplicates()

# Function to check if a value matches any in a list
def check_matching_value(lst, value):
    return value in lst

# Function to fetch results from Google Maps API and append to a list
def fetch_gmaps_results(lat, long, df_list):
    df_check = pd.DataFrame()
    radius = 1000
    data = gmaps.places_nearby((lat, long), radius)
    if data['status'] == 'OK':
        df_check = pd.DataFrame(pd.json_normalize(data['results']))
        df_check['latitude'] = lat
        df_check['longitude'] = long
        selected_columns = ['place_id', 'latitude', 'longitude', 'price_level', 'rating', 'user_ratings_total', 'types']
        for col in selected_columns:
            if col not in df_check.columns:
                df_check[col] = np.nan
        df_check = df_check[selected_columns]
        list_POI = ['lodging', 'airport', 'library', 'amusement_park', 'light_rail_station', 'aquarium', 'bus_station', 'casino', 'shopping_mall', 'stadium', 'subway_station', 'tourist_attraction', 'train_station', 'transit_station']
        for search_value in list_POI:
            df_check[search_value] = df_check['types'].apply(lambda x: check_matching_value(x, search_value))
        df_list.append(df_check)

# Initialize an empty list for storing DataFrames
df_list = []

# Loop over each row in sample_df to fetch Google Maps data
for index, row in sample_df.iterrows():
    fetch_gmaps_results(row['Latitude'], row['Longitude'], df_list)

# Concatenate all DataFrames in the list into df_final
df_final = pd.concat(df_list, ignore_index=True)


In [None]:
# Further processing can be done on df_final as needed

df_final.to_csv(r'Gmaps_Data_9th.csv', index = False)

## YELP API CALL

Function for API Call to Yelp


In [17]:
# Function to get restaurant ratings

response_variable = ''

def get_restaurant_ratings(lat, lng, api_key):
    url = 'https://api.yelp.com/v3/businesses/search'
    headers = {'Authorization': 'Bearer %s' % api_key}
    params = {'latitude': lat, 'longitude': lng, 'limit': 50}
    global response_variable
    try:
        response = requests.get(url, params=params, headers=headers)
        response_variable = response
        response.raise_for_status()  # Raises an HTTPError if the HTTP request returned an unsuccessful status code
        data = pd.DataFrame(pd.json_normalize(response.json()['businesses']))
        if not data.empty:
            new_df = data[data.distance <= 1000]
            new_df['Latitude'] = lat 
            new_df['Longitude'] = lng
            return new_df
    except requests.exceptions.HTTPError as errh:
        print(f"Http Error: {errh}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")



Part 1 of Fetching the results (Yelp allows 500 API calls per key per user in 24 hours). We have 1462 data points so will have to run the code and save the datasets in batches. 


In [None]:

# Read the initial bike data
bike_dataframe = pd.read_csv(r'City_Bike_Paris_Data_9th.csv')
bike_dataframe_lat_long = bike_dataframe[['Latitude', 'Longitude']].drop_duplicates()


# Batch 1
# Your Yelp API Key
api_key = os.getenv('Yelp_API_Key_2')

all_ratings = []  # List to store all ratings dataframes

# Iterate over rows efficiently
for _, row in bike_dataframe_lat_long.iterrows():
    lat, lng = row['Latitude'], row['Longitude']
    ratings_df = get_restaurant_ratings(lat, lng, api_key)
    all_ratings.append(ratings_df)

complete_ratings_df = pd.DataFrame()

# Concatenate all results
complete_ratings_df = pd.concat(all_ratings, ignore_index=True)
print(complete_ratings_df.shape)
complete_ratings_df.head(5)


In [None]:
# Save to CSV
complete_ratings_df.to_csv(r'Yelp_Lat_Long_Matching_Dataset_Part1.csv', index=False)

Part 2 of Fetching the results (Yelp allows 500 API calls per key per user in 24 hours). We have ~900 data points so will have to run the code and save the datasets in batches. 


In [None]:
# Read the Part 1 dataset 
part_1_ratings_df = pd.read_csv(r'Yelp_Lat_Long_Matching_Dataset_Part1.csv', index=False)

# Group by and count
ratings_df_agg = part_1_ratings_df.groupby(['Latitude', 'Longitude']).size().reset_index(name='counts')

# Merge with the bike data
merged = bike_dataframe_lat_long.merge(ratings_df_agg, on=['Latitude', 'Longitude'], how='outer', indicator=True)

# Create the set of remaining lat long pairs for which the API call needs to be done
batch2_lat_long = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

batch2_lat_long.shape

# Your Yelp API Key
api_key = os.getenv('Yelp_API_Key_2')

all_ratings = []  # List to store all ratings dataframes
ratings_df = pd.DataFrame()

# Iterate over rows efficiently
for _, row in batch2_lat_long.iterrows():
    lat, lng = row['Latitude'], row['Longitude']
    ratings_df = get_restaurant_ratings(lat, lng, api_key)
    all_ratings.append(ratings_df)

complete_ratings_df = pd.DataFrame()

# Concatenate all results
complete_ratings_df = pd.concat(all_ratings, ignore_index=True)
print(complete_ratings_df.shape)
complete_ratings_df.head(5)


In [None]:
# Save to CSV
complete_ratings_df.to_csv(r'C:\Path\To\Yelp_Lat_Long_Matching_Dataset_Part2.csv', index=False)

Part 3 of Fetching the results (Yelp allows 500 API calls per key per user in 24 hours). We have 499 data points so will have to run the code and save the datasets in batches. 


In [18]:
# Path needs to be updated

# Read the Part 1 and 2 dataset 

part_1_ratings_df = pd.read_csv(r'Yelp_Lat_Long_Matching_Dataset_Part1.csv')
part_2_ratings_df = pd.read_csv(r'Yelp_Lat_Long_Matching_Dataset_Part2.csv')

# Concatenate the 2
merged_ratings_df = pd.concat([part_1_ratings_df, part_2_ratings_df], axis = 0)

# Group by and count
ratings_df_agg = merged_ratings_df.groupby(['Latitude', 'Longitude']).size().reset_index(name='counts')

# Merge with the bike data
merged = bike_dataframe_lat_long.merge(ratings_df_agg, on=['Latitude', 'Longitude'], how='outer', indicator=True)

# Create the set of remaining lat long pairs for which the API call needs to be done
batch3_lat_long = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

batch3_lat_long.shape

# # Your Yelp API Key
api_key = os.getenv('Yelp_Key')
api_key

'pxDjF-nXu-yBhcR8qlY3UyZHhKnM1fvLKqUTqnhHtaJ8iJiylosBoPbtwscvktDBTk9LibvLPZZlg6b9HcJ3wJRBIyLlnJKTwQ90v-vx93zzfCozTW-qlR4sVv6eZXYx'

In [26]:


all_ratings = []  # List to store all ratings dataframes

# Iterate over rows efficiently
for _, row in batch3_lat_long.iterrows():
    lat, lng = row['Latitude'], row['Longitude']
    ratings_df = get_restaurant_ratings(lat, lng, api_key)
    all_ratings.append(ratings_df)


#Concatenate all results
complete_ratings_df = pd.concat(all_ratings, ignore_index=True)
print(complete_ratings_df.shape)
complete_ratings_df.head(5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Latitude'] = lat
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Longitude'] = lng
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Latitude'] = lat
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See t

Http Error: 429 Client Error: Too Many Requests for url: https://api.yelp.com/v3/businesses/search?latitude=48.87308754&longitude=2.413338013&limit=50
(7284, 26)


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,transactions,...,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,Latitude,Longitude
0,QcpxI7Am6g6mc0D20w0Gng,il-etait-un-square-paris-6,Il Etait un Square,https://s3-media4.fl.yelpcdn.com/bphoto/0GLWzN...,False,https://www.yelp.com/biz/il-etait-un-square-pa...,47,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",5.0,[],...,54 rue Corvisart,,,Paris,75013,FR,75,"[54 rue Corvisart, 75013 Paris, France]",48.830982,2.348165
1,paX4EDnCsGbKPyS7Jcml_A,chez-gladines-paris,Chez Gladines,https://s3-media2.fl.yelpcdn.com/bphoto/h0pVDJ...,False,https://www.yelp.com/biz/chez-gladines-paris?a...,256,"[{'alias': 'basque', 'title': 'Basque'}, {'ali...",4.0,[],...,30 rue des Cinq Diamants,,,Paris,75013,FR,75,"[30 rue des Cinq Diamants, 75013 Paris, France]",48.830982,2.348165
2,J1uJVDBqr9SbXz0RnUG-tQ,le-bistro-v-paris,Le Bistro V,https://s3-media3.fl.yelpcdn.com/bphoto/JSBhme...,False,https://www.yelp.com/biz/le-bistro-v-paris?adj...,26,"[{'alias': 'bistros', 'title': 'Bistros'}, {'a...",5.0,[],...,56 bd de Port-Royal,,,Paris,75005,FR,75,"[56 bd de Port-Royal, 75005 Paris, France]",48.830982,2.348165
3,PUmsiCvU9a8gYgt06aoAdQ,le-sirocco-paris,Le Sirocco,https://s3-media4.fl.yelpcdn.com/bphoto/HAMXia...,False,https://www.yelp.com/biz/le-sirocco-paris?adju...,46,"[{'alias': 'moroccan', 'title': 'Moroccan'}]",4.5,[],...,8 bis rue Gobelins,,,Paris,75013,FR,75,"[8 bis rue Gobelins, 75013 Paris, France]",48.830982,2.348165
4,PI6edk8v1mpFWpCS56e-0g,le-vaudésir-paris,Le Vaudésir,https://s3-media3.fl.yelpcdn.com/bphoto/D_rJqD...,False,https://www.yelp.com/biz/le-vaud%C3%A9sir-pari...,25,"[{'alias': 'bistros', 'title': 'Bistros'}]",5.0,[],...,41 rue Dareau,,,Paris,75014,FR,75,"[41 rue Dareau, 75014 Paris, France]",48.830982,2.348165


In [27]:

# Save to CSV
complete_ratings_df.to_csv(r'Yelp_Lat_Long_Matching_Dataset_Part3.csv', index=False)




# Population Data - 
https://data.humdata.org/dataset/france-high-resolution-population-density-maps-demographic-estimates?

In [2]:
# Population Data
import pandas as pd
check = pd.read_csv(r'C:\Users\aksha\Downloads\population_fra_2019-07-01.csv\population_fra_2019-07-01.csv')
check.head(100)
check.to_csv(r'Paris_Population_Data.csv', index=False)