In [19]:
# Imports
import pandas as pd
import json
import requests
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import math

Join the data from Part 1 with the data from Part 2 to create a new dataframe.

In [20]:
# Citi Bike dataframe
# Code copied from citi_bike_completed file. Code comments are in citi_bike_completed file 

url = "http://api.citybik.es/v2/networks/velov"
response = requests.get(url)
data = response.json()

pretty_json = json.dumps(data, indent=2)

stations=data['network']['stations']

df_stations=pd.DataFrame(stations)


In [13]:
# According to Google 1∘ of latitude is 10,000/90=111 kilometers, and 0.001∘=0.111 kilometers or 111 meters
# I will select 50 bike stations for sample
# Since I chose before "center point" with latitude 45.7535 and longitude 4.8380 for FourSqr and Yelp searches
# I will choose the same center point for bike stations
# The radius for "bike station sample" will be the same as for FourSqr and Yelp, and it will be 40,000m
# Thus, all bike stations which are in a range latitude 45.7535° +/- 0.3604° and longitude 4.8380° +/- 0.3604° will be in sample

In [21]:
# To select bike station sample
selected_rows_latitude = df_stations[(df_stations["latitude"] >= 45.393140) & (df_stations["latitude"] <= 46.113860)]
selected_rows_longitude=selected_rows_latitude[
                                                (selected_rows_latitude["longitude"] >= 4.477640) 
                                               & (selected_rows_latitude["longitude"] <= 5.198360)
                                                ]
selected_rows_longitude
# The result gave me the same # of rows as in original dataframe
# Since the result is the same as original dataframe
# sample will be from first 50 rows

bike_sample=df_stations.head(50)


In [22]:
bike_sample.head(5)

Unnamed: 0,empty_slots,extra,free_bikes,id,latitude,longitude,name,timestamp
0,1,{'address': 'Place de la Paix - En face de La ...,18,2c40da662e019fb95f6d9ae00d6ce0e5,45.768023,4.831733,1031 - PLACE DE LA PAIX,2023-06-05T17:32:38.288000Z
1,8,{'address': 'Pierre Corneille/cours Franklin R...,11,a0df288b785197f14c37b0e8bf604bfb,45.769383,4.842364,6022 - PLACE MARÉCHAL LYAUTEY,2023-06-05T17:32:38.301000Z
2,8,{'address': 'Rue Roquette / angle rue de Bourg...,8,d8b49c9e23c2be797404310c6900f668,45.775679,4.805314,9003 - PLACE VALMY,2023-06-05T17:32:38.348000Z
3,10,{'address': 'Intersection Ferrandière/Jasseron...,10,f0c678190ffad3ee39a3c64ef0fffd61,45.759488,4.869346,3021 - PLACE DE LA FERRANDIÈRE,2023-06-05T17:32:38.609000Z
4,26,"{'address': '', 'banking': False, 'bonus': Fal...",4,f7ef88e4d393b48800f7be6856448bef,45.747835,4.872261,3070 - REBATEL / SAINT-MAXIMIN,2023-06-05T17:32:38.622000Z


In [23]:
# FourSqr dataframe

url = "https://api.foursquare.com/v3/places/search?ll=45.7535%2C4.8380&radius=40000&fields=name%2Cgeocodes%2Cdistance%2Crating%2Clocation%2Ccategories&sort=POPULARITY&limit=50"
headers = {
    "accept": "application/json",
    "Authorization": "fsq34wlOTO05hX+GoyoJT2bpzJA8WEKSwh5UNexig8A72jM="
}

response = requests.get(url, headers=headers)
json_data = json.loads(response.text)

POI=json_data

category_name_list=[]
for objects in POI["results"]:
    category_name=objects["categories"][0]["name"]
    category_name_list.append(category_name)
   
distance_list=[]
for objects in POI["results"]:
    distance=objects["distance"]
    distance_list.append(distance)
    
latitude_list=[]
for objects in POI["results"]:
    latitude=objects["geocodes"]["main"]["latitude"]
    latitude_list.append(latitude)
   
longitude_list=[]
for objects in POI["results"]:
    longitude=objects["geocodes"]["main"]["longitude"]
    longitude_list.append(longitude)
    
locality_list=[]
for objects in POI["results"]:
    locality=objects["location"]["locality"]
    locality_list.append(locality)
    
postcode_list=[]
for objects in POI["results"]:
    postcode=objects["location"].get("postcode")
    postcode_list.append(postcode)
    
region_list=[]
for objects in POI["results"]:
    region=objects["location"]["region"]
    region_list.append(region)
    
place_name_list=[]
for objects in POI["results"]:
    place_name=objects["name"]
    place_name_list.append(place_name)
    
rating_list=[]
for objects in POI["results"]:
    rating=objects["rating"]
    rating_list.append(rating)

POI_data={
    "business_name":place_name_list,
    "category_name":category_name_list,
    "distance":distance_list,
    "rating":rating_list,
    "latitude":latitude_list,
    "longitude":longitude_list,  
    "locality":locality_list,
#    "region":region_list,
    "postcode":postcode_list,        
}

# index set to 50 since we have limit of 50 business from Foursquare
index_values = []
for i in range (1,51):
    index_values.append(i)

# create FourSquare dataframe    
FourSqr_df=pd.DataFrame(POI_data,index=index_values)

In [24]:
FourSqr_df.head(5)


Unnamed: 0,business_name,category_name,distance,rating,latitude,longitude,locality,postcode
1,Auberge du Pont de Collonges-Paul Bocuse,French Restaurant,6940,8.1,45.815712,4.847452,Collonges-au-Mont-d'Or,69660
2,La Demeure du Chaos,Art Museum,9366,8.2,45.837487,4.826422,Saint-Romain-au-Mont-d'Or,69270
3,Guy Lassausaie,Fast Food Restaurant,14243,8.0,45.872851,4.770494,Chasselay,69380
4,Zoo de Saint-Martin-la-Plaine,Zoo,30022,8.4,45.549991,4.583793,Saint-Martin-la-Plaine,42800
5,Place Bellecour,Plaza,623,9.1,45.757567,4.832622,Lyon,69002


In [28]:
#bike_sample
# I created new columns for "center_point" From this point I will be calculating difference in km to bike stations

# Assign center_point. Randomly celected bike station in the downtown Lyon, France
center_point_lat=45.7535
center_point_long=4.8380
# will also calculate total slots per bike stations
total_slots=0

# Inserting column into bike_sample df
bike_sample["center_point_lat"] = center_point_lat
bike_sample["center_point_long"] = center_point_long
bike_sample["total_slots"] = total_slots

# will calculate the difference between "center_point" and bike stations
for index, row in bike_sample.iterrows():
    # #0.001∘=0.111 kilometers or 111 meters. Result in m
    difference_in_m_lat = float((center_point_lat - row["latitude"]) * 111 / 0.001)
    difference_in_m_long = float((center_point_long - row["longitude"]) * 111 / 0.001)
   
    
    # Pythagorean theorem in m
    distance_from_center = math.sqrt(difference_in_m_lat ** 2 + difference_in_m_long ** 2)
    

    #calculating station total slots
    total_slots = float(row["empty_slots"]+row["free_bikes"])
    
    # Assign calculated values to rows
    bike_sample.at[index, "difference_in_m_lat"] = difference_in_m_lat
    bike_sample.at[index, "difference_in_m_long"] = difference_in_m_long
    bike_sample.at[index, "difference_in_m_from_center"] = distance_from_center
    bike_sample.at[index, "total_slots"] = total_slots 

# To check result from above "for loop"
#print(bike_sample["difference_in_m_from_center"])
    
# Create new df wich will be used for joining
select_columns=[
    "empty_slots",
    "free_bikes",
    "total_slots",
    "name",
    "latitude",
    "longitude",
    "difference_in_m_from_center"
]

clean_bike_sample=bike_sample[select_columns].copy()
#clean_bike_sample

#sort by distance from center point
sorted_clean_bike_sample = clean_bike_sample.sort_values("difference_in_m_from_center")
#sorted_clean_bike_sample

#to save file
sorted_clean_bike_sample.to_csv(r"C:\Users\Anna\OneDrive\Desktop\New folder\citi_bike.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_sample["center_point_lat"] = center_point_lat
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_sample["center_point_long"] = center_point_long
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_sample["total_slots"] = total_slots


In [29]:
# Will use FourSqr df to join with bike station df
# first I will create column distance_from_center ( the same how i did for bike station df)

center_point_lat=45.7535
center_point_long=4.8380

# Inserting column into FourSqr_df df
FourSqr_df["center_point_lat"] = center_point_lat
FourSqr_df["center_point_long"] = center_point_long
FourSqr_df["total_slots"] = total_slots

# will calculate the difference between "center_point" and businesses
for index, row in FourSqr_df.iterrows():
    # #0.001∘=0.111 kilometers or 111 meters. Result in m
    difference_in_km_lat = float((center_point_lat - row["latitude"]) * 111 / 0.001)
    difference_in_km_long = float((center_point_long - row["longitude"]) * 111 / 0.001)
    
    
    # Pythagorean theorem
    distance_from_center = math.sqrt(difference_in_km_lat ** 2 + difference_in_km_long ** 2)
       
        
    # Assign calculated values to rows
    FourSqr_df.at[index, "difference_in_m_lat"] = difference_in_km_lat
    FourSqr_df.at[index, "difference_in_m_long"] = difference_in_km_long
    FourSqr_df.at[index, "difference_in_m_from_center"] = distance_from_center
    
    
# to select specific columns from FourSqr
select_columns_FourSqr=[
    "category_name",
    "distance",
    "rating",
    "latitude",
    "longitude",
    "difference_in_m_from_center"
]

clean_FourSqr_db=FourSqr_df[select_columns_FourSqr].copy()
#clean_FourSqr_db

#sort by distance from center point
sorted_clean_FourSqr_db = clean_FourSqr_db.sort_values("difference_in_m_from_center")
#sorted_clean_FourSqr_db
sorted_clean_FourSqr_db.to_csv(r"C:\Users\Anna\OneDrive\Desktop\New folder\FourSqr.csv", index=False)

In [None]:
# Join Bike Sample with FourSqr 
bike_FourSqr=pd.concat([sorted_clean_bike_sample,sorted_clean_FourSqr_db])


#to sort new db by difference from center point
bike_FourSqr_sorted=bike_FourSqr.sort_values("difference_in_m_from_center")
bike_FourSqr_sorted.head(8)

In [None]:
# find NaN values in a specific column
column_isna = bike_FourSqr_sorted["distance"].isna()

# Replace NaN values in 'distance' with values from 'replacement_column'
bike_FourSqr_sorted.loc[column_isna, "distance"] = bike_FourSqr_sorted.loc[column_isna, "difference_in_m_from_center"]
bike_FourSqr_sorted

Provide a visualization that you used as part of your EDA process. Explain the initial pattern or relationship you discoved through this visualization. 

In [None]:
# To select specific columns
selected_columns=bike_FourSqr_sorted[["total_slots","distance","rating"]]

cmap = sns.color_palette("pastel")
corr=selected_columns.corr()
sns.heatmap(corr,cmap=cmap,annot=True)
plt.show()

# results: we cannot see strong correclation between rating of the places to how close it is to downtown 


In [None]:
# to see if there is relation between total number of slots per station 
#and how far station is away from center point(downtown Lyon)
sorted_clean_bike_sample.plot(kind='scatter', x='difference_in_m_from_center', y='total_slots')

# results: we cannot see correlation between total slots in bike station and how far it is away from downtown Lyon

# Database

Put all your results in an SQLite3 database (remember, SQLite stores its databases as files in your local machine - make sure to create your database in your project's data/ directory!)

Look at the data before and after the join to validate your data.