<h1>Google Geocoding API Notebook</h1>

<p>This notebook calls the Google Geocoding API to convert the addresses that were retrieved during the web scraping into latitude and longitude coordinates. These coordinates can then be used later for further analysis.</p>

In [2]:
import pandas as pd

# Path of the semicolon-separated location table, relative to this notebook.
filepath = '../TablesDB/Location.csv'

# `sep` is the primary name of the option that `delimiter` aliases.
df = pd.read_csv(filepath, sep=";")

# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,LocationId,Street,ZIP,longitude,latitude
0,78753131-6d78-4d1e-a0e5-fc8b5f430570,,6598.0,,
1,f0c2bc0e-55ab-4eb1-98fa-edcb4a0ee01e,via albonago 43,6962.0,,
2,fbdcde66-0fd7-4304-a83e-70ecbb0f6ae7,Via San Gottardo 41,6500.0,,
3,2d7451a3-d7e1-4dc7-babf-ab237f2c02e4,Via F. Zorzi 17,6500.0,,
4,2bde1434-f480-4d46-9b05-99e0c0659671,Via San Gottardo,6900.0,,
...,...,...,...,...,...
11347,2dba2402-d23f-4ab8-9bb6-702bbe28cb4e,Chemin de la Petite-Californie,1222.0,,
11348,daed4779-995c-4c34-85e7-4b269d8bf5cc,Chemin William-Barbey,1292.0,,
11349,faddd38c-e9fc-4f4d-bf09-a41818cc620f,Rue du Temple 5,1236.0,,
11350,84d332ee-e1a8-4b5f-a432-82819fffe330,Chemin de la Pierre-à-Bochet,1226.0,,


<h2>Check the data for missing values</h2>

In [3]:
import pandas as pd
# Count missing street addresses and ZIP codes in the raw location table.
missing_count = df['Street'].isna().sum()
missing_zip = df['ZIP'].isna().sum()
datapoints = df['LocationId'].count()

print("Total # of data points:", datapoints)
print("Missing addresses:", missing_count)
print("Missing ZIP codes:", missing_zip)

# Count the number of addresses without a street number, i.e. streets that do
# not end in a digit (so "Via al Pascolo 3B" counts as "no street number").
# `str.match` already anchors at the start of the string, and every street that
# starts and ends with digits also simply ends with digits, so one pattern is
# enough. `na=False` states explicitly that a missing street has no street
# number instead of relying on NaN being coerced to False by `|`.
has_street_number = df['Street'].str.match(r'.*\d+$', na=False)
count_no_street_number = (~has_street_number).sum()

print("Number of addresses without a street number:", count_no_street_number)




Total # of data points: 11352
Missing addresses: 708
Missing ZIP codes: 57
Number of addresses without a street number: 2377


<h2>Convert the addresses into lat/long coordinates using the Google Geocoding API</h2>

In [6]:


import json
import requests
import pandas as pd
import math

import os
import warnings

# Read the Google Maps key from the GOOGLE_MAPS_API_KEY environment variable so
# the secret is never stored in the notebook itself. Export the variable before
# starting Jupyter (or assign a key to `api_key` manually for a one-off run).
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not api_key:
    # Keyless requests are expected to be refused by the API, which would leave
    # every coordinate empty, so make the misconfiguration visible up front.
    warnings.warn("GOOGLE_MAPS_API_KEY is not set; geocoding requests are expected to fail.")


# Request geolocation API


def get_geolocation(address, api_key, timeout=10):
    """Geocode ``address`` with the Google Geocoding API.

    Parameters
    ----------
    address : str
        Free-form address sent as the ``address`` query parameter.
    api_key : str
        Key sent as the ``key`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10),
        so a stalled connection cannot hang a long batch run.

    Returns
    -------
    tuple
        ``(lat, lng)`` of the first result when the response status is
        ``'OK'``; ``(None, None)`` when the request fails, the body is not
        valid JSON, the status is anything else, or the payload lacks a
        usable first result.
    """
    params = {
        'key': api_key,
        'address': address,
    }
    # No trailing '?': requests appends the query string itself.
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # Server response
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report "no result" like any other
        # unsuccessful lookup instead of aborting the caller's loop.
        return None, None

    if not isinstance(data, dict) or data.get('status') != 'OK':
        return None, None

    try:
        location = data['results'][0]['geometry']['location']
        return location['lat'], location['lng']
    except (KeyError, IndexError, TypeError):
        # Status was OK but the payload did not have the expected shape.
        return None, None

# `data` is an alias of `df` (not a copy), so the coordinates written below are
# also visible through `df`.
data = df

# Iterate through each address
for index, row in data.iterrows():
    # Build the query only from the parts that are actually present. A missing
    # street is NaN, and formatting it directly would send the literal text
    # "nan" to the geocoder.
    address_parts = []
    if pd.notna(row['Street']):
        address_parts.append(str(row['Street']))
    if pd.notna(row['ZIP']):
        # ZIP is read as a float (e.g. 6598.0); send it as a plain integer.
        address_parts.append(str(int(row['ZIP'])))

    if address_parts:
        address = ", ".join(address_parts + ['Switzerland'])
        # Get geolocation data
        latitude, longitude = get_geolocation(address, api_key)
    else:
        # Neither street nor ZIP is known: geocoding just "Switzerland" would
        # only return a country-level point, so leave the coordinates empty.
        latitude, longitude = None, None

    # Update Longitude and Latitude columns in the DataFrame
    data.at[index, 'latitude'] = latitude
    data.at[index, 'longitude'] = longitude

# Persist the geocoded table as a comma-separated file without the index column.
filepath_csv = '../TablesDB/Location_v3.csv'
data.to_csv(filepath_csv, index=False)
    






<h2>Check the final result</h2>

In [12]:
import pandas as pd

# Re-read the geocoded table from disk; this file is comma-separated.
filepath = '../TablesDB/Location_v3.csv'

df2 = pd.read_csv(filepath, delimiter=",")

# Sample from the re-loaded file (`df2`) rather than the in-memory `df`, so the
# spot check verifies what was actually written to disk.
check_random = df2.sample(n=5)
check_random

Unnamed: 0,LocationId,Street,ZIP,longitude,latitude
1699,219888c4-e529-4eda-810c-30254bbb2313,rossa,6543.0,9.094702,46.303763
5249,e0567123-aa56-4b79-ab5a-022eefe51d4a,Winterthurerstr. 346,8057.0,8.558359,47.403719
4364,e7209bc2-7710-4fff-8490-42a0a9eb8dd4,Rue des Préels 7B,2036.0,6.871696,46.982866
5269,4ffed803-4180-4b98-8d29-9ea42cb03a47,Sonnentalstrasse 13,8600.0,8.602142,47.397011
1312,d6c29ac9-31e9-4ed0-9b56-42030e617eae,Gerliswilstrasse 26,6020.0,8.279066,47.072576


In [8]:
import pandas as pd
# Count missing longitude / latitude values in the re-loaded, geocoded table.
missing_long = df2['longitude'].isna().sum()
missing_lat = df2['latitude'].isna().sum()
datapoints = df2['LocationId'].count()

print("Total # of data points;", datapoints)
print("Missing long:", missing_long)
print("Missing lat:", missing_lat)

# Count the number of addresses without a street number, i.e. streets that do
# not end in a digit. `str.match` already anchors at the start of the string,
# and every street that starts and ends with digits also simply ends with
# digits, so one pattern is enough. `na=False` states explicitly that a missing
# street has no street number instead of relying on NaN being coerced to False.
has_street_number = df2['Street'].str.match(r'.*\d+$', na=False)
count_no_street_number = (~has_street_number).sum()

print("Number of addresses without a street number:", count_no_street_number)

Total # of data points; 11352
Missing long: 0
Missing lat: 0
Number of addresses without a street number: 2377


In [9]:

# Bounding box used as a plausibility check for Swiss coordinates.
min_lat = 45.8179  # southernmost point of Switzerland
max_lat = 47.8084  # northernmost point of Switzerland
min_lon = 5.9559   # westernmost point of Switzerland
max_lon = 10.4923  # easternmost point of Switzerland

# A row is inside the box only if both coordinates fall within their
# (inclusive) ranges.
lat_in_range = df2['latitude'].between(min_lat, max_lat)
lon_in_range = df2['longitude'].between(min_lon, max_lon)
within_boundary = lat_in_range & lon_in_range

# Rows that fail the check.
outside_boundary_df = df2[~within_boundary]

if not outside_boundary_df.empty:
    print(f"{len(outside_boundary_df)} data points are outside the specified boundary.")

    # List the offending rows.
    print("Data points outside the boundary:")
    print(outside_boundary_df)
else:
    print("All latitude and longitude values are within the specified boundary.")


4 data points are outside the specified boundary.
Data points outside the boundary:
                                LocationId                 Street     ZIP  \
233   52c99267-62f2-4e66-8bcc-ff42bacdb648      Oberneudorf, Baar  6340.0   
3741  8fc1b274-be03-4371-9a20-438f151683ab  St. Benediktstrasse 2  4228.0   
3742  9d0a278e-7b3a-4a18-a286-d6f6723d5ad5  St. Benediktstrasse 2  4228.0   
7640  3974edc2-6435-4d7e-b6a8-73e13ced7f9a      Via al Pascolo 3B  6612.0   

      longitude   latitude  
233    9.267444  49.500309  
3741  10.980853  48.350111  
3742  10.980853  48.350111  
7640   9.309339  45.799605  


In [10]:
# Show the out-of-bounds rows again, this time as a rendered notebook table.
outside_boundary_df

Unnamed: 0,LocationId,Street,ZIP,longitude,latitude
233,52c99267-62f2-4e66-8bcc-ff42bacdb648,"Oberneudorf, Baar",6340.0,9.267444,49.500309
3741,8fc1b274-be03-4371-9a20-438f151683ab,St. Benediktstrasse 2,4228.0,10.980853,48.350111
3742,9d0a278e-7b3a-4a18-a286-d6f6723d5ad5,St. Benediktstrasse 2,4228.0,10.980853,48.350111
7640,3974edc2-6435-4d7e-b6a8-73e13ced7f9a,Via al Pascolo 3B,6612.0,9.309339,45.799605


In [11]:


import json
import requests
import pandas as pd

import os
import warnings

# Read the Google Maps key from the GOOGLE_MAPS_API_KEY environment variable so
# the secret is never stored in the notebook itself. Export the variable before
# starting Jupyter (or assign a key to `api_key` manually for a one-off run).
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not api_key:
    # Keyless requests are expected to be refused by the API, which would leave
    # every coordinate empty, so make the misconfiguration visible up front.
    warnings.warn("GOOGLE_MAPS_API_KEY is not set; geocoding requests are expected to fail.")


# Request geolocation API


def get_geolocation(address, api_key, timeout=10):
    """Geocode ``address`` with the Google Geocoding API.

    Parameters
    ----------
    address : str
        Free-form address sent as the ``address`` query parameter.
    api_key : str
        Key sent as the ``key`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10),
        so a stalled connection cannot hang the calling loop.

    Returns
    -------
    tuple
        ``(lat, lng)`` of the first result when the response status is
        ``'OK'``; ``(None, None)`` when the request fails, the body is not
        valid JSON, the status is anything else, or the payload lacks a
        usable first result.
    """
    params = {
        'key': api_key,
        'address': address,
    }
    # No trailing '?': requests appends the query string itself.
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # Server response
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report "no result" like any other
        # unsuccessful lookup instead of aborting the caller's loop.
        return None, None

    if not isinstance(data, dict) or data.get('status') != 'OK':
        return None, None

    try:
        location = data['results'][0]['geometry']['location']
        return location['lat'], location['lng']
    except (KeyError, IndexError, TypeError):
        # Status was OK but the payload did not have the expected shape.
        return None, None

# Detach the out-of-bounds rows from `df2` before writing to them:
# `outside_boundary_df` was obtained by boolean-mask selection, and assigning
# into such a selection is something pandas may flag as chained assignment.
# `data` stays an alias of `outside_boundary_df`, so both names see the updates.
outside_boundary_df = outside_boundary_df.copy()
data = outside_boundary_df

# Iterate through each address
for index, row in data.iterrows():
    # Build the query only from the parts that are present: `int()` raises on
    # a NaN ZIP, and a NaN street would be sent as the literal text "nan".
    address_parts = []
    if pd.notna(row['Street']):
        address_parts.append(str(row['Street']))
    if pd.notna(row['ZIP']):
        address_parts.append(str(int(row['ZIP'])))

    if address_parts:
        address = ", ".join(address_parts + ['Switzerland'])
        # Get geolocation data
        latitude, longitude = get_geolocation(address, api_key)
        print(address)
    else:
        # Nothing to geocode for this row; leave its coordinates empty.
        latitude, longitude = None, None

    # Update Longitude and Latitude columns in the DataFrame
    data.at[index, 'latitude'] = latitude
    data.at[index, 'longitude'] = longitude


# Display the re-geocoded rows.
data




Oberneudorf, Baar, 6340, Switzerland
St. Benediktstrasse 2, 4228, Switzerland
St. Benediktstrasse 2, 4228, Switzerland
Via al Pascolo 3B, 6612, Switzerland


Unnamed: 0,LocationId,Street,ZIP,longitude,latitude
233,52c99267-62f2-4e66-8bcc-ff42bacdb648,"Oberneudorf, Baar",6340.0,9.267444,49.500309
3741,8fc1b274-be03-4371-9a20-438f151683ab,St. Benediktstrasse 2,4228.0,10.980853,48.350111
3742,9d0a278e-7b3a-4a18-a286-d6f6723d5ad5,St. Benediktstrasse 2,4228.0,10.980853,48.350111
7640,3974edc2-6435-4d7e-b6a8-73e13ced7f9a,Via al Pascolo 3B,6612.0,9.309339,45.799605
