# Coordenadas de pueblos y ciudades

En este apartado usamos las APIs de OpenCage y Nominatim para asignar la latitud y longitud que le corresponde a cada ubicación.

In [2]:
# !pip install geopy
import pandas as pd
import numpy as np
import glob
import os
from opencage.geocoder import OpenCageGeocode
import time
import pickle

Cargamos todos los nombres en las columnas de condado, distrito y pueblos y ciudades:

In [88]:
def load_vars(name_list, folder="../data/Por fecha/"):
    # https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
    # https://stackoverflow.com/questions/26063231/read-specific-columns-with-pandas-or-other-python-module
    all_files = glob.glob(os.path.join(folder, "*.csv"))
    return pd.concat((pd.read_csv(f, usecols=name_list) for f in all_files), ignore_index=True)

In [89]:
df = load_vars(["County", "District", "Town/City"]).apply(lambda x: x.str.lower())
df["District"].replace(to_replace="the wrekin", value="wrekin", inplace=True)
df.head(2)

Unnamed: 0,Town/City,District,County
0,plymouth,plymouth,devon
1,bristol,bristol,avon


In [90]:
df["address"] = df["County"] + ", " + df["District"] + ", " + df["Town/City"]
df.head()

Unnamed: 0,Town/City,District,County,address
0,plymouth,plymouth,devon,"devon, plymouth, plymouth"
1,bristol,bristol,avon,"avon, bristol, bristol"
2,hessle,the east yorkshire borough of beverley,humberside,"humberside, the east yorkshire borough of beve..."
3,wakefield,wakefield,west yorkshire,"west yorkshire, wakefield, wakefield"
4,corby,corby,northamptonshire,"northamptonshire, corby, corby"


In [15]:
adresses = df["address"].unique()
len(adresses)

7563

In [36]:
adresses_batch = {}
batch_size = 50
count = 0
while count * batch_size < len(adresses):
    first = count*batch_size
    last = (count+1)*batch_size
    adresses_batch[str(count)] = adresses[first:last]

    count += 1
    print(f"\rNumber of rows stored: {count*batch_size}", end="")

Number of rows stored: 7600

## Extracción de datos de la API Geocode

Tenemos 2000 peticiones diarias, por lo que vamos a hacer batches más pequeños. De esta forma, podemos evaluar con mayor flexibilidad y, en caso de que haya errores, podemos localizar mejor el problema.

In [1]:
# !pip install geopy
import pandas as pd
import numpy as np
import glob
import os
from opencage.geocoder import OpenCageGeocode
import time
import pickle
batch_size = 50

In [2]:
with open('../output/addresses_batch.pkl', 'rb') as f:
    adresses_batch = pickle.load(f)
adresses_batch["151"]

array(['north yorkshire, north yorkshire, saltburn-by-the-sea',
       'cumberland, cumberland, holmrook',
       'somerset, somerset, hinton st george',
       'north yorkshire, north yorkshire, pontefract',
       'cumberland, cumberland, seascale',
       'cumberland, cumberland, moor row', 'york, york, pickering',
       'north yorkshire, north yorkshire, darlington',
       'westmorland and furness, westmorland and furness, keswick',
       'somerset, somerset, beaminster',
       'cumberland, cumberland, ravenglass',
       'west berkshire, west berkshire, chipping norton',
       'somerset, somerset, winscombe'], dtype=object)

In [3]:
with open('../output/API_KEY.pkl', 'rb') as f:
    API_KEY = pickle.load(f)

geocoder = OpenCageGeocode(API_KEY)

# Function to get latitude and longitude
def get_lat_long(adress):
    try:
        result = geocoder.geocode(adress)
        time.sleep(1)  # Add a delay of 1 second to respect usage limits
        if result and len(result):
            return result[0]['geometry']['lat'], result[0]['geometry']['lng']
        else:
            return None, None
    except Exception as e:
        print(f"Error: {e}")
        return None, None

In [4]:
len(adresses_batch.keys())

152

In [5]:
def control_batches(ini_idx, end_idx, batches=adresses_batch):
    for idx in np.arange(ini_idx, end_idx):
        try:
            temp = pd.DataFrame(adresses_batch[str(idx)], columns=["address"])
            temp[['Latitude', 'Longitude']] = 0.0
        except:
            temp = adresses_batch[str(idx)]

        temp[['Latitude', 'Longitude']] = temp['address'].apply(lambda x: pd.Series(get_lat_long(x)))
        batches[str(idx)] = temp

        with open('../output/addresses_batch.pkl', 'wb') as f:
            pickle.dump(adresses_batch, f)
        print(f"\rBatch index computed: {idx}/{end_idx-1}", end="")

        

ini_idx = 148
# end_idx = int(ini_idx+2500/batch_size)
end_idx = 152
control_batches(ini_idx, end_idx)

Batch index computed: 151/151

In [7]:
adresses_batch["151"]

Unnamed: 0,address,Latitude,Longitude
0,"north yorkshire, north yorkshire, saltburn-by-...",54.55402,-0.794541
1,"cumberland, cumberland, holmrook",54.383566,-3.423382
2,"somerset, somerset, hinton st george",50.908519,-2.828491
3,"north yorkshire, north yorkshire, pontefract",53.737847,-1.264782
4,"cumberland, cumberland, seascale",54.398941,-3.477168
5,"cumberland, cumberland, moor row",54.514468,-3.537689
6,"york, york, pickering",44.046893,-79.451373
7,"north yorkshire, north yorkshire, darlington",54.359291,-1.450427
8,"westmorland and furness, westmorland and furne...",54.644739,-2.834147
9,"somerset, somerset, beaminster",50.809,-2.7391


In [82]:
with open('../output/addresses_batch.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

town_city_coords = pd.DataFrame(columns=["address", "Latitude", "Longitude"])
for _, value in adresses_batch.items():
    town_city_coords = pd.concat([town_city_coords, value], axis=0)

town_city_coords = town_city_coords.reset_index()
town_city_coords.drop(columns=["index"], axis=1, inplace=True)
town_city_coords

Unnamed: 0,address,Latitude,Longitude
0,"devon, plymouth, plymouth",50.384416,-4.078946
1,"avon, bristol, bristol",51.449620,-2.557939
2,"humberside, the east yorkshire borough of beve...",,
3,"west yorkshire, wakefield, wakefield",53.683046,-1.496800
4,"northamptonshire, corby, corby",52.488849,-0.688383
...,...,...,...
7558,"westmorland and furness, westmorland and furne...",54.644739,-2.834147
7559,"somerset, somerset, beaminster",50.809000,-2.739100
7560,"cumberland, cumberland, ravenglass",54.353916,-3.410166
7561,"west berkshire, west berkshire, chipping norton",51.941090,-1.545300


## Corregir casos erróneos usando Nominatum (Open Source)

### NAs
Localizamos los casos que no se hayan asignado correctamente.

In [74]:
addresses_NA = town_city_coords.loc[town_city_coords["Latitude"].isna(), :]

temp = addresses_NA["address"].str.split(",", expand=True)
temp.columns = ["County", "District", "Town/City"]

addresses_NA = pd.concat([temp, addresses_NA], axis=1)

addresses_NA.head()

Unnamed: 0,County,District,Town/City,address,Latitude,Longitude
2,humberside,the east yorkshire borough of beverley,hessle,"humberside, the east yorkshire borough of beve...",,
147,humberside,the east yorkshire borough of beverley,cottingham,"humberside, the east yorkshire borough of beve...",,
312,cornwall,kerrier,redruth,"cornwall, kerrier, redruth",,
598,powys,montgomeryshire,llanfyllin,"powys, montgomeryshire, llanfyllin",,
731,humberside,cleethorpes,grimsby,"humberside, cleethorpes, grimsby",,


In [75]:
import requests
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def geocode_address(address):
    base_url = "https://nominatim.openstreetmap.org/search"
    
    # Restructure the address
    address_parts = address.split(', ')
    address_parts.reverse()  # Put the most specific part first
    restructured_address = ", ".join(address_parts)
    
    params = {
        "q": restructured_address,
        "format": "json",
        "limit": 1,
        "countrycodes": "gb",  # Limit results to the UK
        "addressdetails": 1,
        "accept-language": "en"
    }
    
    headers = {
        "User-Agent": "AppName/1.0" 
    }
    
    try:
        response = requests.get(base_url, params=params, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        results = response.json()
        if results:
            return {
                "address": address,
                "latitude": float(results[0]["lat"]),
                "longitude": float(results[0]["lon"]),
                "display_name": results[0].get("display_name")
            }
        else:
            logger.warning(f"No results found for address: {address}")
    except requests.RequestException as e:
        logger.error(f"Error geocoding address '{address}': {str(e)}")
    except (KeyError, IndexError, ValueError) as e:
        logger.error(f"Error parsing result for address '{address}': {str(e)}")
    
    return {"address": address, "latitude": None, "longitude": None, "display_name": None}


addresses = list(addresses_NA["Town/City"].values)


geocoded_results = []

for address in addresses:
    result = geocode_address(address)
    geocoded_results.append(result)
    logger.info(f"Geocoded: {result['address']}")
    logger.info(f"Latitude: {result['latitude']}, Longitude: {result['longitude']}")
    logger.info(f"Display Name: {result['display_name']}")
    logger.info("---")
    
    # Add a delay to avoid overwhelming the API
    time.sleep(1)

# You can now use the geocoded_results list for further processing

INFO:__main__:Geocoded:  hessle
INFO:__main__:Latitude: 53.7231588, Longitude: -0.4349309
INFO:__main__:Display Name: Hessle, East Riding of Yorkshire, England, United Kingdom
INFO:__main__:---
INFO:__main__:Geocoded:  cottingham
INFO:__main__:Latitude: 53.7804805, Longitude: -0.4178553
INFO:__main__:Display Name: Cottingham, East Riding of Yorkshire, England, United Kingdom
INFO:__main__:---
INFO:__main__:Geocoded:  redruth
INFO:__main__:Latitude: 50.233989, Longitude: -5.2276468
INFO:__main__:Display Name: Redruth, Cornwall, England, United Kingdom
INFO:__main__:---
INFO:__main__:Geocoded:  llanfyllin
INFO:__main__:Latitude: 52.7669146, Longitude: -3.2717467
INFO:__main__:Display Name: Llanfyllin, Powys, Wales, SY22 5AU, United Kingdom
INFO:__main__:---
INFO:__main__:Geocoded:  grimsby
INFO:__main__:Latitude: 53.5671407, Longitude: -0.0788045
INFO:__main__:Display Name: Grimsby, North East Lincolnshire, England, DN31 1NR, United Kingdom
INFO:__main__:---
INFO:__main__:Geocoded:  port

In [76]:
geocoded_results = pd.DataFrame(geocoded_results)
geocoded_results.head()

Unnamed: 0,address,latitude,longitude,display_name
0,hessle,53.723159,-0.434931,"Hessle, East Riding of Yorkshire, England, Uni..."
1,cottingham,53.780481,-0.417855,"Cottingham, East Riding of Yorkshire, England,..."
2,redruth,50.233989,-5.227647,"Redruth, Cornwall, England, United Kingdom"
3,llanfyllin,52.766915,-3.271747,"Llanfyllin, Powys, Wales, SY22 5AU, United Kin..."
4,grimsby,53.567141,-0.078804,"Grimsby, North East Lincolnshire, England, DN3..."


In [85]:
geocoded_results.index = addresses_NA.index

addresses_NA["Latitude"] = geocoded_results["latitude"]
addresses_NA["Longitude"] = geocoded_results["longitude"]
addresses_NA["new_address"] = geocoded_results["display_name"]
addresses_NA.isna().value_counts()

County  District  Town/City  address  Latitude  Longitude  new_address
False   False     False      False    False     False      False          130
dtype: int64

In [91]:
display(town_city_coords)

Unnamed: 0,address,Latitude,Longitude
0,"devon, plymouth, plymouth",50.384416,-4.078946
1,"avon, bristol, bristol",51.449620,-2.557939
2,"humberside, the east yorkshire borough of beve...",53.723159,-0.434931
3,"west yorkshire, wakefield, wakefield",53.683046,-1.496800
4,"northamptonshire, corby, corby",52.488849,-0.688383
...,...,...,...
7558,"westmorland and furness, westmorland and furne...",54.644739,-2.834147
7559,"somerset, somerset, beaminster",50.809000,-2.739100
7560,"cumberland, cumberland, ravenglass",54.353916,-3.410166
7561,"west berkshire, west berkshire, chipping norton",51.941090,-1.545300


In [83]:
town_city_coords.loc[addresses_NA.index, "Latitude"] = addresses_NA["Latitude"]
town_city_coords.loc[addresses_NA.index, "Longitude"] = addresses_NA["Longitude"]
display(town_city_coords)

Unnamed: 0,address,Latitude,Longitude
0,"devon, plymouth, plymouth",50.384416,-4.078946
1,"avon, bristol, bristol",51.449620,-2.557939
2,"humberside, the east yorkshire borough of beve...",53.723159,-0.434931
3,"west yorkshire, wakefield, wakefield",53.683046,-1.496800
4,"northamptonshire, corby, corby",52.488849,-0.688383
...,...,...,...
7558,"westmorland and furness, westmorland and furne...",54.644739,-2.834147
7559,"somerset, somerset, beaminster",50.809000,-2.739100
7560,"cumberland, cumberland, ravenglass",54.353916,-3.410166
7561,"west berkshire, west berkshire, chipping norton",51.941090,-1.545300


In [102]:
df = df.drop_duplicates()
town_city_coords = pd.merge(left=town_city_coords, right=df, on="address", how="left")
town_city_coords

Unnamed: 0,address,Latitude,Longitude,Town/City,District,County
0,"devon, plymouth, plymouth",50.384416,-4.078946,plymouth,plymouth,devon
1,"avon, bristol, bristol",51.449620,-2.557939,bristol,bristol,avon
2,"humberside, the east yorkshire borough of beve...",53.723159,-0.434931,hessle,the east yorkshire borough of beverley,humberside
3,"west yorkshire, wakefield, wakefield",53.683046,-1.496800,wakefield,wakefield,west yorkshire
4,"northamptonshire, corby, corby",52.488849,-0.688383,corby,corby,northamptonshire
...,...,...,...,...,...,...
7558,"westmorland and furness, westmorland and furne...",54.644739,-2.834147,keswick,westmorland and furness,westmorland and furness
7559,"somerset, somerset, beaminster",50.809000,-2.739100,beaminster,somerset,somerset
7560,"cumberland, cumberland, ravenglass",54.353916,-3.410166,ravenglass,cumberland,cumberland
7561,"west berkshire, west berkshire, chipping norton",51.941090,-1.545300,chipping norton,west berkshire,west berkshire


In [103]:
with open('../output/addresses_NAs.pkl', 'wb') as f:
            pickle.dump(addresses_NA, f)

with open('../output/town_city_coords.pkl', 'wb') as f:
            pickle.dump(town_city_coords, f)

### Comprobar y corregir resultados que caen fuera de Reino Unido

Sabemos que el meridiano de Greenwich pasa por Reino Unido, por lo que la longitud debe estar entorno a cero.

In [58]:
# Load all coordinates
with open("../output/town_city_coords.pkl", "rb") as f:
    town_city_coords = pickle.load(f)

# UK boundaries
import geopandas as gpd
name_temp1 = "NAME_1"
temp1 = gpd.read_file('../_00_Informacion/UK shapefile/gadm41_GBR_3.shp')
temp1 = temp1.set_crs(4326)
temp1 = temp1.loc[:, [name_temp1, "geometry"]]
temp1 = temp1.loc[temp1.loc[:, name_temp1].isin(["England", "Wales"]), :]

minLong, minLat, maxLong, maxLat = temp1.total_bounds
print(f"Rango de valores en Longitud: ({round(minLong, 2)}º, {round(maxLong, 2)}º)\n"
      + f"Rango de valores en Latitud: ({round(minLat, 2)}º, {round(maxLat, 2)}º)")

Rango de valores en Longitud: (-6.42º, 1.76º)
Rango de valores en Latitud: (49.87º, 55.81º)


In [41]:
mask = ((town_city_coords["Latitude"]<minLat) | 
        (town_city_coords["Latitude"]>maxLat) | 
        (town_city_coords["Longitude"]<minLong) | 
        (town_city_coords["Longitude"]>maxLong))

out_of_bound = town_city_coords.loc[mask, :].copy()
out_of_bound.shape

(441, 6)

In [43]:
out_of_bound["address"] = out_of_bound["Town/City"] + ", UK"
out_of_bound.head()

Unnamed: 0,address,Latitude,Longitude,Town/City,District,County
61,"southampton, UK",40.884267,-72.38953,southampton,southampton,southampton
77,"blackwood, UK",39.80234,-75.06406,blackwood,islwyn,gwent
99,"chester, UK",37.37236,-77.448623,chester,chester,cheshire
113,"durham, UK",35.996653,-78.901805,durham,durham,durham
125,"leominster, UK",42.52509,-71.75979,leominster,leominster,hereford and worcester


Usamos la API de Geocode para extraer las nuevas posiciones geográficas:

In [44]:
API_KEY = '275c02ba5e37400eb5ba975673e0acd5'
geocoder = OpenCageGeocode(API_KEY)

# Function to get latitude and longitude
def get_lat_long(adress):
    try:
        result = geocoder.geocode(adress)
        time.sleep(1)  # Add a delay of 1 second to respect usage limits
        if result and len(result):
            return result[0]['geometry']['lat'], result[0]['geometry']['lng']
        else:
            return None, None
    except Exception as e:
        print(f"Error: {e}")
        return None, None

# Perform requests
new_coordinates = out_of_bound.copy()

new_coordinates[['Latitude', 'Longitude']] = new_coordinates['address'].apply(
    lambda x: pd.Series(get_lat_long(x))
    )

new_coordinates.head()

Unnamed: 0,address,Latitude,Longitude,Town/City,District,County
61,"southampton, UK",50.902535,-1.404189,southampton,southampton,southampton
77,"blackwood, UK",51.666319,-3.197433,blackwood,islwyn,gwent
99,"chester, UK",53.190887,-2.890896,chester,chester,cheshire
113,"durham, UK",54.666667,-1.75,durham,durham,durham
125,"leominster, UK",52.227356,-2.737533,leominster,leominster,hereford and worcester


In [51]:
# Check if values are changed
((out_of_bound[["Latitude", "Longitude"]] - new_coordinates[["Latitude", "Longitude"]]) == 0).sum()

Latitude     0
Longitude    0
dtype: int64

In [59]:
# Replace values in town_city_coords
town_city_coords.loc[new_coordinates.index, ["address", "Latitude", "Longitude"]] = new_coordinates.loc[:, ["address", "Latitude", "Longitude"]]
town_city_coords.head()

Unnamed: 0,address,Latitude,Longitude,Town/City,District,County,adress
0,"devon, plymouth, plymouth",50.384416,-4.078946,plymouth,plymouth,devon,
1,"avon, bristol, bristol",51.44962,-2.557939,bristol,bristol,avon,
2,"humberside, the east yorkshire borough of beve...",53.723159,-0.434931,hessle,the east yorkshire borough of beverley,humberside,
3,"west yorkshire, wakefield, wakefield",53.683046,-1.4968,wakefield,wakefield,west yorkshire,
4,"northamptonshire, corby, corby",52.488849,-0.688383,corby,corby,northamptonshire,


In [60]:
# Save results
with open('../output/town_city_coords.pkl', 'wb') as f:
            pickle.dump(town_city_coords, f)