In [None]:
import urbanpy as up
import geopandas as gpd
import pandas as pd
import numpy as np
import requests
import time
import os
from tqdm.auto import tqdm
from unidecode import unidecode

In [None]:
df = pd.read_excel("outputs/encuesta_limpieza_en_proceso_22_02_23.xlsx", index_col=0)

In [None]:
df.shape

## Geocoding

Convertir direcciones a coordenadas (latitud, longitud)

#### Helper functions

In [None]:
def clean_address(address):
    """Cut an address at the first reference/intersection marker it contains.

    Markers are tried in a fixed order; each time one is found, everything
    from the marker onwards is dropped and the remainder is stripped. Matching
    is case-sensitive. Values that are not strings (e.g. NaN) are returned
    unchanged, and strings without any marker are returned as-is (not stripped).
    """
    if not isinstance(address, str):
        return address

    markers = (
        "Pto De Ref",
        "Entre",
        "Detras De",
        "Cruce Con",
        "A La Espalda De La  Angamos",
        "Con ",
        "/",
    )

    trimmed = address
    for marker in markers:
        head, found, _tail = trimmed.partition(marker)
        if found:
            # Keep only the text that precedes the first occurrence.
            trimmed = head.strip()

    return trimmed


def geocode(query, viewbox=None):
    """Look up a free-text address with the public Nominatim search API.

    Parameters
    ----------
    query : str
        Free-text address. Missing values (None/NaN) and blank strings are
        not sent to the service; an empty list is returned for them.
    viewbox : str, optional
        Preferred search area, sent as the ``viewbox`` request parameter.
        Omitted from the request when ``None``.

    Returns
    -------
    list
        The decoded JSON response (at most one match because ``limit=1``), or
        an empty list when the query is blank or the request fails.
    """
    import warnings

    # Skip missing or blank queries: they cannot match anything and would only
    # consume a slot of the one-request-per-second budget.
    if query is None or (not isinstance(query, str) and pd.isna(query)):
        return []
    query = str(query).strip()
    if not query:
        return []

    params = {
        "q": query,
        "limit": 1,
        "format": "json",
        # Contact address sent with each request; can be overridden through the
        # environment without editing the notebook.
        "email": os.environ.get("NOMINATIM_EMAIL", "claudio.rtega2701@gmail.com"),
    }
    if viewbox is not None:
        params["viewbox"] = viewbox

    headers = {"user-agent": "inmigrantes-vulnerables-app"}

    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params=params,
            headers=headers,
            timeout=30,  # never hang the whole batch on a single request
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # One failed lookup should not abort a batch of rate-limited requests;
        # report it and treat the address as "not found".
        warnings.warn(f"Nominatim request failed for {query!r}: {exc}")
        data = []
    finally:
        time.sleep(1)  # No heavy uses (an absolute maximum of 1 request per second).

    return data


def geocode_df(df, query_col, viewbox=None, return_gdf=False):
    """Geocode every row of ``df`` and collect the first Nominatim match.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to geocode; its index is used to align results.
    query_col : str
        Name of the column holding the address passed to ``geocode``.
    viewbox : str, optional
        Forwarded unchanged to ``geocode``; ``None`` sends no viewbox.
    return_gdf : bool, default False
        When True, return ``df`` with ``lat``/``lon`` columns merged in and a
        point geometry (EPSG:4326) built from them. When False, return only
        the table of matches.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        With ``return_gdf=False``: one row per geocoded index holding the
        fields of the first match. With ``return_gdf=True``: every input row,
        with missing ``lat``/``lon`` where no match was found.
    """
    first_hits = {}
    for index, query in tqdm(df[query_col].items(), total=df.shape[0]):
        data = geocode(query, viewbox)
        # Keep only the first match; anything that is not a non-empty list
        # (no match, or an unexpected payload) counts as "not geocoded".
        if isinstance(data, list) and len(data) > 0:
            first_hits[index] = data[0]

    geocodes_df = pd.DataFrame.from_dict(first_hits, orient="index")
    # Guarantee the coordinate columns exist even when nothing was geocoded,
    # so the merge below cannot fail with a KeyError.
    for coord in ("lat", "lon"):
        if coord not in geocodes_df.columns:
            geocodes_df[coord] = pd.Series(dtype="object")

    if not return_gdf:
        return geocodes_df

    new_df = df.merge(
        geocodes_df[["lat", "lon"]], how="left", left_index=True, right_index=True
    )
    # The merged lat/lon columns are left as returned by the service; they are
    # only converted to numbers here to build the point geometry.
    gdf = gpd.GeoDataFrame(
        new_df,
        crs="EPSG:4326",
        geometry=gpd.points_from_xy(
            pd.to_numeric(new_df["lon"], errors="coerce"),
            pd.to_numeric(new_df["lat"], errors="coerce"),
        ),
    )
    return gdf

### Subset donde tipo de vía es una de las opciones predeterminadas

In [None]:
# Columnas relacionadas a la ubicacion geografica
df.columns[4:12]

In [None]:
address_cols = [
    "Provincia donde vive",
    "Distrito donde vive",
    "Tipo de ubicacion: Manzana (Mz), Lote, Urbanizacion (urb) y/o Etapa.",
    "Tipo de vía",
    "Especifica otro tipo de via",
    "Nombre de la vía",
    "Número de cuadra",
    "Número de puerta de la vivienda",
]

new_address_cols = [
    "provincia_clean",
    "distrito_clean",
    "tipo_ubicacion_clean",
    "tipo_via_clean",
    "otro_tipo_via_clean",
    "nombre_via_clean",
    "num_cuadra_clean",
    "num_puerta_clean",
]

In [None]:
# Check missing values
df[address_cols].isna().sum()

In [None]:
# Check road name values
df["Nombre de la vía"].notna().sum()

In [None]:
# For road name values, check street/house number missing values
df[df["Nombre de la vía"].notna()][
    ["Número de cuadra", "Número de puerta de la vivienda"]
].isna().sum()

In [None]:
def clean_df_string(col):
    """Normalise a column to lower-case, ASCII, alphanumeric text.

    Python floats are truncated to integers first (so 12.0 becomes "12"),
    missing values become empty strings, accents are removed with
    ``unidecode``, every character other than a letter, digit or space is
    replaced by a space, and whitespace is collapsed and trimmed.
    """

    def _float_to_int(value):
        # Only plain Python floats that are not NaN are converted.
        if (type(value) == float) and pd.notna(value):
            return int(value)
        return value

    as_text = col.apply(_float_to_int).fillna("").astype(str)  # NaN -> empty string
    ascii_lower = as_text.str.lower().apply(unidecode)  # lower-case, drop accents
    alnum_only = ascii_lower.str.replace(
        r"[^a-zA-Z0-9 ]", " ", regex=True
    )  # anything that is not alphanumeric or a space becomes a space
    single_spaced = alnum_only.str.replace(
        r"[ ]+", " ", regex=True
    )  # collapse runs of spaces
    return single_spaced.str.strip()  # trim both ends

In [None]:
for col, new_col in zip(address_cols, new_address_cols):
    df[new_col] = clean_df_string(df[col])

In [None]:
df[new_address_cols]

In [None]:
new_address_cols

In [None]:
new_address_cols_ordered = [
    "num_puerta_clean",
    "num_cuadra_clean",
    "tipo_ubicacion_clean",
    "tipo_via_clean",
    "nombre_via_clean",
    "otro_tipo_via_clean",
]

adm_region_cols = ["distrito_clean", "provincia_clean"]

In [None]:
df[adm_region_cols]

In [None]:
# Build the street-level query by joining the cleaned address parts in the
# order house number, block number, location type, road type, road name,
# other road type.
df["nominatim_query"] = (
    df[new_address_cols_ordered]
    .apply(" ".join, axis=1)
    # Remove tokens made only of zeros ("0", "00"), which carry no address
    # information. The lookarounds match whole tokens only, so a zero at the
    # very start of the string is removed too and leading zeros inside real
    # values ("05", "0a") are left untouched.
    .str.replace(r"(?<!\S)0+(?!\S)", " ", regex=True)
    .str.replace(r"[ ]+", " ", regex=True)  # Reduce multiple spaces to one
    .str.strip()  # Remove spaces at the start and end of the strings
)

In [None]:
df["nominatim_query"].str.len().hist(), df["nominatim_query"].str.len().describe(),

In [None]:
# Keep only rows with a non-empty street query. The explicit copy makes this an
# independent frame, so columns can be added to it later without pandas'
# SettingWithCopyWarning (and without any doubt about whether `df` is modified).
df_not_empty_query = df[df["nominatim_query"].str.len() > 0].copy()

In [None]:
# Add administrative region information (district, province) to the query.
# Empty parts are skipped so a missing district or province does not leave
# ", ," artefacts in the text sent to Nominatim. `assign` rebinds the name to a
# new frame instead of writing into a slice of `df`.
df_not_empty_query = df_not_empty_query.assign(
    nominatim_query_full=(
        df_not_empty_query.loc[:, ["nominatim_query"] + adm_region_cols]
        .apply(lambda parts: ", ".join(part for part in parts if part), axis=1)
        .str.replace(r"[ ]+", " ", regex=True)  # Reduce multiple spaces to one
        .str.strip()  # Remove spaces at the start and end of the strings
    )
)

In [None]:
df_not_empty_query.loc[:, "nominatim_query_full"].tolist()

In [None]:
# Get Ica deparment administrative region polygon for query
ica = up.download.nominatim_osm("Ica, Peru")
ica.plot()

In [None]:
# total_bounds: Returns a tuple containing ``minx``, ``miny``, ``maxx``, ``maxy``

In [None]:
ica.total_bounds

In [None]:
# Nominatim documents the viewbox parameter as "<x1>,<y1>,<x2>,<y2>"; join the
# bounds (minx, miny, maxx, maxy) with bare commas, without embedded spaces.
viewbox = ",".join(ica.total_bounds.astype(str))

In [None]:
viewbox

In [None]:
output_fn = "outputs/geocodes_ALL.geojson"
# Cache guard: geocoding issues one rate-limited request per row, so it only
# runs when the cached GeoJSON is missing; otherwise the cached result is
# loaded from disk.
# NOTE(review): the cache is keyed on file existence only — delete the file to
# re-geocode after changing the queries. Presumably a frame read back from
# GeoJSON does not carry the original survey index — TODO confirm before
# relying on index alignment downstream.
if os.path.exists(output_fn):
    gdf = gpd.read_file(output_fn)
else:
    gdf = geocode_df(
        df_not_empty_query, "nominatim_query_full", viewbox, return_gdf=True
    )
    gdf.to_file(output_fn, driver="GeoJSON")

In [None]:
gdf[gdf.geometry.isna()]

In [None]:
gdf["lat"].isna().sum()

In [None]:
gdf.shape

In [None]:
gdf["nominatim_query_full"].values.tolist()

In [None]:
gdf[gdf["lat"].notna()].drop("geometry", axis=1).to_excel("outputs/datos_georef.xlsx")

In [None]:
import contextily as cx

In [None]:
gdf.crs

In [None]:
ax = gdf.plot()
cx.add_basemap(ax=ax, source=cx.providers.CartoDB.Positron, crs=gdf.crs)

In [None]:
gdf.geometry.notna().sum()

In [None]:
map_ = gdf.loc[gdf.lat.notna(), ["nominatim_query_full", "geometry"]].explore()

In [None]:
map_.save("encuestas_georeferenciadas.html")

In [None]:
df["tipo_ubicacion_clean"] = clean_df_string(
    df["Tipo de ubicacion: Manzana (Mz), Lote, Urbanizacion (urb) y/o Etapa."]
)

In [None]:
df["tipo_ubicacion_clean"].sort_values().unique().shape

In [None]:
df["Tipo de vía"].unique()

In [None]:
# Collapse the survey's road-type options to plain names; "Otro, especificar"
# and "No cuenta con información" become empty strings.
# NOTE(review): this overwrites the raw "Tipo de vía" column in place. Cells
# further down that filter on the raw labels (e.g. "AV (Avenida)") will not
# match any row once this cell has run.
df["Tipo de vía"] = df["Tipo de vía"].replace(
    {
        "AV (Avenida)": "Avenida",
        "CA (Calle)": "Calle",
        "AL (Alameda)": "Alameda",
        "PRLG (Prolongación)": "Prolongacion",
        "JR (Jirón)": "Jiron",
        "Otro, especificar": "",
        "No cuenta con información": "",
    }
)

In [None]:
df["Especifica otro tipo de via"].sort_values().unique().shape

In [None]:
df["Especifica otro tipo de via"].str.lower().fillna("").sort_values().unique().shape

In [None]:
df["Especifica otro tipo de via"].str.lower().fillna("").apply(
    unidecode
).sort_values().unique().shape

In [None]:
df[
    "Tipo de ubicacion: Manzana (Mz), Lote, Urbanizacion (urb) y/o Etapa."
].str.lower().fillna("").apply(unidecode).sort_values().unique().shape

In [None]:
df["Especifica otro tipo de via"].str.lower().fillna("").apply(
    unidecode
).sort_values().unique().tolist()

In [None]:
# Working notes for normalising the free-text "Especifica otro tipo de via"
# answers (lower-cased, accents removed). The original cell mixed `key: value`
# pairs and bare strings inside a single `{...}` literal, which is a
# SyntaxError; the same content is kept here as separate, valid containers.

# Canonical value -> spellings/abbreviations that should be mapped to it.
otro_tipo_via_synonyms = {
    "asentamiento humano": [
        "aa hh",
        "aahh",
        "acentamiento",
        "acentamiento humano",
        "ah",
        "asentamiento humano",
    ],
    "aa hh pueblo joven senor de luren 3era etapa d-01": [
        "aa hhpueblo joven seno de luren 3era etapa d-01"
    ],
    "urbanizacion": ["urbanzacion"],
}

# Listed on its own in the notes; its category was not decided.
otro_tipo_via_uncategorised = ["frente a prefectura"]

# Answers that are just a type of road/settlement (spelling kept as observed).
otro_tipo_via_values = [
    "acceso",
    "aldea",
    "asociacion",
    "carretera",
    "caserio",
    "centro poblado",
    "condominio",
    "cooperativa",
    "expasion",
    "fundo",
    "invasion",
    "lotizacion",
    "malecon",
    "ovalo",
    "panamericana",
    "pasaje",
    "prologacion",
    "pueblo joven",
    "sector",
    "upis",
    "urbanizacion",
]

# Answers that contain a specific address rather than a road type.
otro_tipo_via_specific_addresses = [
    "aa hh el oasis 2da etapa l 15",
    'aa hh pueblo joven senor de luren "i 13"',
    "aa hh pueblo joven senor de luren, 1era etapa e 22",
    "aa.hh los medanos manzana a, lote 13",
    "aahh el huarangal s/n tierra prometida, a la espalda del colegio el huarangal",
    "cetpro saraja b 9",
    "francisco sotelo",
    "fundo san pedro",
    "habana con madrid",
    "los angeles",
    "miguel ciani, cerca a la plaza miguel grau, s/n",
    "mz c",
    "pachacutec",
    "panamericana km 298",
    "pasaje acomayo mz b s/n",
    "san juan bautista mz c lt 6 ",
    "san juan de san martin 4ta cuadra s/n",
    "santa rosa",
    "sector b2",
    "siete, calle los florales s/n",
    "tupac amaru",
]

# Tokens to remove or replace inside the specific address strings:
# replacement -> tokens to be replaced (NaN meaning "remove").
address_token_replacements = {
    "asentamiento humano": ["aa hh", "aa.hh", "aahh"],
    np.nan: ["s/n"],
}

In [None]:
# Lower-cased values of "Especifica otro tipo de via" with accents kept;
# appears to be pasted from the output of an earlier `.unique()` cell.
# The original cell listed them with indented continuation lines and no
# enclosing bracket, which Python rejects; they are wrapped in a list here.
otro_tipo_via_raw_values = [
    'aa hh', 'aa hh el oasis 2da etapa l 15',
    'aa hh pueblo joven señor de luren "i 13"',
    'aa hh pueblo joven señor de luren, 1era etapa e 22',
    'aa hhpueblo joven seño de luren 3era etapa d-01',
    'aa.hh los médanos manzana a, lote 13', 'aahh',
    'aahh el huarangal s/n tierra prometida, a la espalda del colegio el huarangal',
    'acceso', 'acentamiento', 'acentamiento humano', 'ah', 'aldea',
    'asentamiento humano', 'asociacion', 'carretera', 'caserio',
    'caserío', 'centro poblado', 'cetpro saraja b 9', 'condominio',
    'cooperativa', 'expasion', 'francisco sotelo',
    'frente a prefectura', 'fundo', 'fundo san pedro',
    'habana con madrid', 'invasión', 'los ángeles', 'lotizacion',
    'malecon', 'miguel ciani, cerca a la plaza miguel grau, s/n',
    'mz c', 'pachacutec', 'panamericana', 'panamericana km 298',
    'pasaje', 'pasaje acomayo mz b s/n', 'prologacion', 'pueblo joven',
    'san juan bautista mz c lt 6 ',
    'san juan de san martin 4ta cuadra s/n', 'santa rosa', 'sector',
    'sector b2', 'siete, calle los florales s/n', 'túpac amaru',
    'upis', 'urbanizacion', 'urbanización', 'urbanzacion', 'óvalo',
]

In [None]:
# Scratch notes on the original-case "Especifica otro tipo de via" answers.
# Every statement in this cell is a bare literal expression: nothing is
# assigned, so running the cell changes no variable.

# Draft mapping: canonical label -> spellings observed in the answers.
{
    "Asentamiento Humano": [
        "AAHH",
        "Acentamiento",
        "AH",
        "AA HH",
        "Acentamiento humano",
        "Asentamiento humano",
        "ASENTAMIENTO HUMANO",
    ],
    # Fix typo (key and value are still identical here — correction pending)
    "AA HHPueblo Joven Seño de Luren 3era Etapa D-01": [
        "AA HHPueblo Joven Seño de Luren 3era Etapa D-01"
    ],
}

# Detail: answers that carry a specific address
"AAHH El Huarangal S/N Tierra Prometida, a la espalda del colegio el Huarangal",
"AA.HH Los Médanos Manzana A, Lote 13",
"AA HH Pueblo Joven Señor De Luren, 1era Etapa E 22",
'AA HH Pueblo Joven Señor de Luren "I 13"',
"AA HH El oasis 2da Etapa L 15",
"Cetpro Saraja B 9",

## Replace in string: (token found in the answer, replacement)
("AA HH", "Asentamiento Humano")
("AAHH", "Asentamiento Humano")
("AA.HH", "Asentamiento Humano")

# UPIS stands for "Urbanización Popular de Interés Social"


"UPIS", "Upis",
"PASAJE",
"Caserío", "caserío", "Caserio", "CASERIO",
"Urbanización", "URBANZACION", "URBANIZACION", "URBANIZACIÓN",
"Condominio",
"Centro Poblado", "Centro poblado",
"Panamericana", "PANAMERICANA",
"Sector",
"Lotizacion",
"Invasión",
"ASOCIACION",
"Pasaje",
"PROLOGACION",
"Fundo",
"Frente a prefectura",

"SECTOR B2",

"Aldea",

"Acceso"

"Mz C",
"Pueblo Joven",
"Carretera",
"FUNDO SAN PEDRO",
"COOPERATIVA",

"EXPASION",
"Óvalo",
"Malecon",

"San Juan de San Martin 4ta Cuadra S/N",
"Miguel Ciani, cerca a la plaza Miguel Grau, S/N",
"Siete, calle los Florales S/N",
"centro Poblado",
"Túpac Amaru",
"Santa Rosa",
"Francisco Sotelo",
"Los Ángeles",
"Panamericana Km 298",
"Habana con Madrid",
"Pachacutec",
"Pasaje acomayo Mz B S/N",
"San Juan Bautista Mz C Lt 6 ",

In [None]:
df["Nombre de la vía"].unique()

In [None]:
# FIXME: this cell held the unfinished expression `df[]`, which is a
# SyntaxError and aborts "Restart & Run All". The intended selection is
# unknown, so the cell previews the frame until it is filled in.
df.head()

In [None]:
df["Tipo de vía"].unique()

In [None]:
# Road types offered as predefined survey options: raw label -> plain name.
predefined_tipo_via = {
    "AV (Avenida)": "Avenida",
    "CA (Calle)": "Calle",
    "AL (Alameda)": "Alameda",
    "PRLG (Prolongación)": "Prolongacion",
    "JR (Jirón)": "Jiron",
}

# An earlier cell rewrites df["Tipo de vía"] in place from the raw labels to
# the plain names, so filtering on the raw labels alone selects nothing when
# the notebook is run top to bottom. Accept both spellings. The copy makes
# `subset` an independent frame that later cells can add columns to.
subset = df[
    df["Tipo de vía"].isin(
        [*predefined_tipo_via.keys(), *predefined_tipo_via.values()]
    )
].copy()

In [None]:
subset["tipo_via"] = subset["Tipo de vía"].replace(
    {
        "AV (Avenida)": "Avenida",
        "CA (Calle)": "Calle",
        "AL (Alameda)": "Alameda",
        "PRLG (Prolongación)": "Prolongacion",
        "JR (Jirón)": "Jiron",
    }
)

In [None]:
subset["tipo_via"]

Clean street name

In [None]:
# Strip road-type prefixes and block/lot ("Manzana"/"Lote") fragments from the
# title-cased street name, then drop dots and surrounding whitespace.
# The patterns are raw strings (so `\.` and `\s` are not invalid string
# escapes) and use [a-zA-Z]; the former [a-zA-z] range also matched the
# punctuation characters that sit between "Z" and "a" in ASCII.
clean = (
    subset.loc[:, "Nombre de la vía"]
    .str.title()
    .str.replace(
        r"((N(ú|u)mero|Calle|Jr|Jir(o|ó)n|Pasaje|Prolongaci(o|ó)n|Avenida|Aveida|Ave|Av|Urb)\.*\s+)",
        "",
        regex=True,
    )
    .str.replace(
        r"((Manzana|Mz|Manza)\s*[a-zA-Z]*[0-9]*|(Lote|Lt)\s*\.*[a-zA-Z]*[0-9]*),*",
        "",
        regex=True,
    )
    .str.replace(".", "", regex=False)
    .str.strip()
)

In [None]:
subset.loc[:, "nombre_via_clean"] = clean.apply(clean_address)

Clean number

In [None]:
subset["Número de puerta de la vivienda"].isna().sum()

In [None]:
subset["Número de cuadra"].isna().sum()

In [None]:
(
    subset[["Número de puerta de la vivienda", "Número de cuadra"]].isna().sum(axis=1)
    == 2
).sum()

In [None]:
# House number as text, falling back to the block number when it is missing.
# `to_numeric(errors="coerce")` turns non-numeric answers into missing values
# instead of making the integer cast raise; missing values end up as "".
subset["num_puerta_clean"] = (
    pd.to_numeric(
        subset["Número de puerta de la vivienda"].fillna(subset["Número de cuadra"]),
        errors="coerce",
    )
    .round()
    .astype("Int64")
    .astype(str)
    .replace("<NA>", "")
)

Build query string for Nominatim API

In [None]:
subset["nominatim_street"] = (
    subset["num_puerta_clean"]
    + " "
    + subset["tipo_via"]
    + " "
    + subset["nombre_via_clean"]
)

In [None]:
subset["nominatim_street"].head()

In [None]:
subset["nominatim_street"].isna().sum()

In [None]:
subset = subset.dropna(subset="nominatim_street")

In [None]:
subset.shape

In [None]:
subset["Provincia donde vive"].isna().sum()

In [None]:
subset["state_nominatim"] = subset["Provincia donde vive"]

In [None]:
subset["city_nominatim"] = subset["Distrito donde vive"]

In [None]:
subset[["nominatim_street", "city_nominatim", "state_nominatim"]]

In [None]:
output_fn = "outputs/geocodes_.geojson"
## Expensive query run only if file doesnt exist
if os.path.exists(output_fn):
    gdf = gpd.read_file(output_fn)
else:
    # `geocode_df` takes a single query column plus a viewbox; the previous
    # call passed street, city and state as separate positional arguments,
    # which raises a TypeError. Street, district and province are therefore
    # combined into one free-text query.
    # The sample is capped at the number of available rows and seeded so that
    # a re-run geocodes the same addresses.
    geocode_sample = subset.sample(n=min(60, len(subset)), random_state=0)
    sample_parts = geocode_sample[
        ["nominatim_street", "city_nominatim", "state_nominatim"]
    ].fillna("").astype(str)
    geocode_sample = geocode_sample.assign(
        nominatim_address_query=sample_parts["nominatim_street"].str.cat(
            [sample_parts["city_nominatim"], sample_parts["state_nominatim"]],
            sep=", ",
        )
    )
    gdf = geocode_df(
        geocode_sample,
        "nominatim_address_query",
        viewbox,
        return_gdf=True,
    )
    gdf.to_file(output_fn, driver="GeoJSON")

In [None]:
gdf.shape, subset.shape

In [None]:
gdf.geometry.duplicated().sum()

In [None]:
gdf[gdf.geometry.duplicated()][
    ["nominatim_street", "city_nominatim", "state_nominatim", "geometry"]
]

In [None]:
gdf.plot()

#### Geocode new observations

In [None]:
subset_wo_loc = subset[478:]

In [None]:
subset_wo_loc["nominatim_street"].shape

In [None]:
output_fn = "outputs/complement_geocodes.geojson"
## Expensive query run only if file doesnt exist
if os.path.exists(output_fn):
    gdf_complement = gpd.read_file(output_fn)
else:
    # `geocode_df` takes a single query column plus a viewbox; the previous
    # call passed street, city and state as separate positional arguments,
    # which raises a TypeError. Combine them into one free-text query.
    complement_parts = subset_wo_loc[
        ["nominatim_street", "city_nominatim", "state_nominatim"]
    ].fillna("").astype(str)
    complement_queries = subset_wo_loc.assign(
        nominatim_address_query=complement_parts["nominatim_street"].str.cat(
            [complement_parts["city_nominatim"], complement_parts["state_nominatim"]],
            sep=", ",
        )
    )
    gdf_complement = geocode_df(
        complement_queries,
        "nominatim_address_query",
        viewbox,
        return_gdf=True,
    )
    gdf_complement.to_file(output_fn, driver="GeoJSON")

In [None]:
gdf_complement.shape, gdf.shape

In [None]:
gdf_ = subset.copy()

In [None]:
gdf_.shape

In [None]:
# Stack the geometries of the first batch and the complement batch.
# `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
combined_geometry = pd.concat([gdf.geometry, gdf_complement.geometry])
# The values are assigned positionally, so both batches together must cover
# exactly the rows of `subset`, in the same order.
assert len(combined_geometry) == len(gdf_), (
    f"{len(combined_geometry)} geometries for {len(gdf_)} rows: "
    "the cached batches do not match the current subset"
)
gdf_["geometry"] = combined_geometry.values

In [None]:
gdf = gpd.GeoDataFrame(gdf_, crs="EPSG:4326")

In [None]:
gdf.head()

In [None]:
gdf.plot()

In [None]:
print("Total observations:", gdf.shape[0])
print("Geocoded observations:", gdf.shape[0] - gdf.geometry.x.isna().sum())

In [None]:
empty_geoms = gdf[gdf.geometry.x.isna()]

In [None]:
output_fn = "outputs/new_geoms.geojson"
## Expensive query run only if file doesnt exist
if os.path.exists(output_fn):
    new_geoms = gpd.read_file(output_fn)
else:
    # `geocode_df` expects the viewbox as its third argument; the call used to
    # omit it. Retry the rows without geometry with the street-only query.
    new_geoms = geocode_df(empty_geoms, "nominatim_street", viewbox, return_gdf=True)
    new_geoms.to_file(output_fn, driver="GeoJSON")

In [None]:
print("Observations w empty geom before:", empty_geoms.shape[0])
print("Geocoded observations:", new_geoms.shape[0] - new_geoms.geometry.x.isna().sum())

In [None]:
gdf.geometry.x.isna().sum()

In [None]:
# Fill missing geoms
gdf.loc[gdf[gdf.geometry.x.isna()].index, "geometry"] = new_geoms.geometry.values

In [None]:
gdf.geometry.x.isna().sum()

In [None]:
print("Total observations:", gdf.shape[0])
print("Geocoded observations:", gdf.shape[0] - gdf.geometry.x.isna().sum())

### Subset donde tipo de via NO es una de las opciones predeterminadas

In [None]:
other_subset = df.query("distrito =='Surquillo'").query(
    "tipo_via not in ['Calle', 'Avenida', 'Jirón', 'Pasaje', 'Callejón', 'Prolongación', 'Urbanización']"
)

In [None]:
other_subset.shape

In [None]:
other_subset.head()

Clean street name

In [None]:
# Strip "Número" prefixes and block/lot ("Manzana"/"Lote") fragments from the
# title-cased street name, then drop dots and surrounding whitespace.
# The patterns are raw strings (so `\.` and `\s` are not invalid string
# escapes) and use [a-zA-Z]; the former [a-zA-z] range also matched the
# punctuation characters that sit between "Z" and "a" in ASCII.
other_clean = (
    other_subset["nombre_via"]
    .str.title()
    .str.replace(r"((N(ú|u)mero)\s+)", "", regex=True)
    .str.replace(
        r"((Manzana|Mz|Manza)\s*[a-zA-Z]*[0-9]*|(Lote|Lt)\s*\.*[a-zA-Z]*[0-9]*),*",
        "",
        regex=True,
    )
    .str.replace(".", "", regex=False)
    .str.strip()
)

In [None]:
other_clean = other_clean.apply(clean_address)

In [None]:
other_subset["nombre_via_clean"] = other_clean

Clean number

In [None]:
# House number as text, falling back to the block number of the SAME frame.
# The fallback used to read from `subset`, a different selection of rows, so
# the fill could only align on rows the two frames happen to share.
other_subset["num_puerta_clean"] = (
    other_subset["num_puerta"]
    .fillna(other_subset["num_cuadra"])
    .astype("Int64")
    .astype(str)
    .replace("<NA>", "")
)

In [None]:
other_subset["nominatim_street"] = (
    other_subset["num_puerta_clean"] + " " + other_subset["nombre_via_clean"]
)

In [None]:
other_subset["nominatim_street"].shape, other_subset["nominatim_street"].isna().sum()

In [None]:
other_subset = other_subset.dropna(subset=["nominatim_street"])

In [None]:
other_subset["nominatim_street"].head()

In [None]:
other_subset["nominatim_street"].tail()

In [None]:
output_fn = "outputs/other_geocodes.geojson"
## Expensive query run only if file doesnt exist
if os.path.exists(output_fn):
    other_geoms = gpd.read_file(output_fn)
else:
    # `geocode_df` expects the viewbox as its third argument; the call used to
    # omit it. None means "no preferred area": the only viewbox built in this
    # notebook covers Ica, while these rows are filtered to Surquillo.
    other_geoms = geocode_df(other_subset, "nominatim_street", None, return_gdf=True)
    other_geoms.to_file(output_fn, driver="GeoJSON")

In [None]:
print("Number of observations before geocoding:", other_subset.shape[0])
print(
    "Geocoded observations:", other_geoms.shape[0] - other_geoms.geometry.x.isna().sum()
)

## Save geocoded values

In [None]:
final_gdf = pd.concat(
    (
        gdf.loc[gdf[~gdf.geometry.x.isna()].index],
        other_geoms.loc[other_geoms[~other_geoms.geometry.x.isna()].index],
    )
)

In [None]:
final_gdf.shape

In [None]:
final_gdf.geometry.is_empty.sum(), final_gdf.geometry.isna().sum(), final_gdf.geometry.x.isna().sum()

In [None]:
final_gdf.plot()

In [None]:
if 1 == 2:
    final_gdf.to_file("outputs/geocoded_gdf")

# Trying to geocode more observations

In [None]:
missing_geoms = df[~df.index.isin(final_gdf.index)]

In [None]:
missing_geoms.shape

Clean street name

In [None]:
missing_geoms["nombre_via"].isna().sum()

In [None]:
missing_geoms["nombre_urbanizacion"].isna().sum()

In [None]:
filled_nombre_via = (
    missing_geoms["nombre_via"].fillna(missing_geoms["nombre_urbanizacion"]).values
)
missing_geoms = missing_geoms.assign(filled_nombre_via=filled_nombre_via)

In [None]:
# Strip road-type prefixes and block/lot ("Manzana"/"Lote") fragments from the
# title-cased street (or urbanisation) name, drop dots and surrounding
# whitespace, then cut at reference markers with `clean_address`.
# The patterns are raw strings (so `\.` and `\s` are not invalid string
# escapes) and use [a-zA-Z]; the former [a-zA-z] range also matched the
# punctuation characters that sit between "Z" and "a" in ASCII.
missing_geoms["nombre_via_clean"] = (
    missing_geoms["filled_nombre_via"]
    .str.title()
    .str.replace(
        r"((N(ú|u)mero|Calle|Jr|Jir(o|ó)n|Pasaje|Prolongaci(o|ó)n|Avenida|Aveida|Ave|Av|Urb)\.*\s+)",
        "",
        regex=True,
    )
    .str.replace(
        r"((Manzana|Mz|Manza)\s*[a-zA-Z]*[0-9]*|(Lote|Lt)\s*\.*[a-zA-Z]*[0-9]*),*",
        "",
        regex=True,
    )
    .str.replace(".", "", regex=False)
    .str.strip()
    .apply(clean_address)
)

Clean street number

In [None]:
# House number as text, falling back to the block number when it is missing.
# `to_numeric(errors="coerce")` turns non-numeric answers into missing values
# instead of making the integer cast raise; missing values end up as "".
missing_geoms["num_puerta_clean"] = (
    pd.to_numeric(
        missing_geoms["num_puerta"].fillna(missing_geoms["num_cuadra"]),
        errors="coerce",
    )
    .round()
    .astype("Int64")
    .astype(str)
    .replace("<NA>", "")
)

Build query string for Nominatim API

In [None]:
missing_geoms["nominatim_street"] = (
    missing_geoms["num_puerta_clean"] + " " + missing_geoms["nombre_via_clean"]
)

In [None]:
missing_geoms["nominatim_street"].head()

In [None]:
output_fn = "outputs/missing_geocodes.geojson"
## Expensive query run only if file doesnt exist
if os.path.exists(output_fn):
    missing_geocodes = gpd.read_file(output_fn)
else:
    # `geocode_df` expects the viewbox as its third argument; the call used to
    # omit it. None means "no preferred area": the only viewbox built in this
    # notebook covers Ica, while the results are later filtered to Surquillo.
    missing_geocodes = geocode_df(
        missing_geoms, "nominatim_street", None, return_gdf=True
    )
    missing_geocodes.to_file(output_fn, driver="GeoJSON")

In [None]:
print("Observations before:", missing_geoms.shape[0])
print(
    "Geocoded observations:",
    missing_geocodes.shape[0] - missing_geocodes.geometry.x.isna().sum(),
)

In [None]:
missing_surquillo = missing_geocodes.query("distrito == 'Surquillo'")

In [None]:
# Add the newly geocoded Surquillo rows to the final set.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
# A boolean mask selects the rows that have coordinates; unlike `.loc` with
# index labels it cannot duplicate rows when labels repeat.
complete_gdf = pd.concat(
    [final_gdf, missing_surquillo[missing_surquillo.geometry.x.notna()]]
)

In [None]:
complete_gdf.shape

In [None]:
complete_gdf.shape[0] - final_gdf.shape[0]

In [None]:
complete_gdf.geometry.is_empty.sum(), complete_gdf.geometry.isna().sum(), complete_gdf.geometry.x.isna().sum()

In [None]:
ax = complete_gdf.plot(color="r")
final_gdf.plot(ax=ax)

In [None]:
complete_gdf["consentimiento"].value_counts()

In [None]:
export_gdf = complete_gdf.query("consentimiento == 'Sí'")

In [None]:
# Points are built as (x=longitude, y=latitude), so latitude is `.y` and
# longitude is `.x`; the two were swapped before. `assign` returns a new frame
# instead of writing into the result of `.query(...)`.
export_gdf = export_gdf.assign(
    lat=export_gdf.geometry.y,
    lon=export_gdf.geometry.x,
)

In [None]:
if 1 == 2:
    export_gdf.to_file("outputs/complete_gdf.geojson", driver="GeoJSON")

### Conclusiones 

In [None]:
n_total = df.query("distrito == 'Surquillo'").query("consentimiento == 'Sí'").shape[0]

In [None]:
n_geocoded = (
    export_gdf.query("distrito == 'Surquillo'").query("consentimiento == 'Sí'").shape[0]
)

In [None]:
print(
    f"Se lograron georreferenciar {n_geocoded} ({n_geocoded/n_total*100:.2f}%) observaciones de {n_total}"
)

In [None]:
n_total = df.query("distrito == 'Surquillo'").shape[0]

In [None]:
n_geocoded = export_gdf.query("distrito == 'Surquillo'").shape[0]

In [None]:
print(
    f"Se lograron georreferenciar {n_geocoded} ({n_geocoded/n_total*100:.2f}%) observaciones de {n_total}"
)

## No se pudieron georreferenciar

In [None]:
df_last = df[~df.index.isin(export_gdf.index)]

In [None]:
df_last.shape

In [None]:
df_last.nombre_via.unique().shape

In [None]:
pd.Series(df_last.nombre_via.unique()).to_excel("outputs/nombres_via_corregir.xlsx")

In [None]:
manual_addrs = pd.read_excel(
    "inputs/Nombres de vias a Corregir.Surquillo.xlsx", index_col=0
)
manual_addrs = manual_addrs.dropna(subset=["Dirección Corregida Norys"])

In [None]:
manual_addrs.head()

In [None]:
# Lookup table: original street name (column 0) -> manually corrected address.
rlpc = dict(manual_addrs[[0, "Dirección Corregida Norys"]].values.tolist())

In [None]:
df_last["nombre_via"] = df_last["nombre_via"].replace(rlpc)

In [None]:
output_fn = "outputs/last_geocodes.geojson"
## Expensive query run only if file doesnt exist
if os.path.exists(output_fn):
    last_geocodes = gpd.read_file(output_fn)
else:
    # `geocode_df` expects the viewbox as its third argument; the call used to
    # omit it. None means "no preferred area": the only viewbox built in this
    # notebook covers Ica, while the results are later filtered to Surquillo.
    last_geocodes = geocode_df(df_last, "nombre_via", None, return_gdf=True)
    last_geocodes.to_file(output_fn, driver="GeoJSON")

In [None]:
print("Observations before:", last_geocodes.shape[0])
print(
    "Geocoded observations:",
    last_geocodes.shape[0] - last_geocodes.geometry.x.isna().sum(),
)

In [None]:
import urbanpy as up

In [None]:
surquillo = up.download.nominatim_osm("Surquillo, Lima")

In [None]:
ax = surquillo.plot(figsize=(10, 10))
last_geocodes.plot(ax=ax, color="red")

In [None]:
xx = last_geocodes.query("distrito == 'Surquillo'").query("consentimiento == 'Sí'")

In [None]:
print("Observations before:", xx.shape[0])
print("Geocoded observations:", xx.shape[0] - xx.geometry.x.isna().sum())

In [None]:
export_gdf.shape

In [None]:
# Add the rows geocoded from the manually corrected addresses.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
export_gdf_wlast = pd.concat([export_gdf, xx[xx.geometry.x.notna()]])

In [None]:
export_gdf_wlast.shape

In [None]:
534 / 619

In [None]:
export_gdf_wlast.to_file("outputs/complete_gdf_wlast.geojson", driver="GeoJSON")