In [11]:
# Request the municipalities from the www.datos.gov.co Socrata dataset "xdk5-pm3f",
# look each one up in Nominatim, and save the results in location_info.csv and municipios.csv
import pandas as pd
from sodapy import Socrata
import requests
import json
import os

def _normalize_dane_code(raw_code) -> str:
    """
    Turn a DANE municipality code that was serialised like a decimal number
    back into its 5-digit text form.

    The text before the dot is left-padded with zeros to 2 digits and the
    text after the dot is right-padded with zeros to 3 digits (a numeric
    round-trip drops exactly those zeros), for example:
    "5.04" -> "05040", "54.313" -> "54313", "5.001" -> "05001"

    A value without a dot is assumed to already be a plain code and is only
    left-padded to 5 characters.
    """
    text = str(raw_code).strip()
    prefix, dot, suffix = text.partition(".")
    if not dot:
        return text.zfill(5)
    return prefix.zfill(2) + suffix.ljust(3, "0")


def get_municipios(records=None) -> pd.DataFrame:
    """
    Function to get the municipios from the API

    Parameters
    ----------
    records : list of dict, optional
        Records already downloaded from the dataset. When omitted (the
        default) the records are fetched from the "xdk5-pm3f" dataset on
        www.datos.gov.co (at most 2000 rows). The given records are not
        modified.

    Returns
    -------
    Municipios:
        pd.DataFrame: DataFrame with the municipios, one row per record,
            with 'c_digo_dane_del_municipio' normalised to 5-digit text
    """
    code_key = "c_digo_dane_del_municipio"

    if records is None:
        client = Socrata("www.datos.gov.co", None)
        records = client.get("xdk5-pm3f", limit=2000)

    # Padding by total string length (the previous approach) breaks for
    # values such as "5.001", which became "50010"; each side of the dot has
    # to be padded on its own.
    normalized = [
        {**record, code_key: _normalize_dane_code(record[code_key])}
        if code_key in record
        else dict(record)
        for record in records
    ]
    return pd.DataFrame.from_records(normalized)

# Hand-maintained replacements for queries built from the dataset. The key is
# the query exactly as it comes out of the data, the value is the query that
# is used instead.
_SEARCH_QUERY_FIXES = {
    'Dibula, La Guajira, Colombia': 'Dibulla, La Guajira, Colombia',
    'Tolú Viejo, Sucre, Colombia': 'Tolúviejo, Sucre, Colombia',
    'San Juan de Río Seco, Cundinamarca, Colombia': 'San Juan de Ríoseco, Cundinamarca, Colombia',
    # San Luis de Gaceno, Casanare is not found; San Luis de Gaceno, Boyacá
    # was chosen because it is the nearest municipality.
    'San Luis de Gaceno, Casanare, Colombia': 'San Luis de Gaceno, Boyacá, Colombia',
    'Villa de San Diego de Ubate, Cundinamarca, Colombia': 'Ubaté, Provincia de Ubaté, Colombia',
    'El Cantón del San Pablo, Chocó, Colombia': 'El Cantón de San Pablo, Chocó, Colombia',
    'Valle de Guamez, Putumayo, Colombia': 'Valle Del Guamuez, Putumayo, Colombia',
    # Could be San Pablo de Borbur, Boyacá or San Pablo, Bolívar; San Pablo,
    # Bolívar was chosen because San Pablo de Borbur, Boyacá is already in
    # the database.
    'San Pablo de Borbur, Bolívar, Colombia': 'San Pablo, Bolívar, Colombia',
    'San Andrés de Tumaco, Nariño, Colombia': 'Tumaco, Nariño, Colombia',
}


def get_search_query(municipios: pd.DataFrame)->pd.DataFrame:
    """
    Function to get the search query from the municipios dataframe
    and makes a new dataframe with the search query as follows:
    `municipio, departamento, pais`

    Queries listed in `_SEARCH_QUERY_FIXES` are swapped for their corrected
    spelling; every other query is used as built.

    Parameters
    ----------
    municipios : pd.DataFrame
        DataFrame with the municipios
            columns: ['municipio', 'departamento'], pais is 'Colombia'

    Returns
    -------
    search_query:
        pd.DataFrame: DataFrame with the search query, one row per input
            row, in input order, indexed 0..n-1
            columns: ['search_query']
    """
    queries = []
    for _, row in municipios.iterrows():
        raw_query = f'{row["municipio"]}, {row["departamento"]}, Colombia'
        queries.append(_SEARCH_QUERY_FIXES.get(raw_query, raw_query))

    # Build the frame once instead of concatenating one row at a time, which
    # copied the whole frame on every iteration.
    return pd.DataFrame({'search_query': queries})

def get_location_info(search_query: pd.DataFrame)->pd.DataFrame:
    """
    Function that looks up every search query in the api
    https://nominatim.openstreetmap.org/search.php?q={search_query}&format=jsonv2
    and returns a dataframe with the first result of each query

    Parameters
    ----------
    search_query : pd.DataFrame
        DataFrame with the queries
            columns: ['search_query']

    Returns
    -------
    location_info:
        pd.DataFrame: one row per query that produced at least one result,
            in query order, with the fields of the first result as columns
            ('boundingbox' stays a list inside a single cell). Queries whose
            request fails or that return no result are reported with
            `print` and skipped, so the frame can be shorter than
            `search_query`.
    """
    # NOTE(review): the Nominatim usage policy asks for an identifying
    # User-Agent and at most one request per second - confirm whether this
    # script needs a custom header and a pause between requests.
    endpoint = 'https://nominatim.openstreetmap.org/search.php'
    total = len(search_query)
    first_results = []
    # Count positions with enumerate so the progress message does not depend
    # on the frame having a numeric index.
    for position, (_, row) in enumerate(search_query.iterrows(), start=1):
        query = row["search_query"]
        try:
            # `params` lets requests escape the query text, and the timeout
            # keeps a stalled connection from hanging the whole run.
            response = requests.get(
                endpoint,
                params={'q': query, 'format': 'jsonv2'},
                timeout=30,
            )
            response.raise_for_status()
            # get only the first result
            first_result = response.json()[0]
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Network/HTTP failure, non-JSON body, or an empty result list.
            print(f'Error with {query}')
            continue
        print(f'Getting info from {position}/{total} {query}')
        print(first_result)
        # print the length of the response
        print(len(first_result))
        first_results.append(first_result)
    # A list of dicts keeps each 'boundingbox' list as one cell, and building
    # the frame once avoids re-copying it for every query.
    return pd.DataFrame(first_results)


# Download the municipalities and attach the query built for each of them.
municipios = get_municipios()
search_query = get_search_query(municipios)
municipios = pd.concat([municipios, search_query], axis=1)

# Look up every query, renumber the result rows from zero and store them.
location_info = get_location_info(search_query).reset_index(drop=True)
location_info.to_csv('location_info.csv', index=False)

# get_location_info skips the queries that fail, so the two frames can differ
# in length; when they do, the column-wise concat below still pairs rows by
# their renumbered index and the location columns no longer line up with
# their municipality.
if not len(municipios) == len(location_info):
    print('Error: municipios and location_info have different length')

# Append the location columns to the municipalities and write the full table.
municipios = pd.concat([municipios, location_info], axis=1)
municipios.to_csv('municipios.csv', index=False)




Getting info from 1/10 Medellín, Antioquia, Colombia
{'place_id': 269992877, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 1343264, 'lat': '6.2697018', 'lon': '-75.60252574475943', 'category': 'boundary', 'type': 'administrative', 'place_rank': 12, 'importance': 0.5934777156602727, 'addresstype': 'city', 'name': 'Medellín', 'display_name': 'Medellín, Valle de Aburrá, Antioquia, RAP del Agua y la Montaña, 0500, Colombia', 'boundingbox': ['6.1626165', '6.3764208', '-75.7194224', '-75.4734083']}
14
Getting info from 2/10 Abejorral, Antioquia, Colombia
{'place_id': 269620863, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 1307016, 'lat': '5.80498375', 'lon': '-75.4304641628398', 'category': 'boundary', 'type': 'administrative', 'place_rank': 12, 'importance': 0.45000999999999997, 'addresstype': 'county', 'name': 'Abejorral', 'display_name': 'Abejorr