In [21]:
import pandas as pd
from sodapy import Socrata
import requests
import time

def get_municipios():
    """
    Function to get the municipios from the API

    Returns
    -------
    Municipios:
        pd.DataFrame: DataFrame with the municipios
    """
    client = Socrata("www.datos.gov.co", None)
    results = client.get("xdk5-pm3f", limit=2000)
    for result in results:
        if len(result["c_digo_dane_del_municipio"]) < 6:
            result["c_digo_dane_del_municipio"] = "0" + result["c_digo_dane_del_municipio"]
            if len(result["c_digo_dane_del_municipio"]) < 6:
                result["c_digo_dane_del_municipio"] = result["c_digo_dane_del_municipio"]+"0"
        result["c_digo_dane_del_municipio"] = result["c_digo_dane_del_municipio"].replace(".", "")
    results_df = pd.DataFrame.from_records(results)
    return results_df

def get_search_query(municipios: pd.DataFrame)->pd.DataFrame:
    """
    Function to get the search query from the municipios dataframe
    and makes a new dataframe with the search query as follows:
    `municipio, departamento, pais`

    Parameters
    ----------
    municipios : pd.DataFrame
        DataFrame with the municipios
            columns: ['municipio', 'departamento'], pais is 'Colombia'
    
    Returns
    -------
    search_query:
        pd.DataFrame: DataFrame with the search query
            columns: ['search_query']
    """
    search_query = pd.DataFrame()
    for index, row in municipios.iterrows():
        # - Dibula, La Guajira, Colombia -> Dibulla, La Guajira, Colombia
        # - Tolú Viejo, Sucre, Colombia -> Tolúviejo, Sucre, Colombia
        # - San Juan de Río Seco, Cundinamarca, Colombia -> San Juan de Ríoseco, Cundinamarca, Colombia
        # - San Luis de Gaceno, Casanare, Colombia -> San Luis de Gaceno, <a style='color:red'> Boyacá </a>, Colombia
        # > **Nota:** Se Obta por San Luis de Gaceno, Boyacá, Colombia, dado que San Luis de Gaceno, Casanare, Colombia no se encuentra, y San Luis de Gaceno, Boyacá, Colombia es el municipio más cercano.
        # - Villa de San Diego de Ubate, Cundinamarca, Colombia -> Ubaté, Provincia de Ubaté, Colombia
        # - El Cantón del San Pablo, Chocó, Colombia -> El Cantón de San Pablo, Chocó, Colombia
        # - Valle de Guamez, Putumayo, Colombia -> Valle Del Guamuez, Putumayo, Colombia
        # - San Pablo de Borbur, Bolívar, Colombia -> San Pablo de Borbur, Boyaca, Colombia o San Pablo, Bolívar, Colombia
        # > **Nota:** Se Obta por San Pablo, Bolívar, Colombia, dado que San Pablo de Borbur, Boyaca, Colombia ya se encuentra en la base de datos.
        # - San Andrés de Tumaco, Nariño, Colombia -> Tumaco, Nariño, Colombia

        if f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'Dibula, La Guajira, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'Dibulla, La Guajira, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'Tolú Viejo, Sucre, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'Tolúviejo, Sucre, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'San Juan de Río Seco, Cundinamarca, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'San Juan de Ríoseco, Cundinamarca, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'San Luis de Gaceno, Casanare, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'San Luis de Gaceno, Boyacá, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'Villa de San Diego de Ubate, Cundinamarca, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'Ubaté, Provincia de Ubaté, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'El Cantón del San Pablo, Chocó, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'El Cantón de San Pablo, Chocó, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'Valle de Guamez, Putumayo, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'Valle Del Guamuez, Putumayo, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'San Pablo de Borbur, Bolívar, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'San Pablo, Bolívar, Colombia'}, index=[0])])
        elif f'{row["municipio"]}, {row["departamento"]}, Colombia' == 'San Andrés de Tumaco, Nariño, Colombia':
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': 'Tumaco, Nariño, Colombia'}, index=[0])])
        else:
            search_query = pd.concat([search_query, pd.DataFrame({'search_query': f'{row["municipio"]}, {row["departamento"]}, Colombia'}, index=[0])])

    search_query.reset_index(drop=True, inplace=True)    
    return search_query

def get_location_info(search_query: pd.DataFrame)->pd.DataFrame:
    """
    Function that gets all the information from the api 
    https://nominatim.openstreetmap.org/search.php?q={search_query}&format=jsonv2
    and returns a dataframe with the information
    """
    
    location_info = pd.DataFrame()
    for index, row in search_query.iterrows():
        start_time = time.time()
        url = f'https://nominatim.openstreetmap.org/search.php?q={row["search_query"]}&format=jsonv2'
        response = requests.get(url)
        # get only the first result
        try:
            response_json = response.json()[0]
            print(f'Getting info from {index+1}/{len(search_query)} {row["search_query"]}')
            print(response_json)
        except:
            print(f'Error with {row["search_query"]}')
            continue
        # print the length of the response
        print(len(response_json))
        # add the info to the dataframe keep boundingbox as a list
        response_json['boundingbox'] = [response_json['boundingbox']]
        location_info = pd.concat([location_info, pd.DataFrame(response_json, index=[0])])
        # print the time
        print("--- %s seconds ---" % (time.time() - start_time))
    return location_info

def get_distance_and_time_matrix(origins_info: pd.DataFrame, destinations_info: pd.DataFrame)->pd.DataFrame:
    """
    Function to get the distance and time matrix from the coordinates
    uses the OSRM API to obtain the values
    
    Parameters
    --------
    origins_info : pd.DataFrame
        DataFrame with the origins info
            columns: ['name', 'lat', 'lon']

    destinations_info : pd.DataFrame
        DataFrame with the destinations info
            columns: ['name', 'lat', 'lon']
            
    Returns
    -------
    distance_matrix:
        pd.DataFrame: DataFrame with the distance matrix
        
    time_matrix:
        pd.DataFrame: DataFrame with the time matrix

    Examples
    --------

    >>> origins_info = pd.DataFrame({'name': ['Bogotá', 'Medellín', 'Cali'], 'lat': [4.60971, 6.25184, 3.43722], 'lon': [-74.08175, -75.56359, -76.5225]})
    >>> destinations_info = pd.DataFrame({'name': ['Barraquilla', 'Cartagena', 'Bucaramanga'], 'lat': [10.96389, 10.39972, 7.12539], 'lon': [-74.79639, -75.51444, -73.1198]})
    >>> distance_matrix, time_matrix = get_distance_and_time_matrix_v2(origins_info, destinations_info)
    >>> display(distance_matrix)

    |    | name        |   Barraquilla |   Cartagena |   Bucaramanga |
    |---:|:------------|--------------:|------------:|--------------:|
    |  0 | Bogotá      |       1145.79 |     1059.79 |        396.06 |
    |  1 | Medellín    |       1014.61 |      928.61 |        264.88 |
    |  2 | Cali        |       1045.02 |      959.02 |        295.29 |

    >>> display(time_matrix)

    |    | name        |   Barraquilla |   Cartagena |   Bucaramanga |
    |---:|:------------|--------------:|------------:|--------------:|
    |  0 | Bogotá      |        15.55  |       14.35 |          5.35 |
    |  1 | Medellín    |        13.8   |       12.6  |          4.5  |
    |  2 | Cali        |        14.45  |       13.25 |          5.15 |

    distances in meters
    times in seconds
    """
    url_base = 'http://router.project-osrm.org/table/v1/driving/'
    #/table/v1/{profile}/{coordinates}?{sources}=[{elem}...];&{destinations}=[{elem}...]&annotations={duration|distance|duration,distance}
    url_origins = ''
    for index, municipio in origins_info.iterrows():
        # get the coordinates
        lon = municipio['lon']
        lat = municipio['lat']
        coordinates = f'{lon},{lat};'
        url_origins += coordinates
    url_destinations = ''
    for index, municipio in destinations_info.iterrows():
        # get the coordinates
        lon = municipio['lon']
        lat = municipio['lat']
        coordinates = f'{lon},{lat};'
        url_destinations += coordinates
    # remove the last ;
    url_destinations = url_destinations[:-1]

    sources = ''
    for i in range(len(origins_info)):
        sources += f'{i};'
    # remove the last ;
    sources = sources[:-1]

    destinations = ''
    for i in range(len(destinations_info)):
        destinations += f'{i+len(origins_info)};'
    # remove the last ;
    destinations = destinations[:-1]

    # create the url
    url = url_base + url_origins + url_destinations +'?sources=' + sources + '&destinations=' + destinations + '&annotations=distance,duration'
    
    response = requests.get(url)
    # check if the response is ok
    if response.status_code != 200:
        print(f'Error: {response.status_code}')
        return None, None
    # get the distance and time matrix
    response_json = response.json()
    distance_matrix = pd.DataFrame(response_json['distances'], index=origins_info['name'], columns=destinations_info['name'])
    time_matrix = pd.DataFrame(response_json['durations'], index=origins_info['name'], columns=destinations_info['name'])
    return distance_matrix, time_matrix

def get_distance_and_time_matrix_full_size(origins_info: pd.DataFrame, destinations_info):
    """
    # TODO: #6 add docstring
    """
    #get_distance_and_time_matrix() only allows size 100x100
    #this function splits the data in chunks of 100x100 and then merges the results

    #get the number of chunks
    num_chunks = int(len(origins_info)/100)
    if len(origins_info)%100 != 0:
        num_chunks += 1

    #split the data in chunks
    origins_info_chunks = []
    destinations_info_chunks = []
    for i in range(num_chunks):
        origins_info_chunks.append(origins_info[i*100:(i+1)*100])
        destinations_info_chunks.append(destinations_info[i*100:(i+1)*100])

    #get the distance and time matrix for each chunk
    distance_matrix_chunks = []
    time_matrix_chunks = []
    for i in range(num_chunks):
        for j in range(num_chunks):
            print(f'Getting chunk {i*num_chunks+j+1}/{num_chunks*num_chunks}')
            distance_matrix, time_matrix = get_distance_and_time_matrix(origins_info_chunks[i], destinations_info_chunks[j])

            #append the results
            distance_matrix_chunks.append(distance_matrix)
            time_matrix_chunks.append(time_matrix)
    # save to csv
    for i in range(num_chunks):
        for j in range(num_chunks):
            distance_matrix_chunks[i*num_chunks+j].to_csv(f'matrix/distance_matrix/{i*num_chunks+j}.csv')
            time_matrix_chunks[i*num_chunks+j].to_csv(f'matrix/time_matrix/{i*num_chunks+j}.csv')
    # merge the chunks
    distance_matrix = pd.DataFrame()
    time_matrix = pd.DataFrame()
    # add horizontally and vertically the chunks
    for i in range(num_chunks):
        row_distance = pd.DataFrame()
        row_time = pd.DataFrame()
        for j in range(num_chunks):
            row_distance = pd.concat([row_distance, distance_matrix_chunks[i*num_chunks+j]], axis=1)
            row_time = pd.concat([row_time, time_matrix_chunks[i*num_chunks+j]], axis=1)
        distance_matrix = pd.concat([distance_matrix, row_distance], axis=0)
        time_matrix = pd.concat([time_matrix, row_time], axis=0)
    # force the diagonal to be 0
    for i in range(len(distance_matrix)):
        distance_matrix.iloc[i,i] = 0
        time_matrix.iloc[i,i] = 0
    # save to csv
    distance_matrix.to_csv('distance_matrix.csv')
    time_matrix.to_csv('time_matrix.csv')
    return distance_matrix, time_matrix

    display(distance_matrix)
            
municipios = pd.read_csv('municipios.csv')
distance_matrix, time_matrix = get_distance_and_time_matrix_full_size(municipios, municipios)

display(distance_matrix)
display(time_matrix)
# municipios = get_municipios()
# display(municipios)
# # add the search query to the municipios dataframe
# search_query = get_search_query(municipios)
# municipios = pd.concat([municipios, search_query], axis=1)
# # get the location info from the search query
# location_info = get_location_info(search_query)
# location_info.reset_index(drop=True, inplace=True)
# # save the data
# location_info.to_csv('location_info.csv', index=False)
# # check if municipios and location_info have the same length
# if len(municipios) != len(location_info):
#     print('Error: municipios and location_info have different length')
# # add the location info to the municipios dataframe
# municipios = pd.concat([municipios, location_info], axis=1)
# # save the data
# municipios.to_csv('municipios.csv', index=False)




Getting chunk 1/144
Getting chunk 2/144
Getting chunk 3/144
Getting chunk 4/144
Getting chunk 5/144
Getting chunk 6/144
Getting chunk 7/144
Getting chunk 8/144
Getting chunk 9/144
Getting chunk 10/144
Getting chunk 11/144
Getting chunk 12/144
Getting chunk 13/144
Getting chunk 14/144
Getting chunk 15/144
Getting chunk 16/144
Getting chunk 17/144
Getting chunk 18/144
Getting chunk 19/144
Getting chunk 20/144
Getting chunk 21/144
Getting chunk 22/144
Getting chunk 23/144
Getting chunk 24/144
Getting chunk 25/144
Getting chunk 26/144
Getting chunk 27/144
Getting chunk 28/144
Getting chunk 29/144
Getting chunk 30/144
Getting chunk 31/144
Getting chunk 32/144
Getting chunk 33/144
Getting chunk 34/144
Getting chunk 35/144
Getting chunk 36/144
Getting chunk 37/144
Getting chunk 38/144
Getting chunk 39/144
Getting chunk 40/144
Getting chunk 41/144
Getting chunk 42/144
Getting chunk 43/144
Getting chunk 44/144
Getting chunk 45/144
Getting chunk 46/144
Getting chunk 47/144
Getting chunk 48/144
G

  distance_matrix = pd.concat([distance_matrix, row_distance], axis=0)
  time_matrix = pd.concat([time_matrix, row_time], axis=0)
  distance_matrix = pd.concat([distance_matrix, row_distance], axis=0)
  time_matrix = pd.concat([time_matrix, row_time], axis=0)
  distance_matrix = pd.concat([distance_matrix, row_distance], axis=0)
  time_matrix = pd.concat([time_matrix, row_time], axis=0)
  distance_matrix = pd.concat([distance_matrix, row_distance], axis=0)
  time_matrix = pd.concat([time_matrix, row_time], axis=0)


name,Medellín,Abejorral,Abriaquí,Alejandría,Amagá,Amalfi,Andes,Angelópolis,Angostura,Anorí,...,Las Ánimas,Pueblo Viejo de Sentay,Villagarzón,Facatativá,Puerto Libertador,Marquetalia,Arboleda (Berruecos),Buenaventura,Ciénaga,Ponedera
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Medellín,0.0,86361.7,134450.8,90408.1,41218.2,174751.1,133763.2,43076.8,147665.7,182161.0,...,288491.4,819376.5,912175.1,370782.6,295806.8,304877.0,756217.3,487924.0,760406.5,653681.3
Abejorral,91214.1,0.0,224680.0,133735.3,107066.2,241147.0,148093.6,108924.7,214061.6,248556.9,...,302821.9,885772.5,862991.2,383936.3,362202.7,199829.1,707033.4,438740.1,826802.4,720077.3
Abriaquí,133242.4,219637.5,0.0,224310.2,174262.8,308653.2,220927.7,176121.3,248203.0,282698.3,...,375656.0,885371.1,1017618.1,504058.4,396344.1,410320.0,861660.2,593366.9,826401.0,719675.9
Alejandría,91666.1,125038.3,223780.5,0.0,125258.6,162501.2,217803.6,127117.2,113698.7,161591.3,...,372531.8,767784.1,905732.9,321352.5,318436.6,261665.5,840257.6,571964.4,772995.4,676311.1
Amagá,41981.6,107280.4,175447.6,126257.9,0.0,210600.9,93035.8,15574.9,183515.4,218010.7,...,247764.0,855226.2,873114.3,398337.6,331656.6,265816.2,717156.5,448863.1,796256.3,689531.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Marquetalia,304519.7,199967.2,410533.7,261718.1,265053.7,402924.5,302304.7,280399.5,413784.6,461677.3,...,457032.9,820134.4,708779.5,171285.6,641231.1,0.0,705335.8,437042.5,825345.6,934233.5
Arboleda (Berruecos),756455.4,707648.8,862469.4,840731.6,716989.4,925074.6,754240.4,732335.2,897989.1,932484.5,...,908968.6,1486354.8,247252.0,840299.1,1046130.4,706023.3,0.0,446975.3,1491566.0,1404005.0
Buenaventura,489729.1,440922.4,595743.0,574005.3,450263.0,658348.3,487514.0,465608.9,631262.8,665758.1,...,642242.3,1219628.4,570048.8,486838.5,779404.0,439296.9,448257.6,0.0,1224839.6,1137278.6
Ciénaga,761442.1,840409.3,825185.0,770792.5,795034.6,751725.7,887579.6,796893.2,662063.4,696558.7,...,1042307.8,67064.3,1467882.6,883502.2,538299.5,823815.2,1651806.8,1241740.4,0.0,109011.5


name,Medellín,Abejorral,Abriaquí,Alejandría,Amagá,Amalfi,Andes,Angelópolis,Angostura,Anorí,...,Las Ánimas,Pueblo Viejo de Sentay,Villagarzón,Facatativá,Puerto Libertador,Marquetalia,Arboleda (Berruecos),Buenaventura,Ciénaga,Ponedera
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Medellín,0.0,6332.6,7536.2,7244.6,2537.5,13895.3,9583.1,3519.0,8877.8,11908.9,...,18389.5,40137.5,46688.8,21167.8,21049.2,18490.2,46931.7,23783.1,35407.8,30210.9
Abejorral,6343.5,0.0,13639.7,9647.7,6938.1,18915.8,10856.8,7919.6,13898.3,16929.4,...,19663.2,45158.0,44992.1,20941.8,26069.7,15428.6,45235.0,22086.4,40428.3,35231.4
Abriaquí,7504.3,13629.2,0.0,14544.1,9829.5,21194.8,14045.4,10811.0,15923.9,18955.0,...,22851.8,43646.0,51798.5,28464.4,28095.3,23599.9,52041.4,28892.8,38916.3,33719.4
Alejandría,6421.7,9836.6,13666.9,0.0,8300.2,14020.3,15345.8,9281.7,9998.0,12044.8,...,24152.2,37822.0,48756.9,20335.7,23030.4,18425.5,52694.4,29545.8,34894.3,32192.1
Amagá,2379.3,6911.7,9675.5,8929.7,0.0,15580.4,7149.1,1508.0,10562.9,13594.0,...,15955.5,41822.6,44339.2,22283.3,22734.3,16140.6,44582.1,21433.5,37092.9,31896.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Marquetalia,18948.5,15643.3,24994.0,19153.1,16795.8,26697.0,19805.7,18239.2,23562.0,25608.8,...,28612.1,40535.2,37777.0,10264.8,37683.1,0.0,46571.1,23422.5,37607.5,43213.1
Arboleda (Berruecos),47988.7,46541.2,54034.2,54539.1,45836.0,61189.8,48845.9,47279.4,56172.3,59203.4,...,57652.3,81666.6,19453.7,45421.3,68343.7,48011.3,0.0,32948.8,78738.9,77505.4
Buenaventura,24863.9,23416.4,30909.4,31414.3,22711.2,38065.0,25721.1,24154.6,33047.5,36078.6,...,34527.5,58541.8,38218.0,26640.6,45218.9,24886.5,32900.7,0.0,55614.1,54380.6
Ciénaga,34856.8,40535.0,38834.8,34770.8,36735.3,42859.8,43780.9,37716.8,31758.9,34790.0,...,52587.3,6055.4,67053.4,38632.2,29315.6,36722.0,78589.4,57980.9,0.0,5634.9


In [None]:

# TODO: #3 Matriz de Distancias
# TODO: #4 Matriz de Tiempos
# TODO: #5 Merge Habitantes

# TODO: #6 Merge Demanda
# TODO: #7 Merge Origenes