In [1]:
import requests

# The REST API follows the CKAN standard (https://docs.ckan.org/en/2.8/api/)
# NOTE: the query string asks for start=0&rows=20, so only that first page of
# search results is requested here.
api_url = "https://datos.gob.cl/api/3/action/package_search?fq=groups:concurso-transparenta&start=0&rows=20"

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error status instead of trying to parse an error body as JSON.
http_response = requests.get(api_url, timeout=30)
http_response.raise_for_status()

# `response` is the decoded JSON payload (a dict), kept separate from the HTTP object.
response: dict = http_response.json()
results: list = response['result']['results']
# One list of resource dicts per search result.
resources = [result['resources'] for result in results]

In [2]:
# First, flatten the per-result resource lists into one flat list of resource dicts
from itertools import chain
from collections import Counter

resources_flatten = list(chain.from_iterable(resources))

# Every key of every resource dict, repeated once per resource that carries it.
# The loop variable is `resource` (not `dict`) so the builtin `dict` used below
# is never shadowed.
all_keys = [key for resource in resources_flatten for key in resource]

key_counts = Counter(all_keys)

# most_common() lists keys from most to least frequent; keys with equal counts
# keep their first-seen order, so the resulting dict is ordered by frequency.
key_counts_sorted = dict(key_counts.most_common())
display(key_counts_sorted)


{'mimetype': 53,
 'cache_url': 53,
 'hash': 53,
 'description': 53,
 'name': 53,
 'created': 53,
 'url': 53,
 'datastore_active': 53,
 'format': 53,
 'package_id': 53,
 'id': 53,
 'cache_last_updated': 53,
 'state': 53,
 'mimetype_inner': 53,
 'last_modified': 53,
 'position': 53,
 'revision_id': 53,
 'url_type': 53,
 'datastore_contains_all_records_of_source_file': 53,
 'resource_type': 53,
 'size': 53,
 'original_url': 21,
 'resource_id': 21,
 'set_url_type': 21,
 'ignore_hash': 21,
 'ckan_url': 21,
 'task_created': 21}

In [5]:
import pandas as pd


# Columns kept from each resource record, in the order they appear in df_api.
# In the key-count output recorded in this notebook these are the 21 keys
# reported for all 53 resources.
api_columns = [
    'cache_last_updated',
    'package_id',
    'datastore_contains_all_records_of_source_file',
    'datastore_active',
    'id',
    'size',
    'state',
    'hash',
    'description',
    'format',
    'mimetype_inner',
    'url_type',
    'mimetype',
    'cache_url',
    'name',
    'created',
    'url',
    'last_modified',
    'position',
    'revision_id',
    'resource_type',
]

# One row per flattened resource dict, restricted to the columns listed above.
df_api = pd.DataFrame.from_records(resources_flatten, columns=api_columns)

In [6]:
# Load the links CSV into df_scraped; the path is relative to the working directory.
scraped_links_csv = "transparenta2024links.csv"
df_scraped = pd.read_csv(scraped_links_csv)

In [7]:
# Show the (rows, columns) shape of the API frame, then of the scraped frame.
display(df_api.shape, df_scraped.shape)

(53, 21)

(45, 5)

In [10]:
# Expose the scraped 'file_url' values under the column name 'url', the same
# column name df_api uses, so both frames can be compared on 'url'.
scraped_file_urls = df_scraped['file_url']
df_scraped['url'] = scraped_file_urls

In [11]:
# Stack the two dataframes, tagging every row with the frame it came from
combined_df = pd.concat([df_api.assign(source='df_api'), df_scraped.assign(source='df_scraped')], ignore_index=True)

# Count, for each URL, how many of the two sources contain it.
# Collapsing repeated (url, source) pairs first means a URL listed twice inside
# a single source still counts as 1, so it is not mistaken for a URL that is
# present in both sources. combined_df itself keeps every row.
value_counts = combined_df.drop_duplicates(subset=['url', 'source'])['url'].value_counts()

In [23]:
# Display the per-URL counts held in value_counts
value_counts

url
https://datos.gob.cl/dataset/606ef5bb-11d1-475b-b69f-b980da5757f4/resource/ae6c9887-106d-4e98-8875-40bf2b836041/download/at_urg_respiratorio_semanal.parquet                                          2
https://datos.gob.cl/dataset/606ef5bb-11d1-475b-b69f-b980da5757f4/resource/27fbe340-0e74-465f-b9f9-488c3bf6533c/download/diccionario-de-datos-urgenciasrespiratoriasporsemana.xlsx                    2
https://datos.gob.cl/dataset/3bf4cf7c-f638-4735-9a01-f65faae4beca/resource/7127fc99-b858-4e74-8971-51413e484fb1/download/diccionario_base_establecimientos.xlsx                                       2
https://datos.gob.cl/dataset/8982a05a-91f7-422d-97bc-3eee08fde784/resource/8e5539b7-10b2-409b-ae5a-36dae4faf817/download/defunciones_covid19_2020_2024.csv                                            2
https://datos.gob.cl/dataset/8982a05a-91f7-422d-97bc-3eee08fde784/resource/80b3e81c-2f06-4919-a3ee-c4f94714c53a/download/diccionario-de-datos-bbdd-covid19.xlsx                                     

In [19]:
# URLs whose count is exactly 1, taken from the index of value_counts.
unique_values = value_counts.loc[value_counts.eq(1)].index

In [20]:
# Keep only the combined rows whose URL is one of the count-1 URLs.
has_unmatched_url = combined_df['url'].isin(unique_values)
unique_df = combined_df.loc[has_unmatched_url]

In [22]:
# Display the URLs whose count in value_counts is exactly 1
unique_values

Index(['https://datos.gob.cl/dataset/3bf4cf7c-f638-4735-9a01-f65faae4beca/resource/2c44d782-3365-44e3-aefb-2c8b8363a1bc/download/establecimientos_20240806.csv',
       'https://repositoriodeis.minsal.cl/SistemaAtencionesUrgencia/AtencionesUrgencia2024.zip',
       'https://repositoriodeis.minsal.cl/SistemaAtencionesUrgencia/AtencionesUrgencia2023.zip',
       'https://repositoriodeis.minsal.cl/DatosAbiertos/AtencionesDeUrgencia/AtencionesUrgencia2022.zip',
       'https://repositoriodeis.minsal.cl/DatosAbiertos/AtencionesDeUrgencia/AtencionesUrgencia2020.zip',
       'https://repositoriodeis.minsal.cl/DatosAbiertos/AtencionesDeUrgencia/AtencionesUrgencia2021.zip',
       'https://repositoriodeis.minsal.cl/DatosAbiertos/AtencionesDeUrgencia/AtencionesUrgencia2019.zip',
       'https://repositoriodeis.minsal.cl/DatosAbiertos/AtencionesDeUrgencia/AtencionesUrgencia2018.zip',
       'https://repositoriodeis.minsal.cl/DatosAbiertos/AtencionesDeUrgencia/AtencionesUrgencia2017.zip',
       'h

In [53]:
# Show the URL in the last row of unique_df.
unique_df['url'].iat[-1]

'https://datos.gob.cl/dataset/3bf4cf7c-f638-4735-9a01-f65faae4beca/resource/2c44d782-3365-44e3-aefb-2c8b8363a1bc/download/establecimientos_20240730.csv'