# Notebook 1 - Exploración de fuentes de datos - v1

In [3]:
# Environment note: `pip install owid-catalog` installed the package, but
# JupyterLab still could not find the `owid` module.
# NOTE(review): presumably pip was targeting a different interpreter than the
# one backing this kernel — the snippet below installs through sys.executable.
# IPython's `%pip install owid-catalog` magic is the usual shortcut; confirm.

# The fix is kept below as an inert string literal, so none of it executes:
# it prints the kernel's interpreter path, pip-installs owid-catalog with that
# interpreter, then checks `pip show` / `pip list` for the package.
''' solucionado con este código
import sys
import subprocess

# verifica dónde está instalado Python
print("Python que usa JupyterLab:", sys.executable)

# instala owid-catalog en este Python específico
try:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'owid-catalog'])
    print("✅ Instalación completada")
except Exception as e:
    print("❌ Error en la instalación:", e)

# comprueba si owid-catalog está instalado
result = subprocess.run([sys.executable, '-m', 'pip', 'show', 'owid-catalog'], 
                       capture_output=True, text=True)
print("Paquete instalado:", "owid-catalog" in result.stdout)

# lista paquetes instalados que contengan 'owid'
result = subprocess.run([sys.executable, '-m', 'pip', 'list'], 
                       capture_output=True, text=True)
owid_packages = [line for line in result.stdout.split('\n') if 'owid' in line.lower()]
print("Paquetes owid encontrados:", owid_packages)
'''
# Prints one empty line; it also keeps the string literal above from being the
# cell's last expression (presumably so it is not echoed as the cell output).
print()




In [2]:
import owid.catalog as catalog

## 1. Investigar y evaluar APIs disponibles de datos COVID

### 1.1. Our World in Data (OWID)

https://github.com/owid/etl

https://docs.owid.io/projects/etl/api/covid/

Recopilación de datos de COVID-19 desde el inicio de la pandemia hasta la actualidad (mayo de 2025). Consolida indicadores de varios conjuntos de datos en un único archivo e incluye metadatos que explican en detalle todos los indicadores.

### 1.2. Otros

## 2. Our World in Data (OWID)

In [4]:
# Create the remote catalog instance that the rest of the notebook queries.
cat = catalog.RemoteCatalog()

In [29]:
# Fetch every entry of the catalog, then report the container type and size.
datasets = cat.find()
print(
    f"Tipo de datasets: {type(datasets)}",
    f"Total de datasets: {len(datasets)}",
    sep="\n",
)

Tipo de datasets: <class 'owid.catalog.catalogs.CatalogFrame'>
Total de datasets: 17533


In [30]:
# Sanity check: report whether the catalog frame holds any rows at all.
print(
    "❌ El dataframe no contiene datos"
    if datasets.empty
    else "✅ El dataframe sí contiene datos"
)

✅ El dataframe sí contiene datos


In [31]:
# List the column labels of the catalog frame.
print("Columnas:", datasets.columns.tolist())

Columnas: ['table', 'dataset', 'version', 'namespace', 'channel', 'is_public', 'dimensions', 'path', 'formats']


In [38]:
# Show the first 5 entries of the catalog.
datasets.head()

Unnamed: 0,table,dataset,version,namespace,channel,is_public,dimensions,path,formats
0,_10_1_1_si_hei_totl,un_sdg,2023-01-24,un,garden,True,"[country, year, goal, target, indicator, serie...",garden/un/2023-01-24/un_sdg/_10_1_1_si_hei_totl,"[feather, parquet]"
1,_10_1_1_si_hei_totl,un_sdg,2023-08-16,un,garden,True,"[country, year, goal, target, indicator, serie...",garden/un/2023-08-16/un_sdg/_10_1_1_si_hei_totl,"[feather, parquet]"
2,_10_1_1_si_hei_totl,un_sdg,2024-08-27,un,garden,True,"[country, year, goal, target, indicator, serie...",garden/un/2024-08-27/un_sdg/_10_1_1_si_hei_totl,"[feather, parquet]"
3,_10_2_1_si_pov_50mi,un_sdg,2023-01-24,un,garden,True,"[country, year, goal, target, indicator, serie...",garden/un/2023-01-24/un_sdg/_10_2_1_si_pov_50mi,"[feather, parquet]"
4,_10_2_1_si_pov_50mi,un_sdg,2023-08-16,un,garden,True,"[country, year, goal, target, indicator, serie...",garden/un/2023-08-16/un_sdg/_10_2_1_si_pov_50mi,"[feather, parquet]"


In [37]:
# Summarise the catalog frame: index range, per-column non-null counts,
# dtypes and memory usage.
datasets.info()

<class 'owid.catalog.catalogs.CatalogFrame'>
RangeIndex: 17533 entries, 0 to 17532
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   table       17533 non-null  object
 1   dataset     17533 non-null  object
 2   version     17533 non-null  object
 3   namespace   17533 non-null  object
 4   channel     17533 non-null  object
 5   is_public   17533 non-null  bool  
 6   dimensions  17533 non-null  object
 7   path        17533 non-null  object
 8   formats     17533 non-null  object
dtypes: bool(1), object(8)
memory usage: 1.1+ MB


In [39]:
# Count how many catalog entries belong to each channel.
datasets["channel"].value_counts()

channel
garden    17533
Name: count, dtype: int64

In [68]:
def search_catalog(frame, keyword, regex=False):
    """Return the rows of ``frame`` whose text columns mention ``keyword``.

    Every object-dtype column is searched case-insensitively and a row is kept
    when at least one of those columns contains the keyword.

    Parameters
    ----------
    frame : pandas.DataFrame (or subclass)
        Frame to search.
    keyword : str
        Text to look for.
    regex : bool, default False
        When False the keyword is matched literally, so characters such as
        ``(`` or ``.`` in a search term are not interpreted as a pattern.
        Pass True to search with a regular expression.

    Returns
    -------
    Same type as ``frame``
        The matching rows; an empty selection when nothing matches or when the
        frame has no object-dtype columns.
    """
    text_cols = frame.select_dtypes(include=["object"]).columns

    mask = None
    for col in text_cols:
        # na=False: cells for which the string test yields a missing result
        # count as "no match" instead of propagating NaN into the mask.
        hits = frame[col].str.contains(keyword, case=False, na=False, regex=regex)
        mask = hits if mask is None else mask | hits

    if mask is None:
        # No text columns at all: indexing with a scalar False would raise a
        # KeyError, so return an empty slice of the frame instead.
        return frame.iloc[0:0]
    return frame[mask]


# Keyword search across all text (object-dtype) columns of the catalog.
filtro = 'population'
filtered_datasets = search_catalog(datasets, filtro)
print(f"Datasets con '{filtro}': {len(filtered_datasets)}")
if not filtered_datasets.empty:
    display(filtered_datasets.head())

Datasets con 'population': 203


Unnamed: 0,table,dataset,version,namespace,channel,is_public,dimensions,path,formats
562,_15_19_years_old_heavy_episodic_drinkers__popu...,gho,2024-01-03,who,garden,True,"[year, country, sex]",garden/who/2024-01-03/gho/_15_19_years_old_hea...,"[feather, parquet]"
2377,adolescent_mortality_rate__per_100_000_population,gho,2024-01-03,who,garden,True,"[year, country]",garden/who/2024-01-03/gho/adolescent_mortality...,"[feather, parquet]"
2381,adult_day_centre_places__per_10_000_population,gho,2024-01-03,who,garden,True,"[year, country]",garden/who/2024-01-03/gho/adult_day_centre_pla...,"[feather, parquet]"
2383,adult_mortality_rate__probability_of_dying_bet...,gho,2024-01-03,who,garden,True,"[year, country, sex]",garden/who/2024-01-03/gho/adult_mortality_rate...,"[feather, parquet]"
2483,age_standardized_dalys_attributable_to_the_env...,gho,2024-01-03,who,garden,True,"[year, country, cause]",garden/who/2024-01-03/gho/age_standardized_dal...,"[feather, parquet]"


In [64]:
# Check which type the filtered selection has.
type(filtered_datasets)

owid.catalog.catalogs.CatalogFrame

In [65]:
# Load the data behind the first entry of the filtered selection.
first_dataset = filtered_datasets.iloc[0].load()

In [66]:
# Display the loaded table.
first_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,_15_19_years_old_heavy_episodic_drinkers__population__pct_with_95pctci
year,country,sex,Unnamed: 3_level_1
2010,Afghanistan,both sexes,0.0
2010,Afghanistan,female,0.0
2010,Afghanistan,male,0.0
2010,Albania,both sexes,21.5
2010,Albania,female,13.3
...,...,...,...
2016,Zambia,female,2.4
2016,Zambia,male,15.0
2016,Zimbabwe,both sexes,5.3
2016,Zimbabwe,female,1.4


In [58]:
# Metadata attached to the loaded table, rendered as a plain dict.
metadata = first_dataset.metadata
display(metadata.to_dict())

{'short_name': '_15_19_years_old_heavy_episodic_drinkers__population__pct_with_95pctci',
 'title': '15-19 years old heavy episodic drinkers (population), % with 95%CI',
 'dataset': {'channel': 'garden',
  'namespace': 'who',
  'short_name': 'gho',
  'title': 'Global Health Observatory - World Health Organization',
  'description': "The GHO data repository is WHO's gateway to health-related statistics for its 194 Member States. It provides access to over 1000 indicators on priority health topics including mortality and burden of diseases, the Millennium Development Goals (child nutrition, child health, maternal and reproductive health, immunization, HIV/AIDS, tuberculosis, malaria, neglected diseases, water and sanitation), non communicable diseases and risk factors, epidemic-prone diseases, health systems, environmental health, violence and injuries, equity among others.",
  'is_public': True,
  'version': '2024-01-03',
  'update_period_days': 365,
  'non_redistributable': False,
  'so

In [61]:
# Print a few table-level and dataset-level metadata fields, one per line.
dataset_meta = metadata.dataset
metadata_fields = [
    ("Título general:", metadata.title),
    ("Channel:", dataset_meta.channel),
    ("Namespace:", dataset_meta.namespace),
    ("Título:", dataset_meta.title),
    ("Público:", dataset_meta.is_public),
    ("Descripción:", dataset_meta.description),
]
for label, value in metadata_fields:
    print(label, value)

Título: 15-19 years old heavy episodic drinkers (population), % with 95%CI
Channel: garden
Namespace: who
Título dataset: Global Health Observatory - World Health Organization
Público: True
Descripción: The GHO data repository is WHO's gateway to health-related statistics for its 194 Member States. It provides access to over 1000 indicators on priority health topics including mortality and burden of diseases, the Millennium Development Goals (child nutrition, child health, maternal and reproductive health, immunization, HIV/AIDS, tuberculosis, malaria, neglected diseases, water and sanitation), non communicable diseases and risk factors, epidemic-prone diseases, health systems, environmental health, violence and injuries, equity among others.


### 2.1. COVID-19

Vamos a ver qué datasets están relacionados con el COVID-19.

In [75]:
# Keyword search across all text (object-dtype) columns of the catalog.
filtro = 'covid'
text_cols = datasets.select_dtypes(include=['object']).columns

# Build the row mask column by column. It starts as None rather than a scalar
# False so that the "no text columns" case can be told apart below.
filter_mask = None
for col in text_cols:
    # regex=False matches the keyword literally (str.contains treats it as a
    # regular expression by default); na=False counts cells with a missing
    # result as "no match".
    col_hits = datasets[col].str.contains(filtro, case=False, na=False, regex=False)
    filter_mask = col_hits if filter_mask is None else filter_mask | col_hits

if filter_mask is None:
    # Nothing to search: indexing with a scalar False would raise a KeyError,
    # so fall back to an empty selection.
    filtered_datasets = datasets.iloc[0:0]
else:
    filtered_datasets = datasets[filter_mask]

print(f"Datasets con '{filtro}': {len(filtered_datasets)}")
if not filtered_datasets.empty:
    display(filtered_datasets.head())

Datasets con 'covid': 53


Unnamed: 0,table,dataset,version,namespace,channel,is_public,dimensions,path,formats
4740,cases_and_deaths_who,cases_and_deaths_who,latest,covid,garden,False,"[year, country]",garden/covid/latest/cases_and_deaths_who/cases...,[feather]
4741,cases_deaths,cases_deaths,latest,covid,garden,True,"[country, date]",garden/covid/latest/cases_deaths/cases_deaths,"[feather, csv]"
4845,chile,deaths_vax_status,latest,covid,garden,True,"[country, date]",garden/covid/latest/deaths_vax_status/chile,"[feather, parquet]"
5256,combined,combined,latest,covid,garden,True,"[country, date]",garden/covid/latest/combined/combined,"[feather, parquet]"
5280,compact,compact,latest,covid,garden,True,"[country, date]",garden/covid/latest/compact/compact,"[feather, csv]"


In [86]:
# Show the type and the column index of the filtered selection.
print(type(filtered_datasets), filtered_datasets.columns, sep="\n")

<class 'owid.catalog.catalogs.CatalogFrame'>
Index(['table', 'dataset', 'version', 'namespace', 'channel', 'is_public',
       'dimensions', 'path', 'formats'],
      dtype='object')


In [91]:
# Keep only the entries flagged as public and report how many remain.
is_public = filtered_datasets["is_public"]
public_datasets = filtered_datasets.loc[is_public]
print(f"Total de datasets públicos: {len(public_datasets)}")

Total de datasets públicos: 41


In [94]:
# Print the dataset title of every public entry in the selection.
# NOTE(review): each iteration calls load() on the row just to read one
# metadata field — presumably this fetches the full table every time; confirm
# whether the catalog offers a cheaper way to read titles.
for _, row in public_datasets.iterrows():
    # Load the entry as a Table object.
    table = row.load()
    print("Título:", table.metadata.dataset.title)

Título: COVID-19, confirmed cases and deaths
Título: COVID-19, deaths by vaccination status
Título: COVID-19, combined indicators
Título: Global Health Estimates
Título: GitHub stats on owid/covid-19-data repository
Título: COVID-19, Countries reporting data
Título: COVID-19, Government Response Tracker (OxCGRT)
Título: COVID-19, Countries reporting data
Título: COVID-19, Countries reporting data
Título: COVID-19, Countries reporting data
Título: COVID-19, Countries reporting data
Título: COVID-19, donations to COVAX
Título: None
Título: COVID-19, decoupling of indicators
Título: COVID-2019 - ECDC (2020)
Título: COVID-19, deaths by vaccination status
Título: COVID-19, Community Mobility Reports
Título: Google Mobility Trends (2020)
Título: COVID Government Response (OxBSG)
Título: COVID-19, hospitalisations
Título: COVID-2019 - Hospital & ICU
Título: Global Health Observatory - World Health Organization
Título: Global Health Observatory - World Health Organization
Título: COVID-19, inf

<b>Conclusión</b>

Vamos a trabajar con los datasets del COVID-19 de OWID.