In [None]:
!pip install owid-catalog

In [3]:
from owid.catalog import RemoteCatalog
# Open the default remote OWID catalog (its index is fetched over HTTPS),
# restricted to the channels listed below.
owid_channels = ('garden', 'meadow', 'open_numbers')
catalog = RemoteCatalog(channels=owid_channels)

In [4]:
# Distinct namespaces present in the catalog index (namespaces = main sources)
namespace_column = catalog.datasets["namespace"]
namespace_column.unique()

array(['un', 'demography', 'faostat', 'regions',
       'aviation_safety_network', 'war', 'papers', 'ember', 'cait',
       'biodiversity', 'ihme_gbd', 'andrew', 'unep', 'owid', 'health',
       'dummy', 'energy', 'rff', 'eia', 'bp', 'who', 'excess_mortality',
       'shift', 'growth', 'gcp', 'smil', 'happiness', 'technology',
       'oecd', 'homicide', 'living_planet', 'agriculture', 'lis', 'ggdc',
       'malnutrition', 'wb', 'ophi', 'emissions', 'emdat',
       'met_office_hadley_centre', 'country_profile', 'nasa', 'hmd',
       'hyde', 'gapminder', 'postnatal_care', 'irena', 'uk_beis', 'bls',
       'usda_nass', 'democracy', 'worldbank_wdi', 'imf', 'wid', 'wvs',
       'open_numbers'], dtype=object)

In [24]:
# GET LATEST VERSIONS of datasets and build one metadata table for all of them
import pandas as pd
import numpy as np
from dataclasses import asdict

# Collapse the catalog index to one row per (namespace, dataset), keeping the
# greatest `version` value. The date-like version strings (e.g. '2023-04-20')
# sort chronologically, so max() picks the newest one.
datasets = (
    catalog.datasets
    .groupby(['namespace', 'dataset'])['version']
    .max()
    .reset_index()
)

# Filter for performance/tests
# datasets = datasets[datasets["dataset"].str.contains("energy|consumption")]

print(datasets)

# One dict of dataset-level metadata per dataset that could be fetched.
# Records are collected in a list and converted to a DataFrame once, instead of
# re-copying a growing DataFrame with pd.concat on every iteration.
metadata_records = []

for index, row in datasets.iterrows():
    print(f'Getting metadata: index={index} namespace={row.namespace}, dataset={row.dataset}')
    try:
        dataset = catalog.find_latest(namespace=row.namespace, dataset=row.dataset)
        metadata_records.append(asdict(dataset.metadata.dataset))
    except Exception as exc:
        # Best effort: a dataset that cannot be loaded is reported and skipped.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt stop the
        # loop, and the reason for the failure is printed instead of discarded.
        print(f'ERROR: namespace={row.namespace}, dataset={row.dataset} ({type(exc).__name__}: {exc})')

# Dataframe result including dataset metadata
df = pd.DataFrame(metadata_records)

# Explode sources: one row per (dataset, source) pair. reset_index() keeps the
# pre-explode row number in an `index` column so rows can be traced back.
df = df.explode("sources").reset_index()
# df = df.explode(["licenses"])

# Extract the source dictionary values into new DataFrame columns.
# NOTE(review): source keys such as `description` also exist at dataset level,
# so the result can contain duplicate column names - confirm this is acceptable
# for consumers of the CSV.
df = pd.concat([df.drop(columns='sources'), df['sources'].apply(pd.Series)], axis=1)
# df = pd.concat([df.drop(columns='licenses'), df['licenses'].apply(pd.Series)], axis=1)

df.to_csv('../../processed/owid_catalog.csv', index=False)
df


                   namespace                    dataset     version
0                agriculture     long_term_wheat_yields  2023-04-20
1                agriculture        uk_long_term_yields  2023-04-21
2                     andrew      co2_mitigation_curves  2019-12-03
3    aviation_safety_network        aviation_statistics  2023-04-18
4               biodiversity             cherry_blossom  2023-01-11
..                       ...                        ...         ...
243                      who                   vehicles  2023-03-13
244                      who            who_vaccination  2022-07-17
245                      wid  world_inequality_database  2023-01-27
246            worldbank_wdi                        wdi  2022-05-26
247                      wvs                  wvs_trust  2023-03-08

[248 rows x 3 columns]
ERROR: namespace=ihme_gbd, dataset=gbd_drug_disorders
ERROR: namespace=open_numbers, dataset=ihme__global_burden_disease_death_number
ERROR: namespace=open_numb

In [None]:
# Population
# Filter the latest-version index (`datasets`): it is the frame that has a
# `dataset` column. The metadata frame `df` is built from dataset metadata whose
# name field is `short_name`, so df["dataset"] raises KeyError on a fresh run.
df_pop = datasets[datasets["dataset"].str.contains("population")]
# Only the last expression of a cell is rendered, so show this one explicitly.
display(df_pop)

# Load the latest Gapminder population table itself
data_pop = catalog.find_latest(namespace='gapminder', dataset='population')
data_pop


In [None]:

# Greenhouse-gas / CO2 emissions filter
# Applied to the latest-version index (`datasets`), which has the `dataset`
# column; the metadata frame `df` exposes `short_name` instead, so
# df["dataset"] raises KeyError on a fresh run.
df_gh = datasets[datasets["dataset"].str.contains("gh|green|house|carbon|co2|emission")]
df_gh


In [18]:
# NRJ (energy) filter
# Applied to the latest-version index (`datasets`), which has the `dataset`
# column and yields the namespace/dataset/version listing; the metadata frame
# `df` exposes `short_name` instead, so df["dataset"] raises KeyError.
df_nrj = datasets[datasets["dataset"].str.contains("energy|consumption|final")]
df_nrj


Unnamed: 0,namespace,dataset,version
6,bp,energy_mix,2023-02-20
17,eia,energy_consumption,2022-07-27
26,energy,global_primary_energy,2023-02-20
27,energy,owid_energy,2023-02-20
29,energy,primary_energy_consumption,2023-02-20
98,irena,renewable_energy_patents,2022-10-25
107,open_numbers,bp__energy,
113,open_numbers,energy_flows_usa,
115,open_numbers,gapminder__bp_energy,
213,smil,global_primary_energy,2017-01-01


In [None]:
# Shift Data Portal table (author's note: data only goes UP TO 2016)
sdp = catalog.find_latest(namespace='shift')
print(sdp.metadata.dataset)
# Preview the first rows after ordering by year, newest first
sdp_newest_first = sdp.sort_values(by='year', ascending=False)
sdp_newest_first.head()


In [20]:
# DEFAULT LATEST EIA
# Fetch the latest EIA `energy_consumption` table, print its dataset-level
# metadata, then display the table itself.
# NOTE(review): this rebinds `df`, which an earlier cell uses for the
# dataset-metadata frame; re-running earlier cells after this one will see the
# EIA table instead - consider a dedicated name.

df = catalog.find_latest(namespace='eia', dataset='energy_consumption')
print(df.metadata.dataset)
df

DatasetMeta(namespace='eia', short_name='energy_consumption', title='Energy consumption (EIA, 2022)', description='Total energy consumption.', sources=[Source(name='U.S. Energy Information Administration', description=None, url='https://www.eia.gov/opendata/bulkfiles.php', source_data_url='https://api.eia.gov/bulk/INTL.zip', owid_data_url='https://walden.nyc3.digitaloceanspaces.com/eia/2022-07-27/international_energy_data.zip', date_accessed='2022-07-27', publication_date='2022-07-27', publication_year=2022, published_by=None, publisher_source=None)], licenses=[License(name='Public domain', url='https://www.eia.gov/about/copyrights_reuse.php')], is_public=True, additional_info=None, version='2022-07-27', source_checksum='ff47bf59dfb2300ba411d833c6e16f16')


Unnamed: 0_level_0,Unnamed: 1_level_0,members,values
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1980,AFG,28046.779297
Afghanistan,1981,AFG,31598.349609
Afghanistan,1982,AFG,33653.976562
Afghanistan,1983,AFG,41170.183594
Afghanistan,1984,AFG,41360.554688
...,...,...,...
Zimbabwe,2015,ZWE,196743.890625
Zimbabwe,2016,ZWE,167997.359375
Zimbabwe,2017,ZWE,165369.781250
Zimbabwe,2018,ZWE,171007.437500
