In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import decorate, configure_plot_style, AIBM_COLORS

configure_plot_style()

In [None]:
# Country label -> three-letter country code. The keys are matched against the
# PIAAC `country` column further down, and the codes are used to line PIAAC
# rows up with the OWID data, which is indexed by `Code`.
# NOTE(review): 'Poland*' presumably mirrors a footnote marker in the OECD
# spreadsheet label -- confirm against the sheet.
# NOTE: this name is rebound later in the notebook (as the inverse of
# utils.iso_country_dict), so re-running the PIAAC cells after that point
# uses a different mapping.
country_to_iso_dict = {
    'Austria': 'AUT',
    'Canada': 'CAN',
    'Chile': 'CHL',
    'Czechia': 'CZE',
    'Denmark': 'DNK',
    'Estonia': 'EST',
    'Finland': 'FIN',
    'France': 'FRA',
    'Germany': 'DEU',
    'Hungary': 'HUN',
    'Ireland': 'IRL',
    'Israel': 'ISR',
    'Italy': 'ITA',
    'Japan': 'JPN',
    'Korea': 'KOR',
    'Latvia': 'LVA',
    'Lithuania': 'LTU',
    'Netherlands': 'NLD',
    'New Zealand': 'NZL',
    'Norway': 'NOR',
    'Poland*': 'POL',
    'Portugal': 'PRT',
    'Slovak Republic': 'SVK',
    'Spain': 'ESP',
    'Sweden': 'SWE',
    'Switzerland': 'CHE',
    'United States': 'USA'
}



# Literacy

From the WEF report (emphasis added):

> Literacy rate, %
>
>Percentage of the adult population (women and men over 15 years of age) with the ability to
both read and write and make simple arithmetic calculations. *For advanced economies for which
data was unavailable in the last 10 years, the authors assumed based on older data that the
gender gap on literacy rate is closed.* 
>
>Period: 2023 or most recent year available. Source: UNESCO, UIS.Stat education statistics
data portal; when not available, data is sourced from the UNDP Human Development Reports, most
recent data available.


UNESCO stopped tracking OECD countries a while ago (more than 10 years, it seems, based on OWID data). So the WEF gives them a pass. But in many of those countries, women now surpass men in literacy, as we can see in data from the OECD Survey of Adult Skills (PIAAC).

## Our World In Data

Lightly processed data from UNESCO

In [None]:
import requests

# The CSV and metadata endpoints share the same chart slug and query string.
OWID_CHART_URL = "https://ourworldindata.org/grapher/literacy-rate-of-young-men-and-women"
OWID_QUERY = "?v=1&csvType=full&useColumnShortNames=true"

# Fetch the data (an explicit User-Agent header is sent with the request).
owid = pd.read_csv(
    f"{OWID_CHART_URL}.csv{OWID_QUERY}",
    storage_options={'User-Agent': 'Our World In Data data fetch/1.0'},
)

# Fetch the metadata. Use a timeout so a stalled connection cannot hang the
# notebook, and fail loudly on an HTTP error instead of trying to parse an
# error page as JSON.
metadata_response = requests.get(f"{OWID_CHART_URL}.metadata.json{OWID_QUERY}", timeout=30)
metadata_response.raise_for_status()
metadata = metadata_response.json()

In [None]:
metadata

In [None]:
# Friendlier column names for the OWID download. The assignment is positional,
# so it relies on the file having exactly these six columns in this order.
OWID_COLUMNS = ['Entity', 'Code', 'Year', 'literacy_male', 'literacy_female', 'owid_region']
owid.columns = OWID_COLUMNS

In [None]:
# Peek at the renamed OWID frame (no `df` exists at this point on a fresh kernel).
owid.head()

In [None]:
# Spot-check one country: every OWID row for Afghanistan.
owid[owid['Code'] == 'AFG']

In [None]:
# One row per country code. GroupBy.last() keeps the last non-null entry of
# each column within a group, so this relies on the rows already being in
# chronological order, and the male and female values may come from different
# years. Rows without a Code are dropped by the groupby.
owid_last = owid.groupby(["Code"]).last()

# Female-to-male ratio, folded into a symmetric score: 1 means parity and the
# score falls by the same amount whichever sex is ahead.
female_to_male = owid_last['literacy_female'] / owid_last['literacy_male']
owid_last['ratio'] = female_to_male
owid_last['score'] = 1 - (female_to_male - 1).abs()
owid_last.shape


In [None]:
owid_last.head()

In [None]:
# The 20 rows at the bottom of the descending sort. Missing scores sort last
# by default, so any NaN rows show up at the very end.
owid_last.sort_values('score', ascending=False).iloc[-20:]

In [None]:
# Countries for which no score could be computed.
missing = owid_last['score'].isnull()
owid_last.loc[missing]

In [None]:
owid_last['score'].describe()

## OECD PIAAC

From https://www.oecd.org/en/about/programmes/piaac.html

>The Programme for the International Assessment of Adult Competencies (PIAAC) is a programme of assessment and analysis of adult skills. The major product of PIAAC is the Survey of Adult Skills, an international computer-based household survey of adults aged 16-65 years. It is designed as 10-yearly cycles.

>The Survey measures adults’ proficiency in key information-processing skills - literacy, numeracy and problem solving – which represent skills needed for individuals to participate in society and for economies to prosper. It also gathers information and data on how adults use their skills at home and at work.

>The 1st Cycle of the Survey of Adult Skills was conducted over three separate rounds between 2011 and 2018 in 39 countries. During the 1st Cycle, about 245 000 adults were interviewed, representing 1.15 billion people.

>The 2nd Cycle of the Survey of Adults Skills has been conducted in 31 countries and economies so far. A first round of data collection took place in 2022-2023 with results released on 10 December 2024.  




I downloaded two tables from https://www.oecd.org/en/publications/do-adults-have-the-skills-they-need-to-thrive-in-a-changing-world_b263dc5d-en/full-report/tables-of-results-for-countries-and-economies_0432d7e4.html#annex-d1e19379-dc605a08b8:

* Table A.2.7 (L): Literacy proficiency, by gender

* Table A.2.8 (L): Literacy proficiency, by gender and age group

Both are sheets in this Excel spreadsheet: https://stat.link/eb8dxq

The first table includes "Percentage of low performers (scoring at Level 1 or below)", which we will use as a replacement for illiteracy.

Here's the description of Level 1:

> Adults at Level 1 are able to locate information on a text page, find a relevant link from a website, and identify relevant text among multiple options when the relevant information is explicitly cued. They can understand the meaning of short texts, as well as the organization of lists or multiple sections within a single page.
>
>The texts at level 1 may be continuous, noncontinuous, or mixed and pertain to printed or digital environments. They typically include a single page with up to a few hundred words and little or no distracting information. Noncontinuous texts may have a list structure (such as a web search engine results page) or include a small number of independent sections, possibly with pictorial illustrations or simple diagrams. Tasks at Level 1 involve simple questions providing some guidance as to what needs to be done and a single processing step. There is a direct, fairly obvious match between the question and target information in the text, although some tasks may require the examination of more than one piece of information.

We'll flip the sense by computing:

`literacy rate = (100 - Percentage of low performers)`

Here's the data.


In [None]:
# Table A.2.7 (L): literacy proficiency by gender. The header and footer row
# counts are tuned to the layout of this particular sheet.
piaac = pd.read_excel('eb8dxq.xlsx', sheet_name='A.2.7 (L)', skiprows=6, skipfooter=11)

# Positional column names. Columns this notebook does not use get distinct
# placeholder names so that every label in the frame is unique (duplicate
# labels make column selection return a DataFrame instead of a Series).
piaac.columns = [
    'country', 'mean', 'se',
    'male_mean', 'male_mean_se', 'female_mean', 'female_mean_se',
    'diff', 'diff_se',
    'unused_1', 'unused_2', 'unused_3', 'unused_4',
    'male_percent', 'male_percent_se', 'female_percent', 'female_percent_se',
    'unused_5', 'unused_6',
]

piaac.head()

In [None]:
# Attach a country code to each PIAAC row; labels that are not keys of
# country_to_iso_dict come out as NaN.
piaac['code'] = piaac['country'].map(country_to_iso_dict)

Here's the ratio intended to be comparable to the WEF equity scores, except that it's symmetric.

In [None]:
# Flip "percent of low performers" into "percent above Level 1" for each sex,
# then take the female-to-male ratio.
female_above_level1 = 100 - piaac['female_percent']
male_above_level1 = 100 - piaac['male_percent']
piaac['ratio'] = female_above_level1 / male_above_level1
piaac['ratio'].describe()

In [None]:
# Symmetric score: 1 at parity, reduced by the distance of the ratio from 1
# in either direction.
piaac['score'] = 1 - (piaac['ratio'] - 1).abs()
piaac['score'].describe()

In [None]:
from utils import add_title, add_subtext, add_logo

# Order the rows by the female share of low performers, then convert both
# sexes to "percent above Level 1" for plotting.
piaac_sorted = piaac.sort_values(by='female_percent')
countries = piaac_sorted['country']
pct_male = 100 - piaac_sorted['male_percent']
pct_female = 100 - piaac_sorted['female_percent']

# One gray connector per country with a marker for each sex at its ends.
fig, ax = plt.subplots(figsize=(6, 8))
ax.hlines(countries, pct_male, pct_female, color=AIBM_COLORS['light_gray'])
ax.plot(pct_male, countries, 's', color=AIBM_COLORS['green'], label='Male')
ax.plot(pct_female, countries, 'o', color=AIBM_COLORS['orange'], label='Female')
ax.invert_yaxis()

decorate(xlabel='Percent performing above Level 1', ylim=[len(piaac), 0.5])
add_title("In Most OECD Countries Men Lag Women in Literacy",
          "Based on the PIAAC Survey of Adult Skills", y=1.01)
add_subtext("Source: OECD PIAAC", y=-0.05)
logo = add_logo(location=(1.0, -0.05))

### PIAAC by age and gender

The percentages in the previous table include all adults, so they are comparable to WEF literacy percentages in that sense.
But adult literacy is a long-lagging indicator of equity in primary and secondary education. To get a sense of generational shifts, we can split the PIAAC data by age group, this time using average scores rather than percentages above or below Level 1. 

In [None]:
# Table A.2.8 (L): literacy proficiency by gender and age group. The header
# and footer row counts are tuned to the layout of this particular sheet.
piaac2 = pd.read_excel('eb8dxq.xlsx', sheet_name='A.2.8 (L)', skiprows=7, skipfooter=11)

# Positional column names. The column following each age-group value is not
# used here; each gets its own placeholder name so labels stay unique.
piaac2.columns = [
    'country', 'mean', 'se',
    'male_1624', 'unused_1',
    'male_2544', 'unused_2',
    'male_4565', 'unused_3',
    'female_1624', 'unused_4',
    'female_2544', 'unused_5',
    'female_4565', 'unused_6',
]
piaac2.head()

In [None]:
# Female minus male score in each age band; positive means women score higher.
diff_1624 = piaac2['female_1624'].sub(piaac2['male_1624'])
diff_2544 = piaac2['female_2544'].sub(piaac2['male_2544'])
diff_4565 = piaac2['female_4565'].sub(piaac2['male_4565'])


In [None]:

# Collect the three per-age-band gaps next to the country label.
diffs = pd.DataFrame({
    'country': piaac2['country'],
    'diff_1624': diff_1624,
    'diff_2544': diff_2544,
    'diff_4565': diff_4565,
})
diffs.head()

In [None]:
from utils import add_title, add_subtext, add_logo

# Rank countries by the gap in the youngest band, largest first.
diffs_sorted = diffs.sort_values(by='diff_1624', ascending=False)
country = diffs_sorted['country']

# (column, marker, color key, legend label) for each age band, in draw order.
age_bands = [
    ('diff_1624', '>', 'blue', 'Ages 16-24'),
    ('diff_2544', 'o', 'purple', 'Ages 25-44'),
    ('diff_4565', '<', 'amber', 'Ages 45-65'),
]

fig, ax = plt.subplots(figsize=(6, 8))

# Gray connectors from the youngest and the oldest band to the middle band.
for outer_band in ('diff_1624', 'diff_4565'):
    ax.hlines(country, diffs_sorted[outer_band], diffs_sorted['diff_2544'],
              color=AIBM_COLORS['light_gray'])

for column, marker, color_key, label in age_bands:
    ax.plot(diffs_sorted[column], country, marker,
            color=AIBM_COLORS[color_key], label=label)
ax.invert_yaxis()

decorate(xlabel='Gender gap in average literacy scores', ylim=[len(piaac2), 0.5])
add_title("In Some Countries Gaps Are Widest Among the Young",
          "", y=1.01)
add_subtext("Source: OECD PIAAC", y=-0.05)
logo = add_logo(location=(1.0, -0.05))

In some countries the gender gap is widest among young people.
In these countries, the use of a lagging indicator might understate the degree of inequality.

Not to make too much of this point -- it is probably a reasonable thing to include in the index one measure that is an aggregation of the entire population, along with snapshots of current education.

### Merge PIAAC into OWID

In [None]:
# Start from the OWID score, then let the PIAAC score take precedence for
# every country code that appears in both sources.
# NOTE(review): PIAAC countries whose code is absent from owid_last are
# skipped, not added -- confirm that is intended.
owid_last['merged_score'] = owid_last['score']
for code, piaac_score in zip(piaac['code'], piaac['score']):
    if code in owid_last.index:
        owid_last.at[code, 'merged_score'] = piaac_score

In [None]:
# How many country codes each source has, and how many they share.
owid_codes = set(owid['Code'])
piaac_codes = set(piaac['code'])
overlap = owid_codes.intersection(piaac_codes)
tuple(len(codes) for codes in (owid_codes, piaac_codes, overlap))

In [None]:
# Rows of owid_last whose country code is present in both sources.
owid_last.loc[owid_last.index.isin(overlap)]


## WEF data

In [None]:

import os
from extract_pdf_data import read_pdfs

# Only run the PDF extraction when the cached CSV is not already on disk.
WEF_CSV = "wef_literacy_rate.csv"
csv_is_cached = os.path.exists(WEF_CSV)
if not csv_is_cached:
    wef = read_pdfs('literacy')
    wef.to_csv(WEF_CSV, index=False)

In [None]:
from utils import read_wef_file

# Load the cached WEF literacy table through the project helper (its parsing
# details are not visible in this notebook) and report its dimensions.
wef = read_wef_file("wef_literacy_rate.csv")
wef.shape

In [None]:
wef.head()

In [None]:
wef.tail()

In [None]:
wef['rank'].value_counts(dropna=False).head()

In [None]:
# Flag the countries whose WEF score is below 1, and count them.
dinged = wef['score'].lt(1)
dinged.sum()

In [None]:
from empiricaldist import Cdf

# Empirical CDF of the WEF score, restricted to the rows flagged in `dinged`
# (score below 1).
cdf_score = Cdf.from_seq(wef.loc[dinged, 'score'])
cdf_score.plot(label='WEF')
decorate(xlabel='Literacy Score', ylabel='CDF')

In [None]:
from utils import iso_country_dict

# Invert the code -> country mapping so WEF country names can be turned into
# codes. This rebinds country_to_iso_dict, replacing the hand-written PIAAC
# mapping defined near the top of the notebook.
country_to_iso_dict = {country: code for code, country in iso_country_dict.items()}


In [None]:
# Give every WEF row the merged OWID/PIAAC score for its country.
# The value is written back at the row's own index label `i`: writing at
# `code` would append new rows keyed by country code whenever the frame's
# index is not the code itself, leaving the real rows unfilled.
# Initialising the column first keeps the cell idempotent on re-run; rows with
# no match in owid_last stay NaN.
wef['revised_score'] = np.nan
for i, row in wef.iterrows():
    code = country_to_iso_dict[row['country']]
    if code in owid_last.index:
        wef.at[i, 'revised_score'] = owid_last.at[code, 'merged_score']

In [None]:
wef.sort_values(by='revised_score', ascending=False).tail(20)

Here's the distribution of revised scores.

In [None]:
# Overlay the original WEF distribution (cdf_score, built earlier from the
# rows with score below 1) and the distribution of the revised scores.
cdf_score.plot(label='WEF')
cdf_revised = Cdf.from_seq(wef['revised_score'])
cdf_revised.plot(label='WEF (Revised)')
decorate(xlabel='Revised Literacy Score', ylabel='CDF')

In [None]:
# Rows NOT flagged in `dinged` (WEF score not below 1), ordered by revised score.
# The WEF table is held in `wef` (there is no `df` on a fresh kernel), and both
# `dinged` and 'revised_score' were derived from / written to `wef`.
# NOTE(review): assumes read_wef_file supplies a 'ratio' column -- confirm.
revised = wef[~dinged].dropna(subset=['ratio']).sort_values('revised_score')
revised.shape

In [None]:
# For each country, draw a horizontal gray line connecting the original WEF
# score to the revised score, with a '|' tick at the original and a
# left-pointing triangle at the revised value.

from utils import add_title, add_subtext, add_logo

countries = revised['country']
original_score = revised['score']
revised_score = revised['revised_score']
marker_color = AIBM_COLORS['blue']

fig, ax = plt.subplots(figsize=(6, 15))
ax.hlines(countries, original_score, revised_score,
          color=AIBM_COLORS['light_gray'])
ax.plot(original_score, countries, '|', color=marker_color)
ax.plot(revised_score, countries, '<', color=marker_color)
ax.invert_yaxis()

decorate(xlabel='Literacy Rate', ylim=[len(revised)+1, -1])
add_title("Revised Scores Are Very Different For Many Countries",
          "Literacy rate", y=1.01)
add_subtext("Source: World Economic Forum", y=-0.05)
logo = add_logo(location=(1.0, -0.05))



And here's the new ranking of countries based on revised scores.

In [None]:
# Rank countries by revised score (best first) and save the table.
# The WEF table is held in `wef`; there is no `df` on a fresh kernel.
# Rows with a missing revised score sort to the end and still receive a rank.
# NOTE(review): assumes `wef` carries 'ratio' and 'diff' columns -- confirm.
columns = ['revised_rank', 'country', 'score', 'revised_score', 'diff']
df_sorted = wef.dropna(subset=['ratio']).sort_values(by='revised_score', ascending=False)
df_sorted['revised_rank'] = np.arange(1, len(df_sorted)+1)
table = df_sorted[columns].round(2)
table.to_csv("wef_literacy_rate_table.csv", index=False)

In [None]:


df_sorted[columns].head(40).round(2)

In [None]:
df_sorted[columns].tail(50).round(2)

In [None]:
# Spot-check a single country in the WEF table (`df` is not defined on a
# fresh kernel; the table is `wef`).
wef.query("country == 'Qatar'")

In [None]:
import seaborn as sns

# Compare the distribution of the original WEF scores with the revised,
# symmetric scores. Both columns live on `wef`; there is no `df` on a fresh
# kernel.
# cut=0 stops each density curve at the observed min/max of its data.
options = dict(cut=0, bw_adjust=0.7)

sns.kdeplot(wef['score'], label='WEF truncated scores', **options)
sns.kdeplot(wef['revised_score'], label='Revised symmetric scores', **options)

decorate(xlabel='Gender equality score')

add_title("The Distribution of Scores Is Very Different",
          "Literacy rate")
add_subtext("Source: World Economic Forum", y=-0.25)
logo = add_logo(location=(1.0, -0.25))
# Trailing None keeps the cell from echoing a return value.
None

## UNESCO Data

UNESCO, UIS.Stat Education statistics data portal. When not available, data is sourced from United Nations Development Programme, Human Development Reports, most recent year available between 2013 and 2023.

https://databrowser.uis.unesco.org/view#=countries&geoUnits=&timeMode=range&view=table&chartMode=multiple&chartHighlightSeries=&chartHighlightEnabled=true&indicatorPaths=UIS-SDG4Monitoring%3A0%3ALR.GALP.AG15T24.F%2CUIS-SDG4Monitoring%3A0%3ALR.GALP.AG15T24.M%2CUIS-SDG4Monitoring%3A0%3ALR.GALP.AG15T99.F%2CUIS-SDG4Monitoring%3A0%3ALR.GALP.AG15T99.M%2CUIS-SDG4Monitoring%3A0%3ALR.GALP.AG25T64.F%2CUIS-SDG4Monitoring%3A0%3ALR.GALP.AG25T64.M&geoMode=countries&tableIndicatorId=LR.GALP.AG15T99.F&years=2013%2C2022


In [None]:
# Local copy of the UNESCO export; the folder name lists the indicator ids
# that were included in the download.
filename = "indicator-data-export_LR.GALP.AG15T24.F_LR.GALP.AG15T24.M_LR.GALP.AG15T99.F_and_3_more/data.csv"

unesco = pd.read_csv(filename)
unesco.head()


In [None]:
from utils import iso_country_dict

# Translate each geoUnit code into a country name; codes that are not in
# iso_country_dict become NaN.
unesco['country'] = unesco['geoUnit'].map(iso_country_dict)
unesco.head()

In [None]:
unesco.query('geoUnit == "CAN"')

In [None]:

# From each (indicator, country) group keep the last row, i.e. the most
# recent one provided the export is in chronological order. Note that
# GroupBy.last() takes the last non-null entry of each column.
grouped = unesco.groupby(["indicatorId", "geoUnit"]).last()

# 'AFG' is a value of the `geoUnit` index level, not a column label, so take
# a cross-section on that level (`.loc[:, 'AFG']` would look for a column
# named 'AFG' and raise KeyError).
grouped.xs('AFG', level='geoUnit')


In [None]:
group = 'youth'

# Age-band token used inside the UNESCO indicator ids.
AGE_GROUP_CODES = {
    'adults': 'AG25T64',
    'youth': 'AG15T24',
}
age_group = AGE_GROUP_CODES[group]

# Pull the male and female series for the chosen age band and re-index each
# by country name.
rates_by_sex = {}
for sex in ('M', 'F'):
    rates = grouped.loc[f'LR.GALP.{age_group}.{sex}']
    rates.index = rates['country']
    rates_by_sex[sex] = rates

male = rates_by_sex['M']
female = rates_by_sex['F']

male.head()

In [None]:
# Put the male and female tables side by side, aligned on their shared
# country index; `keys` adds an outer column level ('male' / 'female').
literacy = pd.concat([male, female], axis=1, keys=["male", "female"])
literacy.head()

In [None]:
# Look up the UNESCO ratio for each WEF country, matched on country name.
# The WEF table is held in `wef`; there is no `df` on a fresh kernel.
# NOTE(review): `literacy` is built by a concat with keys ("male", "female"),
# and no 'ratio' column is computed for it anywhere in the visible notebook,
# so this lookup raises KeyError until a female/male ratio is added.

wef['unesco_ratio'] = literacy['ratio'].reindex(wef['country']).values
wef[['country', 'score', 'unesco_ratio']].head(30)