In [1]:
import pandas as pd 
import numpy as np
import requests
import boto3
import os
import json
from io import BytesIO
import base64
from bs4 import BeautifulSoup
import json5

# Directory the Vega-Lite chart specs are written to. The default is the
# original author's checkout; set the GD_CHARTS_PATH environment variable to
# point at the repository's charts directory on any other machine.
charts_path = os.environ.get("GD_CHARTS_PATH", "/Users/finn/Documents/GitHub/GrowthDiagnostics/charts")

In [91]:
def upload_series(df, name, bucket="eco-temp-cache"):
    """Upload a DataFrame to S3 as CSV and return the URL of the object.

    Args:
        df: DataFrame to serialise. Written without the index; dates are
            formatted as ``YYYY-MM-DD``.
        name: Base name of the object; it is stored under ``gd/{name}.csv``.
        bucket: Name of the S3 bucket to write to.

    Returns:
        The HTTPS URL of the uploaded object, built from ``bucket`` and the
        object key.
    """
    key = f"gd/{name}.csv"
    csv = df.to_csv(index=False, date_format='%Y-%m-%d')
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket, Key=key, Body=csv)
    # Build the URL from the bucket that was actually written to; previously a
    # non-default `bucket` still returned an eco-temp-cache URL.
    # NOTE(review): the eu-west-2 region is hard-coded, as before — confirm it
    # holds for any other bucket passed in.
    return f"https://{bucket}.s3.eu-west-2.amazonaws.com/{key}"

def prepare_spec(data_url, path, title, filter=None, parent_path="charts", multi_series=False, yAxisExpr=None):
    """Write a Vega-Lite v5 line-chart spec to ``{parent_path}/{path}.json``.

    The chart plots ``value`` against ``date`` from the CSV at ``data_url`` and
    always drops rows where ``datum.value`` is not greater than zero.

    Args:
        data_url: URL of the data file the spec points at.
        path: File name (without ``.json``) of the spec to write.
        title: Chart title text.
        filter: Optional extra Vega-Lite filter expression, applied after the
            built-in ``datum.value>0`` filter.
        parent_path: Directory the spec file is written into.
        multi_series: When true, colour the lines by the ``series`` field.
        yAxisExpr: Optional ``labelExpr`` for the y axis labels.

    Returns:
        None. The spec is written to disk as JSON.
    """
    transforms = [{"filter": "datum.value>0"}]
    if filter:
        transforms.append({"filter": filter})

    x_channel = {
        "field": "date",
        "type": "temporal",
        "axis": {"title": "", "titleColor": "#122B39"},
    }
    y_axis = {"title": "", "titleColor": "#122B39"}
    if yAxisExpr:
        y_axis["labelExpr"] = yAxisExpr
    y_channel = {"field": "value", "type": "quantitative", "axis": y_axis}

    encoding = {"x": x_channel, "y": y_channel}
    if multi_series:
        encoding["color"] = {"field": "series", "type": "nominal"}

    # Key order is kept stable so the serialised file does not churn.
    spec = {
        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
        "title": {"text": title},
        "transform": transforms,
        "data": {"url": data_url},
        "mark": {"type": "line"},
        "encoding": encoding,
    }

    target = f"{parent_path}/{path}.json"
    with open(target, "w") as out:
        out.write(json.dumps(spec))

In [68]:
def get_stats(df):
    """Summarise a time series held in ``date`` and ``value`` columns.

    Args:
        df: DataFrame with a datetime ``date`` column and a ``value`` column.

    Returns:
        A dict with:
          * ``latest`` / ``latest_date``: the most recent observation.
          * ``year_ago`` / ``year_ago_date``: the most recent observation dated
            at least one year before the latest date.
          * ``avg``: mean of ``value`` over the trailing ten years (or as much
            data as exists).
          * ``avg_year_count``: difference between the calendar year of the
            latest observation and that of the first observation in the
            trailing window.

    Raises:
        IndexError: if ``df`` is empty or has no observation at least one year
            older than the latest one.
    """
    ordered = df.sort_values('date')
    newest = ordered.iloc[-1]
    last_date = ordered.date.max()

    # Most recent observation and its date.
    newest_date = newest.date
    newest_value = newest.value

    # Closest observation that is at least one year older than the latest one.
    prior = ordered[ordered.date <= last_date - pd.DateOffset(years=1)].iloc[-1]

    # Trailing ten-year window (or everything we have, if the series is shorter).
    window = ordered[ordered.date > last_date - pd.DateOffset(years=10)]
    span_years = newest.date.year - window.iloc[0].date.year

    return {
        'latest': {'date': newest_date, 'value': newest_value},
        'latest_date': newest_date,
        'year_ago': {'date': prior.date, 'value': prior.value},
        'year_ago_date': prior.date,
        'avg': window.value.mean(),
        'avg_year_count': span_years
    }

def human_change(val1, val2, date1, date2, date_format="%b %Y", val_format=".1%", change_format=".3f", change_multiplier=1, change_units=""):
    """Describe in words how ``val1`` compares with the earlier ``val2``.

    Args:
        val1: Current value.
        val2: Earlier value being compared against.
        date1: Date of ``val1``; only used in the debug trace.
        date2: Date of ``val2``; formatted into the sentence.
        date_format: ``strftime`` format applied to ``date2``.
        val_format: Format spec used to render ``val2``.
        change_format: Format spec used to render the size of the change.
        change_multiplier: Factor applied to the change before formatting
            (e.g. 100 to express a fraction in percentage points).
        change_units: Suffix appended to the formatted change.

    Returns:
        ``"unchanged from ..."`` when the values agree to three decimal places,
        otherwise ``"up ..."`` or ``"down ..."`` with the size of the change.
        Returns None when the values cannot be ordered (e.g. NaN).
    """
    # Debug trace of the inputs; it appears in the notebook's recorded outputs.
    print(f"val1: {val1}, val2: {val2}, date1: {date1}, date2: {date2}")
    if np.round(val1, 3) == np.round(val2, 3):
        return f"unchanged from {date2.strftime(date_format)}"
    if val1 > val2:
        direction = "up"
    elif val1 < val2:
        direction = "down"
    else:
        # Neither comparison holds (e.g. NaN input): nothing sensible to say.
        return None
    # The direction word already carries the sign, so report the magnitude;
    # previously a fall rendered as a double negative such as "down -13".
    magnitude = abs(val1 - val2) * change_multiplier
    return f"{direction} {magnitude:{change_format}}{change_units} from {date2.strftime(date_format)} when it was {val2:{val_format}}"

def temporal_description(data, name="it", preamble="", val_format=".1%", change_format=".3f", change_multiplier=1, change_units="", date_format="%b %Y"):
    """Build a short prose summary of a time series.

    The text states the latest value, how it compares with the observation
    roughly a year earlier, and the trailing average, using ``get_stats`` for
    the numbers and ``human_change`` for the comparison wording.

    Args:
        data: DataFrame passed to ``get_stats``.
        name: Noun phrase for the quantity being described.
        preamble: Optional text placed before the summary; a trailing space is
            added if it does not already end with one.
        val_format: Format spec for the latest value and the average.
        change_format, change_multiplier, change_units, date_format: Passed
            through to ``human_change``.

    Returns:
        The assembled description string.
    """
    # Empty preambles and ones that already end in a space are left untouched.
    if preamble[-1:] not in ("", " "):
        preamble = preamble + " "

    stats = get_stats(data)
    latest = stats['latest']
    year_ago = stats['year_ago']

    headline = f"In {latest['date'].strftime('%B %Y')}, {name} was {latest['value']:{val_format}}."
    change = human_change(
        latest['value'],
        year_ago['value'],
        latest['date'],
        year_ago['date'],
        date_format,
        val_format,
        change_format,
        change_multiplier,
        change_units,
    )
    average = f"The average for the last {stats['avg_year_count']} years is {stats['avg']:{val_format}}."
    return f"{preamble}{headline} This is {change}. {average}"

def upload_description(description, name, bucket="eco-temp-cache"):
    """Upload a chart description to S3 and return the URL of the object.

    Args:
        description: Description to store; it is JSON-encoded before upload,
            so a plain string is written as a quoted JSON string.
        name: Base name of the object; stored under
            ``gd/descriptions/{name}.txt``.
        bucket: Name of the S3 bucket to write to.

    Returns:
        The HTTPS URL of the uploaded object, built from ``bucket`` and the
        object key.
    """
    key = f"gd/descriptions/{name}.txt"
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket, Key=key, Body=json.dumps(description))
    # Build the URL from the bucket that was actually written to; previously a
    # non-default `bucket` still returned an eco-temp-cache URL.
    # NOTE(review): the eu-west-2 region is hard-coded, as before — confirm it
    # holds for any other bucket passed in.
    return f"https://{bucket}.s3.eu-west-2.amazonaws.com/{key}"


# PISA Scores

In [63]:
# Load the PISA workbook: skip the 11 leading rows and drop the first column.
pisa_raw = pd.read_excel("/Users/finn/Documents/GitHub/GrowthDiagnostics/manual_data/pisa.xls", skiprows=11).iloc[:, 1:]

# Build the tidy (date, series, value) frame in one step instead of assigning
# columns onto a slice, which is the chained-assignment pattern pandas warns
# about. "Year/Study" is forward-filled so each row carries its year.
df = pd.DataFrame({
    "date": pd.to_datetime(pisa_raw["Year/Study"].ffill(), errors="coerce", format="%Y"),
    "series": pisa_raw["Jurisdiction"],
    "value": pd.to_numeric(pisa_raw["Average"], errors="coerce"),
}).dropna(subset=["date", "series", "value"])

# keep only series with at least 3 data points
df = df.groupby("series").filter(lambda x: len(x) >= 3)

# keep just UK and "International Average (OECD)", under short display names
df = (
    df[df["series"].isin(["United Kingdom", "International Average (OECD)"])]
    .assign(series=lambda d: d["series"].replace({"United Kingdom": "UK",
                                                  "International Average (OECD)": "OECD"}))
)
pisa_df = df.copy()

pisa_url = upload_series(df, "pisa")

prepare_spec(pisa_url, "human_capital_pisa", "PISA Scores", multi_series=True, parent_path=charts_path)


In [69]:
# Describe the UK series only.
uk_pisa = pisa_df.query("series=='UK'")
stats = get_stats(uk_pisa)
description = temporal_description(
    uk_pisa,
    name="the UK average PISA Score",
    preamble="The PISA test is an international test of student performance in reading, maths and science. ",
    val_format=".0f",
    change_format=".0f",
    change_multiplier=1,
    date_format="%b %Y",
)
description_url = upload_description(description, "human_capital_pisa")
print(description)


val1: 488.975085044251, val2: 501.769899185046, date1: 2022-01-01 00:00:00, date2: 2018-01-01 00:00:00
The PISA test is an international test of student performance in reading, maths and science. In January 2022, the UK average PISA Score was 489. This is down -13 from Jan 2018 when it was 502. The average for the last 7 years is 494.


# Quantity of Graduates

In [75]:
# Quantity with NVQ4+, whole UK, 16-64
url = "https://www.nomisweb.co.uk/api/v01/dataset/NM_17_1.data.csv?geography=2092957697&cell=403898630&measures=20100,20701"

# Built as a single chain so no column is assigned onto the result of a
# filter (the chained-assignment pattern pandas warns about).
df = (
    pd.read_csv(url)
    .assign(date=lambda d: pd.to_datetime(d["DATE_CODE"], format="%Y-%m"))
    # The response has one row per requested measure; keep the 'Value' rows.
    .query("MEASURES_NAME == 'Value'")
    .assign(value=lambda d: d["OBS_VALUE"])
    .loc[:, ["date", "value"]]
    .dropna(subset=["date", "value"])
)
grad_df = df.copy()

nvq4_url = upload_series(df, "nvq4")
prepare_spec(nvq4_url, "human_capital_nvq4", "Population with NVQ4+", parent_path=charts_path)

In [80]:
# Headline text for the NVQ4+ chart: counts rendered with thousands separators.
stats = get_stats(grad_df)
description = temporal_description(
    grad_df,
    name="the UK population aged 16-64 with degree level qualifications",
    preamble="",
    val_format=",.0f",
    change_format=",.0f",
    change_multiplier=1,
    change_units="",
    date_format="%b %Y",
)
description_url = upload_description(description, "human_capital_nvq4")
print(description)

val1: 17958300.0, val2: 17750700.0, date1: 2021-12-01 00:00:00, date2: 2020-12-01 00:00:00
In December 2021, the UK population aged 16-64 with degree level qualifications was 17,958,300. This is up 207,600 from Dec 2020 when it was 17,750,700. The average for the last 9 years is 15,706,010.


# A Levels

In [92]:
url = "https://www.nomisweb.co.uk/api/v01/dataset/NM_17_5.data.csv?geography=2092957697&variable=1911,720&measures=20599,21001,21002,21003"

# Built as a single chain so no column is assigned onto the result of a
# filter (the chained-assignment pattern pandas warns about). The second
# dropna of the original cell was redundant and has been folded away.
df = (
    pd.read_csv(url)
    .assign(date=lambda d: pd.to_datetime(d["DATE_CODE"], format="%Y-%m"))
    # The response has one row per requested measure; keep the 'Variable' rows.
    .query("MEASURES_NAME == 'Variable'")
    .assign(value=lambda d: d["OBS_VALUE"])
    .dropna(subset=["date", "value"])
    .loc[:, ["date", "value"]]
    # NOTE(review): the request asks for two variables (1911, 720); the > 25
    # cut-off presumably separates the NVQ3+ percentage from the other one —
    # confirm, and ideally filter on the variable code instead.
    .query("value > 25")
    # Percentages become fractions so the chart/axis can format them as '%'.
    .assign(value=lambda d: d["value"] / 100)
)
nvq3_df = df.copy()

nvq3_url = upload_series(df, "nvq3")
prepare_spec(nvq3_url, "human_capital_nvq3", "Population with NVQ3+, %", parent_path=charts_path, yAxisExpr="format('.0%', datum.value)")



In [85]:
# Inspect the NVQ3+ series (date, value) copied from `df` in the cell above.
nvq3_df

Unnamed: 0,date,value
4,2004-12-01,43.7
36,2005-12-01,44.3
68,2006-12-01,45.2
100,2007-12-01,46.1
132,2008-12-01,46.0
164,2009-12-01,47.2
196,2010-12-01,48.8
228,2011-12-01,50.5
260,2012-12-01,52.9
292,2013-12-01,53.7


In [88]:
# Headline text for the NVQ3+ chart. Values are fractions, so they are shown
# as percentages and the change is scaled by 100 into percentage points.
stats = get_stats(nvq3_df)
description = temporal_description(nvq3_df, 
                                   # No trailing space after the comma: the template adds " was",
                                   # and the extra space rendered as "equivalents,  was".
                                   "the percentage of 16-64 year olds with at least A-level qualifications, or equivalents,", 
                                    "",
                                   val_format=".2%",
                                   change_format=".3f",
                                   change_multiplier=100,
                                   change_units="pp",
                                   date_format="%b %Y")
description_url = upload_description(description, "human_capital_nvq3")
print(description)

val1: 0.614, val2: 0.612, date1: 2021-12-01 00:00:00, date2: 2020-12-01 00:00:00
In December 2021, the percentage of 16-64 year olds with at least A-level qualifications, or equivalents,  was 61.40%. This is up 0.200pp from Dec 2020 when it was 61.20%. The average for the last 9 years is 56.91%.


In [None]:


# NOTE(review): leftover scratch cell. By this point `df` has already been
# reduced to the "date"/"value" columns, so re-deriving `value` from
# OBS_VALUE raised a KeyError on Restart & Run All. Only the inspection of
# `df` is kept; consider deleting this cell entirely.
df