In [2]:
import pandas as pd 
import numpy as np
import requests
import boto3
import os
import json
from io import BytesIO
import base64
from bs4 import BeautifulSoup
import json5

# Directory the Vega-Lite chart specs are written to. The original absolute
# path stays as the default so existing runs are unaffected; set the
# GD_CHARTS_PATH environment variable to point at a different checkout.
charts_path = os.environ.get(
    "GD_CHARTS_PATH",
    "/Users/finn/Documents/GitHub/GrowthDiagnostics/charts",
)

In [3]:
def upload_series(df, name, bucket="eco-temp-cache"):
    """Upload a DataFrame to S3 as CSV and return the object's URL.

    Args:
        df: Frame to serialise. The index is not written and dates are
            formatted as ``YYYY-MM-DD``.
        name: Base name of the object; it is stored under ``gd/{name}.csv``.
        bucket: Name of the S3 bucket to write to.

    Returns:
        str: Virtual-hosted-style URL of the uploaded object.
    """
    key = f"gd/{name}.csv"
    csv = df.to_csv(index=False, date_format='%Y-%m-%d')
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=csv)
    # Derive the URL from the bucket that was actually written to. It used to
    # be hard-coded to the default bucket, so a custom `bucket` got a wrong link.
    # NOTE(review): the region is still fixed to eu-west-2, as in the original URL.
    return f"https://{bucket}.s3.eu-west-2.amazonaws.com/{key}"

def prepare_spec(data_url, path, title, filter=None, parent_path="charts", multi_series=False, yAxisExpr=None):
    """Write a Vega-Lite v5 line-chart spec to ``{parent_path}/{path}.json``.

    The chart plots the ``value`` field against the temporal ``date`` field of
    the data at ``data_url`` and always applies a ``datum.value>0`` filter.

    Args:
        data_url: URL the spec's ``data`` section points at.
        path: File name (without extension) of the spec to write.
        title: Chart title text.
        filter: Optional extra Vega filter expression, applied after the
            built-in positive-value filter.
        parent_path: Directory the JSON file is written into.
        multi_series: When true, colour the lines by the nominal ``series`` field.
        yAxisExpr: Optional Vega ``labelExpr`` for the y-axis labels.

    Returns:
        None. The spec is written to disk as a side effect.
    """
    def _untitled_axis():
        # Fresh dict per axis so the x and y axes never share state.
        return {"title": "", "titleColor": "#122B39"}

    transforms = [{"filter": "datum.value>0"}]
    if filter:
        transforms.append({"filter": filter})

    y_axis = _untitled_axis()
    if yAxisExpr:
        y_axis["labelExpr"] = yAxisExpr

    encoding = {
        "x": {"field": "date", "type": "temporal", "axis": _untitled_axis()},
        "y": {"field": "value", "type": "quantitative", "axis": y_axis},
    }
    if multi_series:
        encoding["color"] = {"field": "series", "type": "nominal"}

    chart = {
        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
        "title": {"text": title},
        "transform": transforms,
        "data": {"url": data_url},
        "mark": {"type": "line"},
        "encoding": encoding,
    }

    with open(f"{parent_path}/{path}.json", "w") as out:
        json.dump(chart, out)

In [4]:
def get_stats(df):
    """Summarise a time series: latest value, year-ago value and 10-year mean.

    Args:
        df: DataFrame with a datetime ``date`` column and a numeric ``value``
            column.

    Returns:
        dict with keys:
            ``latest`` / ``year_ago``: ``{'date', 'value'}`` of the most recent
            row and of the last row dated at least one year before the latest
            date (or the earliest row when the series spans less than a year);
            ``latest_date`` / ``year_ago_date``: the same dates again;
            ``avg``: mean value over rows from the last ten years;
            ``avg_year_count``: calendar-year difference between the latest
            row and the first row inside that ten-year window.

    Raises:
        IndexError: if ``df`` has no rows.
    """
    df = df.sort_values('date')
    newest_date = df['date'].max()

    # most recent observation and its date
    latest = df.iloc[-1]
    latest_date = latest['date']
    latest_value = latest['value']

    # observation as close to one year before the latest date as possible
    at_least_a_year_old = df[df['date'] <= newest_date - pd.DateOffset(years=1)]
    if len(at_least_a_year_old) > 0:
        year_ago = at_least_a_year_old.iloc[-1]
    else:
        # Less than a year of history: nothing is a full year old, so use the
        # earliest observation instead of failing with an IndexError.
        year_ago = df.iloc[0]
    year_ago_date = year_ago['date']
    year_ago_value = year_ago['value']

    # average over the last 10 years (or as many as we have); the window is
    # built once and reused for both the mean and the year count
    window = df[df['date'] > newest_date - pd.DateOffset(years=10)]
    avg = window['value'].mean()
    avg_year_count = latest_date.year - window.iloc[0]['date'].year

    return {
        'latest': {'date': latest_date, 'value': latest_value},
        'latest_date': latest_date,
        'year_ago': {'date': year_ago_date, 'value': year_ago_value},
        'year_ago_date': year_ago_date,
        'avg': avg,
        'avg_year_count': avg_year_count
    }

def human_change(val1, val2, date1, date2, date_format="%b %Y", val_format=".1%", change_format=".3f", change_multiplier=1, change_suffix="", val_suffix="", val_multiplier=1):
    """Describe in words how ``val1`` compares with the earlier ``val2``.

    Args:
        val1: Current value.
        val2: Comparison value observed at ``date2``.
        date1: Date of ``val1``; accepted but not used in the text.
        date2: Date of ``val2``; formatted with ``date_format``.
        date_format: ``strftime`` pattern for ``date2``.
        val_format: Format spec applied to ``val2 * val_multiplier``.
        change_format: Format spec applied to the size of the change.
        change_multiplier: Scale applied to the change before formatting.
        change_suffix: Text appended to the formatted change.
        val_suffix: Text appended to the formatted comparison value.
        val_multiplier: Scale applied to ``val2`` before formatting.

    Returns:
        str: ``"unchanged from ..."`` when both values agree to three decimals,
        otherwise ``"up ..."`` or ``"down ..."`` followed by the size of the
        change. ``None`` when the values cannot be ordered (e.g. NaN).
    """
    when = date2.strftime(date_format)
    if np.round(val1, 3) == np.round(val2, 3):
        return f"unchanged from {when}"

    if val1 > val2:
        direction = "up"
    elif val1 < val2:
        direction = "down"
    else:
        # Neither larger nor smaller (NaN): keep the original "no text" result.
        return None

    # The direction word already carries the sign, so report the magnitude;
    # the old "down" branch rendered e.g. "down -0.500".
    magnitude = abs(val1 - val2) * change_multiplier
    return f"{direction} {magnitude:{change_format}}{change_suffix} from {when} when it was {val2*val_multiplier:{val_format}}{val_suffix}"

def temporal_description(data, name="it", preamble="", val_format=".1%", change_format=".3f", val_multiplier=1, change_multiplier=1, change_suffix="", date_format="%b %Y", val_suffix=""):
    """Build a three-sentence narrative for a time series.

    The text states the latest value, how it compares with the value a year
    earlier (via ``human_change``) and the recent average (via ``get_stats``).

    Args:
        data: Frame passed straight to ``get_stats``.
        name: How the series is referred to in the first sentence.
        preamble: Optional lead-in; a trailing space is added when missing.
        val_format: Format spec for the values.
        change_format: Format spec for the change, forwarded to ``human_change``.
        val_multiplier: Scale applied to values before formatting.
        change_multiplier: Scale applied to the change before formatting.
        change_suffix: Text appended to the formatted change.
        date_format: ``strftime`` pattern for the year-ago date.
        val_suffix: Text appended to each formatted value.

    Returns:
        str: The assembled description.
    """
    if len(preamble)>0 and preamble[-1]!=" ":
        preamble += " "
    stats = get_stats(data)
    latest = stats['latest']
    year_ago = stats['year_ago']
    change = human_change(latest['value'], year_ago['value'], latest['date'], year_ago['date'], date_format, val_format, change_format, change_multiplier, change_suffix, val_suffix=val_suffix, val_multiplier=val_multiplier)
    # The first sentence used to end in " ." (stray space before the full stop).
    return (
        f"{preamble}In {latest['date'].strftime('%B %Y')}, {name} was {latest['value']*val_multiplier:{val_format}}{val_suffix}. "
        f"This is {change}. "
        f"The average for the last {stats['avg_year_count']} years is {stats['avg']*val_multiplier:{val_format}}{val_suffix}."
    )

def upload_description(description, name, bucket="eco-temp-cache"):
    """Upload a chart description to S3 and return the object's URL.

    Args:
        description: Text to store; it is JSON-encoded before upload.
        name: Base name of the object; it is stored under
            ``gd/descriptions/{name}.txt``.
        bucket: Name of the S3 bucket to write to.

    Returns:
        str: Virtual-hosted-style URL of the uploaded object.
    """
    key = f"gd/descriptions/{name}.txt"
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=json.dumps(description))
    # Use the bucket that was written to; the URL was previously hard-coded to
    # the default bucket regardless of the `bucket` argument.
    # NOTE(review): the region is still fixed to eu-west-2, as in the original URL.
    return f"https://{bucket}.s3.eu-west-2.amazonaws.com/{key}"

def short_description(stats, var_name, date_format="%b %Y", val_format=".1%", val_multiplier=1, val_suffix=""):
    """Render a compact three-sentence summary from a ``get_stats``-style dict.

    Args:
        stats: Mapping with ``latest``/``year_ago`` (each holding ``value``),
            ``year_ago_date``, ``avg`` and ``avg_year_count``.
        var_name: Subject of the first sentence.
        date_format: ``strftime`` pattern for the year-ago date.
        val_format: Format spec applied to every value.
        val_multiplier: Scale applied to every value before formatting.
        val_suffix: Text appended to every formatted value.

    Returns:
        str: Latest value, direction versus a year ago (compared at three
        decimals) and the historical average.
    """
    def _shown(raw):
        # Every value in the text shares the same scale, format and suffix.
        return f"{raw*val_multiplier:{val_format}}{val_suffix}"

    current = stats['latest']['value']
    previous = stats['year_ago']['value']
    current_rounded = np.round(current, 3)
    previous_rounded = np.round(previous, 3)

    if current_rounded == previous_rounded:
        direction = 'unchanged'
    elif current_rounded > previous_rounded:
        direction = "up"
    else:
        direction = "down"

    since = stats['year_ago_date'].strftime(date_format)
    return (
        f"{var_name} is {_shown(current)}. "
        f"It is {direction} from {since} ({_shown(previous)}). "
        f"The UK historical average (past {stats['avg_year_count']} years) is {_shown(stats['avg'])}."
    )


In [5]:
def upload_to_datahub(df, name, bucket="eco-temp-cache"):
    """Upload a DataFrame to S3 as record-oriented JSON and return its URL.

    Args:
        df: Frame to serialise with ``orient="records"`` and ISO dates.
        name: Base name of the object; it is stored under ``gbr/{name}.json``.
        bucket: Name of the S3 bucket to write to.

    Returns:
        str: Virtual-hosted-style URL of the uploaded object.
    """
    key = f"gbr/{name}.json"
    json_data = df.to_json(orient="records", date_format='iso')
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=json_data)
    # Use the bucket that was written to; the URL was previously hard-coded to
    # the default bucket regardless of the `bucket` argument.
    # NOTE(review): the region is still fixed to eu-west-2, as in the original URL.
    return f"https://{bucket}.s3.eu-west-2.amazonaws.com/{key}"


# PISA Scores

In [6]:
# Read the manually downloaded PISA workbook and drop its first column.
# NOTE(review): skiprows=11 presumably skips the sheet's preamble - confirm against the file.
pisa_raw = pd.read_excel("/Users/finn/Documents/GitHub/GrowthDiagnostics/manual_data/pisa.xls", skiprows=11).iloc[:, 1:]

# Build the tidy date/series/value frame in one step instead of adding columns
# to the raw frame. Year/Study is propagated downwards before being parsed as
# a year; anything that fails to parse becomes missing and is dropped.
pisa_long = pd.DataFrame({
    "date": pd.to_datetime(pisa_raw["Year/Study"].ffill(), errors="coerce", format="%Y"),
    "series": pisa_raw["Jurisdiction"],
    "value": pd.to_numeric(pisa_raw["Average"], errors="coerce"),
}).dropna(subset=["date", "series", "value"])

df = (
    pisa_long
    # keep only series with at least 3 data points
    .groupby("series").filter(lambda grp: len(grp) >= 3)
    # keep just UK and "International Average (OECD)"
    .loc[lambda d: d["series"].isin(["United Kingdom", "International Average (OECD)"])]
    # Rename via assign so nothing is written onto a filtered slice (the old
    # `df.series = ...` assignment could trigger SettingWithCopyWarning).
    .assign(series=lambda d: d["series"].replace({"United Kingdom": "UK",
                                                  "International Average (OECD)": "OECD"}))
)
pisa_df = df.copy()

pisa_url = upload_series(df, "pisa")

prepare_spec(pisa_url, "human_capital_pisa", "PISA Scores", multi_series=True, parent_path=charts_path)


In [10]:
# Summarise the UK series only; the OECD average is kept for the chart, not the text.
uk_pisa = pisa_df.query("series=='UK'")
stats = get_stats(uk_pisa)
description = short_description(
    stats,
    "The UK average maths PISA Score",
    val_format=".0f",
)
print(description)
description_url = upload_description(description, "human_capital_pisa")


The UK average maths PISA Score is 489. It is down from Jan 2018 (502). The UK historical average (past 7 years) is 494.


In [11]:
# Publish only the UK rows, reduced to the date/value pair, and show the URL.
uk_rows = pisa_df["series"] == "UK"
dh_url = upload_to_datahub(pisa_df.loc[uk_rows, ["date", "value"]], "pisa")
dh_url

'https://eco-temp-cache.s3.eu-west-2.amazonaws.com/gbr/pisa.json'

# Quantity of Graduates

In [12]:
# Quantity with NVQ4+, whole UK, 16-64
url = "https://www.nomisweb.co.uk/api/v01/dataset/NM_17_1.data.csv?geography=2092957697&cell=403898630&measures=20100,20701"
nvq4_raw = pd.read_csv(url)

# Non-mutating pipeline: the previous version assigned a column onto the
# result of `query`, which could trigger SettingWithCopyWarning.
df = (
    nvq4_raw
    .assign(date=lambda d: pd.to_datetime(d["DATE_CODE"], format="%Y-%m"))
    # the request returns two measures; keep only the rows labelled 'Value'
    .query("MEASURES_NAME == 'Value'")
    .assign(value=lambda d: d["OBS_VALUE"])
    .loc[:, ["date", "value"]]
    .dropna(subset=["date", "value"])
)
grad_df = df.copy()

nvq4_url = upload_series(df, "nvq4")
prepare_spec(nvq4_url, "human_capital_nvq4", "Population with NVQ4+", parent_path=charts_path, yAxisExpr="format(datum.value, ',.0s')")

In [16]:
# stats = get_stats(grad_df)
# description = temporal_description(grad_df, 
#                                    "the UK population aged 16-64 with degree level qualifications", 
#                                    "",
#                                    val_format=",.2f",
#                                    val_suffix="M",
#                                    change_format=",.2f",
#                                    change_multiplier=1/1000000,
#                                    val_multiplier=1/1000000,
#                                    change_suffix="M",
#                                    date_format="%b %Y")
# description_url = upload_description(description, "human_capital_nvq4")
# print(description)

# Values are raw head counts, so scale them to millions for the text.
# NOTE(review): unlike the other indicators, this description is only printed and never uploaded - confirm that is intended.
stats = get_stats(grad_df)
description = short_description(
    stats,
    "The graduate population (16-64)",
    val_format=",.2f",
    val_multiplier=1/1000000,
    val_suffix="M",
)
print(description)

The graduate population (16-64) is 17.96M. It is up from Dec 2020 (17.75M). The UK historical average (past 9 years) is 15.71M.


In [17]:
# Publish the graduate series (date/value only) and show the resulting URL.
grad_series = grad_df.loc[:, ["date", "value"]]
dh_url = upload_to_datahub(grad_series, "nvq4")
dh_url

'https://eco-temp-cache.s3.eu-west-2.amazonaws.com/gbr/nvq4.json'

# A Levels

In [18]:
url = "https://www.nomisweb.co.uk/api/v01/dataset/NM_17_5.data.csv?geography=2092957697&variable=1911,720&measures=20599,21001,21002,21003"
nvq3_raw = pd.read_csv(url)

# Non-mutating pipeline: the previous version assigned columns onto the result
# of `query` (SettingWithCopyWarning) and ran the same dropna twice.
df = (
    nvq3_raw
    .assign(date=lambda d: pd.to_datetime(d["DATE_CODE"], format="%Y-%m"))
    # keep only the rows labelled 'Variable' among the requested measures
    .query("MEASURES_NAME == 'Variable'")
    .assign(value=lambda d: d["OBS_VALUE"])
    .dropna(subset=["date", "value"])
    .loc[:, ["date", "value"]]
    # NOTE(review): the URL requests two variables; the > 25 cut-off presumably
    # separates the wanted series from the other one - confirm.
    .query("value > 25")
    # OBS_VALUE is in percent; store it as a fraction for the '.0%' axis format
    .assign(value=lambda d: d["value"] / 100)
)
nvq3_df = df.copy()

nvq3_url = upload_series(df, "nvq3")
prepare_spec(nvq3_url, "human_capital_nvq3", "Population with NVQ3+, %", parent_path=charts_path, yAxisExpr="format(datum.value, '.0%')")



In [21]:
# stats = get_stats(nvq3_df)
# description = temporal_description(nvq3_df, 
#                                    "the percentage of 16-64 year olds with at least A-level qualifications, or equivalents, ", 
#                                     "",
#                                    val_format=".2%",
#                                    change_format=".3f",
#                                    change_multiplier=100,
#                                    change_units="pp",
#                                    date_format="%b %Y")
# description_url = upload_description(description, "human_capital_nvq3")
# print(description)

stats = get_stats(nvq3_df)
# Build the NVQ3 text in this cell. `description` was previously never assigned
# here, so a fresh Run All uploaded whatever an earlier cell had left in it
# under the NVQ3 key.
description = short_description(
    stats,
    "The percentage of 16-64 year olds with at least A-level qualifications, or equivalents",
    val_format=".2%",
)
description_url = upload_description(description, "human_capital_nvq3")
print(description)

The percentage of 16-64 year olds with at least A-level qualifications, or equivalents is 61.40%. It is up from Dec 2020 (61.20%). The UK historical average (past 9 years) is 56.91%.
The percentage of 16-64 year olds with at least A-level qualifications, or equivalents is 61.40%. It is up from Dec 2020 (61.20%). The UK historical average (past 9 years) is 56.91%.


In [None]:


# Scratch preview of the most recently built frame. `df` has already been
# reduced to date/value by this point, so re-deriving `value` from OBS_VALUE
# raised a KeyError on a clean run; show only the tail rather than the whole frame.
df = df[["date", "value"]]
df.tail()