In [2]:
import altair as alt
import pandas as pd
import eco_style

# Activate the 'light' Altair theme for every chart built below.
# NOTE(review): 'light' is presumably registered by the `eco_style` import above — confirm.
alt.themes.enable('light')

ThemeRegistry.enable('light')

In [3]:
# Load the tertiary-education time series (one row per country and year)
# and display it as the cell output.
df = pd.read_csv('tertiary_educ_time_data.csv')
df

Unnamed: 0,Age,Sex,Education level,TIME_PERIOD,OBS_VALUE,COUNTRY,Country
0,From 25 to 64 years,Total,Tertiary education,2014,15.3,TUR,Turkiye
1,From 25 to 64 years,Total,Tertiary education,2008,10.9,TUR,Turkiye
2,From 25 to 64 years,Total,Tertiary education,2018,45.7,AUS,Australia
3,From 25 to 64 years,Total,Tertiary education,2017,45.4,AUS,Australia
4,From 25 to 64 years,Total,Tertiary education,2007,40.3,USA,United States
...,...,...,...,...,...,...,...
750,From 25 to 64 years,Total,Tertiary education,2019,20.5,TUR,Turkiye
751,From 25 to 64 years,Total,Tertiary education,2006,9.8,TUR,Turkiye
752,From 25 to 64 years,Total,Tertiary education,2018,19.3,TUR,Turkiye
753,From 25 to 64 years,Total,Tertiary education,2016,17.9,TUR,Turkiye


In [4]:
# Turn the year into a 'YYYY-01-01' date string so Altair can parse it as a
# temporal (':T') field.  Only the first four characters are kept before the
# suffix is appended, so re-running this cell does not stack '-01-01' twice.
df['TIME_PERIOD'] = df['TIME_PERIOD'].astype(str).str[:4] + '-01-01'
df.head()

Unnamed: 0,Age,Sex,Education level,TIME_PERIOD,OBS_VALUE,COUNTRY,Country
0,From 25 to 64 years,Total,Tertiary education,2014-01-01,15.3,TUR,Turkiye
1,From 25 to 64 years,Total,Tertiary education,2008-01-01,10.9,TUR,Turkiye
2,From 25 to 64 years,Total,Tertiary education,2018-01-01,45.7,AUS,Australia
3,From 25 to 64 years,Total,Tertiary education,2017-01-01,45.4,AUS,Australia
4,From 25 to 64 years,Total,Tertiary education,2007-01-01,40.3,USA,United States


In [5]:
# Country codes (COUNTRY column) of the countries kept for the line charts.
to_filter = ['GBR', 'FRA','DEU', 'USA', 'ITA', 'JPN', 'ESP', 'CAN']

# Build the boolean row mask first, then select the matching rows.
in_selection = df['COUNTRY'].isin(to_filter)
df2 = df.loc[in_selection]
df2


Unnamed: 0,Age,Sex,Education level,TIME_PERIOD,OBS_VALUE,COUNTRY,Country
4,From 25 to 64 years,Total,Tertiary education,2007-01-01,40.3,USA,United States
45,From 25 to 64 years,Total,Tertiary education,2017-01-01,56.0,CAN,Canada
46,From 25 to 64 years,Total,Tertiary education,2016-01-01,56.0,CAN,Canada
47,From 25 to 64 years,Total,Tertiary education,2020-01-01,60.0,CAN,Canada
48,From 25 to 64 years,Total,Tertiary education,2019-01-01,59.0,CAN,Canada
...,...,...,...,...,...,...,...
677,From 25 to 64 years,Total,Tertiary education,2000-01-01,21.6,FRA,France
678,From 25 to 64 years,Total,Tertiary education,2008-01-01,25.4,DEU,Germany
684,From 25 to 64 years,Total,Tertiary education,2013-01-01,16.4,ITA,Italy
697,From 25 to 64 years,Total,Tertiary education,2022-01-01,40.7,ESP,Spain


In [6]:
# Colour assignment for the country lines.  The palette lists ten colours for
# the eight countries in the domain.
country_order = ["United Kingdom", "France", "Italy", "Spain", "Germany", "Japan", "Canada", "United States"]
country_palette = ["#001f3f", "#4269d0", "#efb118", "#ff725c", "#6cc5b0", "#3ca951", "#ff8ab7", "#a463f2", "#97bbf5", "#9c6b4e"]

# Axis labels are rendered as '<value>%'.
percent_axis = alt.Axis(labelExpr="format(datum.value,',') + '%'")

# The United Kingdom line is drawn thicker and solid; all others thinner and dashed.
is_uk = alt.datum.Country == "United Kingdom"

chart = (
    alt.Chart(df2)
    .mark_line(interpolate='monotone')
    .encode(
        x=alt.X('TIME_PERIOD:T', title=''),
        y=alt.Y('OBS_VALUE:Q', title='', axis=percent_axis),
        color=alt.Color(
            'Country:N',
            legend=None,
            scale=alt.Scale(domain=country_order, range=country_palette),
        ),
        size=alt.condition(is_uk, alt.value(2), alt.value(1.5)),
        strokeDash=alt.condition(is_uk, alt.value([0, 0]), alt.value([5, 2])),
    )
    .properties(
        width=500,
        height=300,
        title={
            "text": "Educational attainment",
            "anchor": "start",
            "subtitle": ["Share of 25-64 year olds with tertiary education", "Source: OECD", ""],
            "subtitleColor": "#676A86",
        },
    )
)

# Country labels positioned at x = max(TIME_PERIOD) and y = OBS_VALUE at the
# row with the largest TIME_PERIOD; the Spain label is shifted down by 7.
text = (
    alt.Chart(df2)
    .mark_text(
        align='left',
        dx=5,
        dy={"expr": "datum.Country == 'Spain' ? 7 : 0"},
    )
    .encode(
        x=alt.X('TIME_PERIOD:T', aggregate='max'),
        y=alt.Y('OBS_VALUE:Q', aggregate={'argmax': 'TIME_PERIOD'}),
        text='Country:N',
        color='Country:N',
    )
)

# Layer the labels over the lines and display the result.
chart2 = chart + text
chart2

In [7]:
# Write chart2 to 'terteduc_oecd.png', passing scale_factor=2.0 to the exporter.
chart2.save('terteduc_oecd.png', scale_factor=2.0)

In [8]:
# Write chart2 to 'terteduc_oecd.json'.
chart2.save('terteduc_oecd.json')

In [9]:
# Variant of the country line chart that emphasises Canada and uses the
# default colour scale (no explicit domain/range).
percent_axis = alt.Axis(labelExpr="format(datum.value,',') + '%'")

# Canada is drawn thicker and solid; every other country thinner and dashed.
is_canada = alt.datum.Country == "Canada"

chart = (
    alt.Chart(df2)
    .mark_line(interpolate='monotone')
    .encode(
        x=alt.X('TIME_PERIOD:T', title=''),
        y=alt.Y('OBS_VALUE:Q', title='', axis=percent_axis),
        color=alt.Color('Country:N', legend=None),
        size=alt.condition(is_canada, alt.value(2), alt.value(1.5)),
        strokeDash=alt.condition(is_canada, alt.value([0, 0]), alt.value([5, 2])),
    )
    .properties(
        width=500,
        height=300,
        title={
            "text": "Tertiary education attainment",
            "anchor": "start",
            "subtitle": ["Share of 25-64 year olds with tertiary education", "Source: OECD", ""],
            "subtitleColor": "#676A86",
        },
    )
)

# Country labels positioned at x = max(TIME_PERIOD) and y = OBS_VALUE at the
# row with the largest TIME_PERIOD; the Spain label is shifted down by 7.
text = (
    alt.Chart(df2)
    .mark_text(
        align='left',
        dx=5,
        dy={"expr": "datum.Country == 'Spain' ? 7 : 0"},
    )
    .encode(
        x=alt.X('TIME_PERIOD:T', aggregate='max'),
        y=alt.Y('OBS_VALUE:Q', aggregate={'argmax': 'TIME_PERIOD'}),
        text='Country:N',
        color='Country:N',
    )
)

# Layer the labels over the lines and display the result.
chart20 = chart + text
chart20

In [10]:
# Write chart20 to 'terteduc_cotd.png', passing scale_factor=2.0 to the exporter.
chart20.save('terteduc_cotd.png', scale_factor=2.0)

In [11]:
# Load GDP per capita by country (columns: country, gdp_pc) and display it.
df0 = pd.read_csv('gdp_pc_data.csv')
df0

Unnamed: 0,country,gdp_pc
0,Albania,7956.559
1,Algeria,5323.635
2,Andorra,43784.571
3,Angola,2565.912
4,Antigua and Barbuda,19123.213
...,...,...
218,Latin America and the Caribbean,10291.151
219,Major advanced economies (G7),60221.173
220,Middle East and Central Asia,5582.4
221,Other advanced economies,49696.398


In [12]:
# Use the same key column name ('Country') as the education frame so the two
# can be merged on it.
df0 = df0.rename(columns={'country': 'Country'})
df0

Unnamed: 0,Country,gdp_pc
0,Albania,7956.559
1,Algeria,5323.635
2,Andorra,43784.571
3,Angola,2565.912
4,Antigua and Barbuda,19123.213
...,...,...
218,Latin America and the Caribbean,10291.151
219,Major advanced economies (G7),60221.173
220,Middle East and Central Asia,5582.4
221,Other advanced economies,49696.398


In [13]:
# Inner join: keep only countries present in both the GDP frame and the
# education frame, matched on the 'Country' name.
merged_df = df0.merge(df, how='inner', on='Country')
merged_df

Unnamed: 0,Country,gdp_pc,Age,Sex,Education level,TIME_PERIOD,OBS_VALUE,COUNTRY
0,Australia,65434.328,From 25 to 64 years,Total,Tertiary education,2018-01-01,45.7,AUS
1,Australia,65434.328,From 25 to 64 years,Total,Tertiary education,2017-01-01,45.4,AUS
2,Australia,65434.328,From 25 to 64 years,Total,Tertiary education,2021-01-01,48.7,AUS
3,Australia,65434.328,From 25 to 64 years,Total,Tertiary education,2020-01-01,49.3,AUS
4,Australia,65434.328,From 25 to 64 years,Total,Tertiary education,2019-01-01,47.1,AUS
...,...,...,...,...,...,...,...,...
727,United States,81632.253,From 25 to 64 years,Total,Tertiary education,2000-01-01,34.5,USA
728,United States,81632.253,From 25 to 64 years,Total,Tertiary education,2011-01-01,42.4,USA
729,United States,81632.253,From 25 to 64 years,Total,Tertiary education,2006-01-01,39.5,USA
730,United States,81632.253,From 25 to 64 years,Total,Tertiary education,2005-01-01,37.4,USA


In [14]:
# Keep only the 2021 observations (TIME_PERIOD holds 'YYYY-01-01' strings).
to_filter = ['2021-01-01']

is_selected_year = merged_df['TIME_PERIOD'].isin(to_filter)
merged_df2 = merged_df.loc[is_selected_year]
merged_df2

Unnamed: 0,Country,gdp_pc,Age,Sex,Education level,TIME_PERIOD,OBS_VALUE,COUNTRY
2,Australia,65434.328,From 25 to 64 years,Total,Tertiary education,2021-01-01,48.7,AUS
24,Austria,57081.047,From 25 to 64 years,Total,Tertiary education,2021-01-01,34.6,AUT
32,Belgium,53659.317,From 25 to 64 years,Total,Tertiary education,2021-01-01,44.9,BEL
72,Canada,53547.719,From 25 to 64 years,Total,Tertiary education,2021-01-01,62.0,CAN
117,Costa Rica,16390.222,From 25 to 64 years,Total,Tertiary education,2021-01-01,22.5,CRI
128,Denmark,68299.821,From 25 to 64 years,Total,Tertiary education,2021-01-01,41.9,DNK
152,Estonia,29838.985,From 25 to 64 years,Total,Tertiary education,2021-01-01,41.2,EST
180,Finland,54007.975,From 25 to 64 years,Total,Tertiary education,2021-01-01,42.3,FIN
200,France,46000.803,From 25 to 64 years,Total,Tertiary education,2021-01-01,40.7,FRA
237,Germany,52726.965,From 25 to 64 years,Total,Tertiary education,2021-01-01,32.1,DEU


In [15]:
# Subset of the 2021 rows for the countries that get a text label in the
# scatter plot.
to_filter = ['GBR', 'FRA','DEU', 'USA', 'ITA', 'JPN', 'ESP', 'CAN']

is_labelled_country = merged_df2['COUNTRY'].isin(to_filter)
merged_df3 = merged_df2.loc[is_labelled_country]
merged_df3

Unnamed: 0,Country,gdp_pc,Age,Sex,Education level,TIME_PERIOD,OBS_VALUE,COUNTRY
72,Canada,53547.719,From 25 to 64 years,Total,Tertiary education,2021-01-01,62.0,CAN
200,France,46000.803,From 25 to 64 years,Total,Tertiary education,2021-01-01,40.7,FRA
237,Germany,52726.965,From 25 to 64 years,Total,Tertiary education,2021-01-01,32.1,DEU
372,Italy,38325.839,From 25 to 64 years,Total,Tertiary education,2021-01-01,20.0,ITA
634,Spain,33071.349,From 25 to 64 years,Total,Tertiary education,2021-01-01,40.4,ESP
707,United Kingdom,49098.977,From 25 to 64 years,Total,Tertiary education,2021-01-01,50.1,GBR
716,United States,81632.253,From 25 to 64 years,Total,Tertiary education,2021-01-01,46.2,USA


In [16]:
# NOTE: abandoned first draft of the "Educational attainment and GDP per
# capita" scatter plot, kept entirely commented out.  The SyntaxError recorded
# in this cell's output comes from the malformed `color = alt.condition(...)`
# argument below (`alt.datum.Country == scale=alt.Scale(...)`); that argument
# is also missing its closing parenthesis and comma before `opacity=`.
# A working `chart3` is built in a later cell.
# chart = alt.Chart(merged_df2).mark_point(filled=True).encode(
#     alt.X('OBS_VALUE:Q', title='',  axis=alt.Axis(labelExpr="format(datum.value,',') + '%'")),
#     alt.Y('gdp_pc:Q', title='', axis=alt.Axis(labelExpr="'$' + format(datum.value,',')")
#     ),
#     tooltip=['Country:N', 'gdp_pc:Q'],
#     color = alt.condition(
#         alt.datum.Country == scale=alt.Scale(domain=["United Kingdom", "France", "Italy", "Spain", "Germany", "Japan", "Canada", "United States"], range=["#001f3f", "#4269d0", "#efb118", "#ff725c", "#6cc5b0", "#3ca951", "#ff8ab7", "#a463f2", "#97bbf5", "#9c6b4e"] 
#     )
#     opacity=alt.condition(
#         (alt.datum.Country == "United Kingdom") | (alt.datum.Country == "France") | (alt.datum.Country == "Spain") | (alt.datum.Country == "Canada") | (alt.datum.Country == "Germany") | (alt.datum.Country == "Italy") | (alt.datum.Country == "United States"),
#         alt.value(0.7),
#         alt.value(0.3)
#     )
# ).properties(
#     width=500,
#     height=400,
#     title={
#         "text": "Educational attainment and GDP per capita",
#         "anchor": "start",
#         "subtitle": ["Share of 25-64 year olds with tertiary education", "Source: OECD and IMF", ""],
#         "subtitleColor": "#676A86"})

# text = alt.Chart(merged_df3).mark_text( 
#     align='left',
#     dx=5,
#      dy= 2
# ).encode(
#     x=alt.X('OBS_VALUE:Q'),
#     y=alt.Y('gdp_pc:Q'),
#     text='Country:N',
#     color=alt.Color('Country:N', legend=None, scale=alt.Scale(domain=["United Kingdom", "France", "Italy", "Spain", "Germany", "Japan", "Canada", "United States"], range=["#001f3f", "#4269d0", "#efb118", "#ff725c", "#6cc5b0", "#3ca951", "#ff8ab7", "#a463f2", "#97bbf5", "#9c6b4e"]))
# )

# chart3 = chart + text
# chart3

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (2446376044.py, line 7)

In [18]:
import altair as alt

# Countries that receive their own colour, paired positionally with `colors`.
highlighted_countries = ["United Kingdom", "France", "Italy", "Spain", "Germany", "Japan", "Canada", "United States"]
colors = ["#001f3f", "#4269d0", "#efb118", "#ff725c", "#6cc5b0", "#3ca951", "#ff8ab7", "#a463f2"]

# Shared pieces: the per-country colour scale and the "is highlighted" test.
highlight_scale = alt.Scale(domain=highlighted_countries, range=colors)
is_highlighted = alt.FieldOneOfPredicate(field='Country', oneOf=highlighted_countries)

# Scatter of tertiary-education share (x) against GDP per capita (y) for the
# 2021 rows, restricted to gdp_pc >= 27000.  Highlighted countries get their
# own colour and higher opacity; all others are grey and fainter.
chart = (
    alt.Chart(merged_df2)
    .mark_point(filled=True)
    .transform_filter(alt.datum.gdp_pc >= 27000)
    .encode(
        alt.X('OBS_VALUE:Q', title='', axis=alt.Axis(labelExpr="format(datum.value,',') + '%'")),
        alt.Y(
            'gdp_pc:Q',
            title='',
            axis=alt.Axis(labelExpr="'$' + format(datum.value,',')"),
            scale=alt.Scale(domain= [20000, 130000]),
        ),
        tooltip=['Country:N', 'gdp_pc:Q'],
        color=alt.condition(
            is_highlighted,
            alt.Color('Country:N', scale=highlight_scale),
            alt.value("#676A86"),
        ),
        opacity=alt.condition(is_highlighted, alt.value(0.9), alt.value(0.5)),
    )
    .properties(
        width=500,
        height=400,
        title={
            "text": "Educational attainment and GDP per capita",
            "anchor": "start",
            "subtitle": ["Share of 25-64 year olds with tertiary education (2021), GDP pc (2023) ", "Source: OECD, IMF", ""],
            "subtitleColor": "#676A86",
        },
    )
)

# Name labels for the rows in merged_df3, coloured with the same scale.
text = (
    alt.Chart(merged_df3)
    .mark_text(align='left', dx=5, dy=2)
    .encode(
        x=alt.X('OBS_VALUE:Q'),
        y=alt.Y('gdp_pc:Q'),
        text='Country:N',
        color=alt.Color('Country:N', legend=None, scale=highlight_scale),
    )
)

# Layer the labels over the points and display the result.
chart3 = chart + text
chart3


In [None]:
# Write chart3 to 'terteduc_scatter.png', passing scale_factor=2.0 to the exporter.
chart3.save('terteduc_scatter.png', scale_factor=2.0)

In [None]:
# Write chart3 to 'terteduc_scatter.json'.
chart3.save("terteduc_scatter.json")

In [None]:
# Load the UK regional tertiary-education series
# (columns: Reference area, TIME_PERIOD, OBS_VALUE) and display it.
uk_df = pd.read_csv('uk_terteduc_region_data.csv')
uk_df

Unnamed: 0,Reference area,TIME_PERIOD,OBS_VALUE
0,United Kingdom,2000,28.5
1,Scotland,2000,30.1
2,North East England,2000,20.8
3,Wales,2000,26.4
4,Yorkshire and The Humber,2000,24.8
...,...,...,...
305,Northern Ireland,2023,42.9
306,South East England,2023,56.2
307,Greater London,2023,71.1
308,East Midlands,2023,45.7


In [None]:
# Call the region column 'Country' so the chart code written for the country
# frames (which encodes 'Country:N') can be reused for the UK regions.
uk_df = uk_df.rename(columns={'Reference area': 'Country'})

# Turn the year into a 'YYYY-01-01' date string for Altair's temporal (':T')
# encoding.  Only the first four characters are kept before the suffix is
# appended, so re-running this cell does not stack '-01-01' twice.
uk_df['TIME_PERIOD'] = uk_df['TIME_PERIOD'].astype(str).str[:4] + '-01-01'
uk_df.head(15)

Unnamed: 0,Country,TIME_PERIOD,OBS_VALUE
0,United Kingdom,2000-01-01,28.5
1,Scotland,2000-01-01,30.1
2,North East England,2000-01-01,20.8
3,Wales,2000-01-01,26.4
4,Yorkshire and The Humber,2000-01-01,24.8
5,East of England,2000-01-01,25.9
6,North West England,2000-01-01,26.0
7,South East England,2000-01-01,31.6
8,East Midlands,2000-01-01,24.6
9,South West England,2000-01-01,29.5


In [None]:
# English regions excluding Greater London; their min/max range is later drawn
# as the shaded band.  'South West England' was previously misspelled
# ('South West Englands'), which silently dropped that region from the band.
to_filter = ['North East England', 'South East England','East Midlands', 'South West England', 'Yorkshire and The Humber', 'East of England', 'North West England', 'West Midlands']

# Guard against misspelled region names: isin() would just ignore them.
unknown_regions = sorted(set(to_filter) - set(uk_df['Country']))
assert not unknown_regions, f"regions not found in uk_df: {unknown_regions}"

e_df= uk_df[uk_df['Country'].isin(to_filter)]
e_df

Unnamed: 0,Country,TIME_PERIOD,OBS_VALUE
2,North East England,2000-01-01,20.8
4,Yorkshire and The Humber,2000-01-01,24.8
5,East of England,2000-01-01,25.9
6,North West England,2000-01-01,26.0
7,South East England,2000-01-01,31.6
...,...,...,...
301,West Midlands,2023-01-01,47.4
303,Yorkshire and The Humber,2023-01-01,47.5
304,East of England,2023-01-01,50.3
306,South East England,2023-01-01,56.2


In [None]:
# Per-year lowest and highest OBS_VALUE across the rows of e_df; these become
# the lower and upper edges of the shaded band.
obs_by_year = e_df.groupby(['TIME_PERIOD'], as_index=False)['OBS_VALUE']

df_min = obs_by_year.min().rename(columns={'OBS_VALUE': 'min'})
df_max = obs_by_year.max().rename(columns={'OBS_VALUE': 'max'})

# One row per TIME_PERIOD with both bounds side by side.
e_df2 = df_min.merge(df_max, on='TIME_PERIOD')
e_df2

Unnamed: 0,TIME_PERIOD,min,max
0,2000-01-01,20.8,31.6
1,2001-01-01,22.4,31.7
2,2002-01-01,23.3,34.2
3,2003-01-01,23.8,31.8
4,2004-01-01,23.6,32.6
5,2005-01-01,23.9,33.4
6,2006-01-01,25.6,33.6
7,2007-01-01,26.0,34.6
8,2008-01-01,25.2,34.6
9,2009-01-01,26.4,36.4


In [None]:
 dy={
            "expr": "datum.Country == 'Northern Ireland' ? 3 : datum.Country == 'East of England' ? -7 : datum.Country == 'South East England' ? 7 : datum.Country == 'North East England' ? 7 : datum.Country == 'East Midlands' ? 7 : datum.Country == 'South West England' ? -15 : datum.Country == 'North West England' ? -7 : 0"
          }

In [None]:
# Series drawn as individual lines in the UK chart.
to_filter = ['Greater London', 'Scotland','Northern Ireland', 'Wales', 'United Kingdom']

is_line_series = uk_df['Country'].isin(to_filter)
uk_df10 = uk_df.loc[is_line_series]
uk_df10

Unnamed: 0,Country,TIME_PERIOD,OBS_VALUE
0,United Kingdom,2000-01-01,28.5
1,Scotland,2000-01-01,30.1
3,Wales,2000-01-01,26.4
11,Northern Ireland,2000-01-01,21.8
12,Greater London,2000-01-01,39.8
...,...,...,...
295,Greater London,2022-01-01,69.5
299,Scotland,2023-01-01,56.1
302,Wales,2023-01-01,46.1
305,Northern Ireland,2023-01-01,42.9


In [None]:
# Line chart of the series in uk_df10.  The United Kingdom line is drawn
# thicker and solid; the other series thinner and dashed.
# The y domain tops out at 75 rather than 70: the Greater London series reaches
# 71.1, and with an explicit [20, 70] domain its line and end label were drawn
# past the top of the axis.
chart = alt.Chart(uk_df10).mark_line(interpolate='monotone').encode(
    x=alt.X('TIME_PERIOD:T', title=''),
    y=alt.Y('OBS_VALUE:Q', title='', axis=alt.Axis(labelExpr="format(datum.value,',') + '%'"), scale=alt.Scale(domain=[20, 75])),
    color=alt.Color('Country:N', legend=None),
    size=alt.condition(
        alt.datum.Country == "United Kingdom",
        alt.value(2),
        alt.value(1.5)
    ),
    strokeDash=alt.condition(
        alt.datum.Country == "United Kingdom",
        alt.value([0, 0]),
        alt.value([5, 2])
    ),
    tooltip=['Country:N']
).properties(
    width=500,
    height=300,
    title={
        "text": "Educational attainment: United Kingdom",
        "anchor": "start",
        "subtitle": ["Share of 25-64 year olds with tertiary education", "England regions excluding London shaded", "Source: OECD", ""],
        "subtitleColor": "#676A86"})

# Shaded band between the per-year min and max held in e_df2.
area = alt.Chart(e_df2).mark_area(
    color='rgba(0,0,0,0.08)'
).encode(
    x=alt.X('TIME_PERIOD:T'),
    y=alt.Y('min:Q'),
    y2=alt.Y2('max:Q')
)

# Series labels positioned at x = max(TIME_PERIOD) and y = OBS_VALUE at the
# row with the largest TIME_PERIOD.
text = alt.Chart(uk_df10).mark_text(
    align='left',
    dx=2,
    dy=2
).encode(
    alt.X('TIME_PERIOD:T', aggregate='max'),
    alt.Y('OBS_VALUE:Q', aggregate={'argmax': 'TIME_PERIOD'}),
    text='Country:N',
    color='Country:N'
)

# Layer lines, labels and band (same layer order as before) and display.
chart4 = chart + text + area
chart4



In [None]:
# Write chart4 to 'UK_terteduc.png', passing scale_factor=2.0 to the exporter.
chart4.save('UK_terteduc.png', scale_factor=2.0)

In [None]:
# Write chart4 to 'UK_terteduc.json'.
chart4.save('UK_terteduc.json')

In [None]:
# Wide table: one row per region, one column per year.
#
# An earlier cell renames 'Reference area' to 'Country' and rewrites
# TIME_PERIOD as 'YYYY-01-01' strings, so pivoting uk_df directly on
# 'Reference area' raises KeyError on a fresh top-to-bottom run.  Normalise a
# copy first so this cell works whether or not that cell has already run:
# rename() ignores a missing 'Country' column, and the first four characters
# of TIME_PERIOD are the year in either representation.
uk_long = uk_df.rename(columns={'Country': 'Reference area'})
uk_long = uk_long.assign(
    TIME_PERIOD=uk_long['TIME_PERIOD'].astype(str).str[:4].astype(int)
)

# Pivot the DataFrame
uk_df2 = uk_long.pivot(index=['Reference area'], columns='TIME_PERIOD', values='OBS_VALUE').reset_index()

# Display the result
uk_df2

TIME_PERIOD,Reference area,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,East Midlands,24.6,23.8,24.9,24.2,26.9,26.5,28.2,28.1,27.8,...,34.6,35.9,36.0,36.4,37.6,40.4,44.0,42.6,43.7,45.7
1,East of England,25.9,27.1,27.1,26.3,27.7,27.7,27.6,29.6,28.6,...,37.8,37.1,38.7,39.3,39.2,43.3,44.8,45.2,47.4,50.3
2,Greater London,39.8,41.3,41.7,35.1,36.0,37.7,39.9,41.2,41.6,...,53.7,55.1,57.0,56.5,57.5,64.7,68.5,69.8,69.5,71.1
3,North East England,20.8,22.4,23.3,23.8,23.6,23.9,25.6,26.0,25.2,...,31.4,35.0,35.4,34.7,33.6,35.3,38.3,39.7,38.9,42.3
4,North West England,26.0,25.2,25.8,25.2,26.9,27.4,27.9,29.2,29.1,...,36.1,37.2,38.8,38.5,38.8,42.1,43.7,44.0,47.0,47.5
5,Northern Ireland,21.8,22.8,25.5,24.6,26.9,26.6,27.7,29.4,30.4,...,34.0,33.6,34.4,35.6,37.2,41.7,45.1,48.8,41.6,42.9
6,Scotland,30.1,31.4,32.7,32.2,33.2,33.4,34.6,36.1,36.3,...,46.5,47.2,47.8,47.5,47.4,52.9,55.2,55.2,57.6,56.1
7,South East England,31.6,31.7,34.2,31.8,32.6,33.4,33.6,34.6,34.6,...,45.0,44.7,45.2,46.5,46.9,51.5,51.9,51.6,54.2,56.2
8,South West England,29.5,29.4,30.5,29.1,29.0,30.1,31.8,32.7,32.0,...,40.8,41.8,42.3,44.0,43.3,45.4,47.4,48.0,48.5,50.3
9,United Kingdom,28.5,28.7,29.8,28.3,29.4,29.8,30.8,32.0,32.0,...,40.6,41.6,42.3,42.7,43.2,44.7,49.4,50.1,,


In [58]:
# Load the regional tertiary-education data (one row per region REF_AREA and
# year) and display it.
reg_df = pd.read_csv('oecd_terteduc_region_data.csv')
reg_df

Unnamed: 0,Territorial level,REF_AREA,Reference_area,TIME_PERIOD,OBS_VALUE,COUNTRY,Country
0,TL2,US31,Nebraska,2022,49.9,USA,United States
1,TL2,US31,Nebraska,2021,49.7,USA,United States
2,TL2,CA11,Prince Edward Island,2020,59.0,CAN,Canada
3,TL2,CA11,Prince Edward Island,2019,56.0,CAN,Canada
4,TL2,US12,Florida,2019,42.8,USA,United States
...,...,...,...,...,...,...,...
553,TL2,US28,Mississippi,2021,37.4,USA,United States
554,TL2,US40,Oklahoma,2022,38.5,USA,United States
555,TL2,US40,Oklahoma,2021,37.7,USA,United States
556,TL2,US53,Washington,2022,51.0,USA,United States


In [59]:
# Load regional GDP per capita (columns: REF_AREA, TIME_PERIOD, gdppc) and
# display it.
gdpreg_df = pd.read_csv('oecd_gdppc_tl2reg_data2.csv')
gdpreg_df

Unnamed: 0,REF_AREA,TIME_PERIOD,gdppc
0,US04,2020,53772.2
1,US04,2019,51594.9
2,CA11,2020,39001.5
3,CA11,2019,39213.1
4,US24,2022,77890.0
...,...,...,...
666,US38,2021,81246.9
667,US38,2020,70998.2
668,US38,2019,79411.3
669,US04,2022,64577.0


In [60]:
# Inner join of regional GDP and regional education on region code and year;
# region/year pairs missing from either frame are dropped.
reg_df2 = gdpreg_df.merge(reg_df, how='inner', on=['REF_AREA', 'TIME_PERIOD'])
reg_df2

Unnamed: 0,REF_AREA,TIME_PERIOD,gdppc,Territorial level,Reference_area,OBS_VALUE,COUNTRY,Country
0,US04,2019,51594.9,TL2,Arizona,39.4,USA,United States
1,CA11,2020,39001.5,TL2,Prince Edward Island,59.0,CAN,Canada
2,CA11,2019,39213.1,TL2,Prince Edward Island,56.0,CAN,Canada
3,US24,2022,77890.0,TL2,Maryland,53.0,USA,United States
4,CA13,2022,47203.1,TL2,New Brunswick,60.0,CAN,Canada
...,...,...,...,...,...,...,...,...
474,US38,2022,93272.8,TL2,North Dakota,50.3,USA,United States
475,US38,2021,81246.9,TL2,North Dakota,51.9,USA,United States
476,US38,2019,79411.3,TL2,North Dakota,47.5,USA,United States
477,US04,2022,64577.0,TL2,Arizona,43.3,USA,United States


In [61]:
# Turn the year into a 'YYYY-01-01' date string (the next cell filters on
# '2021-01-01').  Only the first four characters are kept before the suffix is
# appended, so re-running this cell does not stack '-01-01' twice.
reg_df2['TIME_PERIOD'] = reg_df2['TIME_PERIOD'].astype(str).str[:4] + '-01-01'
reg_df2

Unnamed: 0,REF_AREA,TIME_PERIOD,gdppc,Territorial level,Reference_area,OBS_VALUE,COUNTRY,Country
0,US04,2019-01-01,51594.9,TL2,Arizona,39.4,USA,United States
1,CA11,2020-01-01,39001.5,TL2,Prince Edward Island,59.0,CAN,Canada
2,CA11,2019-01-01,39213.1,TL2,Prince Edward Island,56.0,CAN,Canada
3,US24,2022-01-01,77890.0,TL2,Maryland,53.0,USA,United States
4,CA13,2022-01-01,47203.1,TL2,New Brunswick,60.0,CAN,Canada
...,...,...,...,...,...,...,...,...
474,US38,2022-01-01,93272.8,TL2,North Dakota,50.3,USA,United States
475,US38,2021-01-01,81246.9,TL2,North Dakota,51.9,USA,United States
476,US38,2019-01-01,79411.3,TL2,North Dakota,47.5,USA,United States
477,US04,2022-01-01,64577.0,TL2,Arizona,43.3,USA,United States


In [62]:
# Keep only the 2021 regional observations.
to_filter = ['2021-01-01']

is_selected_year = reg_df2['TIME_PERIOD'].isin(to_filter)
reg_df3 = reg_df2.loc[is_selected_year]
reg_df3

Unnamed: 0,REF_AREA,TIME_PERIOD,gdppc,Territorial level,Reference_area,OBS_VALUE,COUNTRY,Country
6,CA60,2021-01-01,70834.5,TL2,Yukon,59.0,CAN,Canada
10,CA12,2021-01-01,43043.2,TL2,Nova Scotia,61.0,CAN,Canada
13,FRC,2021-01-01,42517.4,TL2,Bourgogne-Franche-Comté,34.5,FRA,France
14,CA13,2021-01-01,44396.7,TL2,New Brunswick,59.0,CAN,Canada
15,CA47,2021-01-01,64277.6,TL2,Saskatchewan,50.0,CAN,Canada
...,...,...,...,...,...,...,...,...
466,US21,2021-01-01,52783.3,TL2,Kentucky,39.5,USA,United States
469,US53,2021-01-01,88954.1,TL2,Washington,50.8,USA,United States
472,US19,2021-01-01,69050.0,TL2,Iowa,46.5,USA,United States
475,US38,2021-01-01,81246.9,TL2,North Dakota,51.9,USA,United States


In [63]:
# UK rows of the 2021 regional frame; these are the points that get labels.
to_filter2 = ['GBR']
is_uk_region = reg_df3['COUNTRY'].isin(to_filter2)
regUK_df = reg_df3.loc[is_uk_region]
regUK_df

Unnamed: 0,REF_AREA,TIME_PERIOD,gdppc,Territorial level,Reference_area,OBS_VALUE,COUNTRY,Country
266,UKD,2021-01-01,46982.2,TL2,North West England,44.0,GBR,United Kingdom
270,UKG,2021-01-01,43469.6,TL2,West Midlands,44.2,GBR,United Kingdom
274,UKK,2021-01-01,47790.1,TL2,South West England,48.0,GBR,United Kingdom
278,UKJ,2021-01-01,56912.6,TL2,South East England,51.6,GBR,United Kingdom
282,UKF,2021-01-01,43249.9,TL2,East Midlands,42.6,GBR,United Kingdom
286,UKM,2021-01-01,48436.5,TL2,Scotland,55.2,GBR,United Kingdom
290,UKN,2021-01-01,42653.5,TL2,Northern Ireland,48.8,GBR,United Kingdom
292,UKH,2021-01-01,47766.0,TL2,East of England,45.2,GBR,United Kingdom
296,UKC,2021-01-01,38327.1,TL2,North East England,39.7,GBR,United Kingdom
300,UKL,2021-01-01,39175.0,TL2,Wales,44.3,GBR,United Kingdom


In [83]:


# Regional scatter for 2021: tertiary-education share (x) against regional GDP
# per capita (y), coloured by country, with a horizontal legend on top.
# Regions with gdppc above 120000 are filtered out.
country_legend = alt.Legend(
    orient='top',
    direction='horizontal',
    title='',
    padding = 10,
)

chart = (
    alt.Chart(reg_df3)
    .mark_point(filled=True)
    .transform_filter(alt.datum.gdppc <= 120000)
    .encode(
        alt.X('OBS_VALUE:Q', title='', axis=alt.Axis(labelExpr="format(datum.value,',') + '%'")),
        alt.Y(
            'gdppc:Q',
            title='',
            axis=alt.Axis(labelExpr="'$' + format(datum.value,',')"),
            scale=alt.Scale(domain= [20000, 120000]),
        ),
        tooltip=['Reference_area:N', 'gdppc:Q', 'COUNTRY:N'],
        color=alt.Color('Country:N', legend=country_legend),
    )
    .properties(
        width=500,
        height=600,
        title={
            "text": "Tertiary Education by GDP per capita 2021",
            "anchor": "start",
            "subtitle": ["Share of 25-64 year olds with tertiary education", "Regional GDP per capita USD PPP", "Source: OECD", ""],
            "subtitleColor": "#676A86",
        },
    )
)

# Black region-name labels for the UK rows only.  Labels sit to the right of
# the point when OBS_VALUE <= 46.5 and to the left otherwise; a few regions
# get an individual vertical offset.
label_align = {"expr": "datum.OBS_VALUE <= 46.5 ? 'left' : 'right'"}
label_dx = {"expr": "datum.OBS_VALUE <= 46.5 ? 5 : -5"}
label_dy = {
    "expr": "datum.Reference_area == 'North East England' ? 7 : datum.Reference_area == 'Northern Ireland' ? 7 : datum.Reference_area == 'Scotland' ? -5 : datum.Reference_area == 'North West England' ? 4 : 0"
}

text = (
    alt.Chart(regUK_df)
    .mark_text(
        color = 'black',
        opacity = 0.8,
        align=label_align,
        dx=label_dx,
        dy=label_dy,
    )
    .encode(
        x=alt.X('OBS_VALUE:Q'),
        y=alt.Y('gdppc:Q'),
        text='Reference_area:N',
    )
)

# Layer the labels over the points and display the result.
chart5 = chart + text
chart5


In [84]:
# Write chart5 to 'oecdreg_terteduc.png', passing scale_factor=2.0 to the exporter.
chart5.save('oecdreg_terteduc.png', scale_factor=2.0)

In [85]:
# Write chart5 to 'oecdreg_terteduc.json'.
chart5.save('oecdreg_terteduc.json')