In [1]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
def _strip_text(column):
    """Trim surrounding whitespace from object-dtype columns; pass others through."""
    return column.str.strip() if column.dtype == "object" else column


# Read the raw Eurostat extract, then trim the padding around both the
# header names and every text cell.
df = pd.read_csv('estat_educ_uoe_enrt02.csv')
df.columns = df.columns.str.strip()
df = df.apply(_strip_text)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16866 entries, 0 to 16865
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   freq     16866 non-null  object
 1   unit     16866 non-null  object
 2   age      16866 non-null  object
 3   sex      16866 non-null  object
 4   isced11  16866 non-null  object
 5   geo      16866 non-null  object
 6   2016     16866 non-null  object
 7   2017     16866 non-null  object
 8   2018     16866 non-null  object
 9   2019     16866 non-null  object
 10  2020     16866 non-null  object
 11  2021     16866 non-null  object
 12  2022     16866 non-null  object
dtypes: object(13)
memory usage: 1.7+ MB


In [3]:
# Load every codelist TSV into a dict keyed by the file name without its
# extension (e.g. "ESTAT_GEO").
path = '../ESTAT_CODELISTS/*.tsv'
# Sorted so the load order (and dict insertion order) is the same on every run.
files = sorted(glob.glob(path))

# Path(...).stem drops the directory using the platform's own separator and
# removes only the final ".tsv" suffix; splitting on "/" mis-keys the dict
# when glob returns backslash-separated paths.
dataframes = {Path(file).stem: pd.read_csv(file, sep='\t') for file in files}

In [4]:
# Translate each coded dimension column into its human-readable label using
# the matching codelist (looked up as ESTAT_<COLUMN NAME IN UPPER CASE>).
for col in ["freq", "unit", "sex", "age", "isced11", "geo"]:
    curr_codelist = dataframes[f"ESTAT_{col.upper()}"]
    code_to_label = curr_codelist.set_index('CODE')['Label']
    # Keep the raw code wherever the codelist has no label for it; a plain
    # .map() would silently turn such rows into NaN.
    df[col] = df[col].map(code_to_label).fillna(df[col])

df = df.rename(columns={'geo': 'country'})

df.head()

Unnamed: 0,freq,unit,age,sex,isced11,country,2016,2017,2018,2019,2020,2021,2022
0,Annual,Number,Total,Females,Short-cycle tertiary education,Albania,:,:,:,:,:,2185,2561
1,Annual,Number,Total,Females,Short-cycle tertiary education,Austria,41271,40562,40188,39272,38738,39989,40489
2,Annual,Number,Total,Females,Short-cycle tertiary education,Bosnia and Herzegovina,:,:,: z,: z,: z,: z,28
3,Annual,Number,Total,Females,Short-cycle tertiary education,Belgium,15351,14696,13850,13390,12400 bd,14159,14283
4,Annual,Number,Total,Females,Short-cycle tertiary education,Bulgaria,: z,: z,: z,: z,: z,: z,: z


## Replace the missing-value markers (":" and ": z") with NaN

In [5]:
# Both markers stand for "no value"; swap them for NaN in a single pass.
missing_markers = [": z", ":"]
df.replace(missing_markers, np.nan, inplace=True)

## Remove all the flags from the yearly values

In [6]:
# Year columns are the ones whose header is purely numeric ("2016", ...).
year_cols = [col for col in df.columns if col.isdigit()]

# Optional sign, integer part, optional decimal part. Anything around the
# number (flag letters such as "bd", stray spaces) is ignored, and cells with
# no number at all become NaN. Matching only r'(\d+)' would truncate "12.5"
# to 12 and lose a leading minus sign.
number_pattern = r'(-?\d+(?:\.\d+)?)'

for col in year_cols:
    # expand=False returns a Series, so the assignment is column-to-column.
    df[col] = (
        df[col]
        .astype(str)
        .str.extract(number_pattern, expand=False)
        .astype(float)
    )

In [7]:
# Non-year columns holding a single distinct value carry no information;
# collect them first, then remove them in one call.
constant_cols = [
    name
    for name in df.columns
    if not name.isdigit() and df[name].nunique() == 1
]
df.drop(columns=constant_cols, inplace=True)

df.rename(columns={'isced11': 'level'}, inplace=True)

In [8]:
# Preview the first rows after cleaning and column pruning.
df.head(n=5)

Unnamed: 0,age,sex,level,country,2016,2017,2018,2019,2020,2021,2022
0,Total,Females,Short-cycle tertiary education,Albania,,,,,,2185.0,2561.0
1,Total,Females,Short-cycle tertiary education,Austria,41271.0,40562.0,40188.0,39272.0,38738.0,39989.0,40489.0
2,Total,Females,Short-cycle tertiary education,Bosnia and Herzegovina,,,,,,,28.0
3,Total,Females,Short-cycle tertiary education,Belgium,15351.0,14696.0,13850.0,13390.0,12400.0,14159.0,14283.0
4,Total,Females,Short-cycle tertiary education,Bulgaria,,,,,,,


In [9]:
# NOTE(review): same preview as the previous cell, and `df` is not modified
# in between — this cell is a candidate for removal.
df.head()

Unnamed: 0,age,sex,level,country,2016,2017,2018,2019,2020,2021,2022
0,Total,Females,Short-cycle tertiary education,Albania,,,,,,2185.0,2561.0
1,Total,Females,Short-cycle tertiary education,Austria,41271.0,40562.0,40188.0,39272.0,38738.0,39989.0,40489.0
2,Total,Females,Short-cycle tertiary education,Bosnia and Herzegovina,,,,,,,28.0
3,Total,Females,Short-cycle tertiary education,Belgium,15351.0,14696.0,13850.0,13390.0,12400.0,14159.0,14283.0
4,Total,Females,Short-cycle tertiary education,Bulgaria,,,,,,,


In [10]:
# Use the English spelling for Türkiye in the country column.
country_aliases = {'Türkiye': 'Turkey'}
df['country'] = df['country'].replace(country_aliases)

In [11]:
# Drop the EU-28 aggregate rows and shorten the EU-27 label.
# .copy() makes the filtered frame independent of the original, so the
# column assignment below cannot trigger SettingWithCopyWarning.
df = df.loc[df["country"] != "European Union - 28 countries (2013-2020)"].copy()
df["country"] = df["country"].replace(
    'European Union - 27 countries (from 2020)', 'European Union')

In [12]:
# Group by country and sum the enrollments for 2022, but only for sex="Total"
df_totals = df[(df['sex'] == "Total") & (df['age'] == 'Total') & (df["level"] == 'Tertiary education (levels 5-8)')].groupby(
    'country')[['2022']].sum().reset_index()
df_totals = df_totals.rename(columns={'2022': 'enrollments'})
df_totals = df_totals.sort_values('enrollments', ascending=False)

# Calculate thresholds using quantiles (33% and 66%)
low_threshold = df_totals['enrollments'].quantile(0.33)
high_threshold = df_totals['enrollments'].quantile(0.66)

# Create categories
df_totals['enrollment_category'] = pd.cut(df_totals['enrollments'],
                                          bins=[-float('inf'), low_threshold,
                                                high_threshold, float('inf')],
                                          labels=['Low', 'Medium', 'High'])

# Display results with categories
df_totals[['country', 'enrollments', 'enrollment_category']
          ].sort_values('enrollments', ascending=False).reset_index(drop=True)

Unnamed: 0,country,enrollments,enrollment_category
0,European Union,17803433.0,High
1,Turkey,8296959.0,High
2,Germany,3362739.0,High
3,France,2883412.0,High
4,Spain,2309272.0,High
5,Italy,2145733.0,High
6,Poland,1365740.0,High
7,Greece,872828.0,High
8,Romania,554007.0,High
9,Belgium,549399.0,High


In [14]:
# Define new thresholds
# Hand-picked cut points that replace the quantile-based categories.
thresholds = [
    20000, 50000, 100000, 300000,
    500000, 1000000, 3000000, 8000000,
]

# One label per interval: len(thresholds) + 1 of them.
category_labels = [
    'Extremely Low', 'Very Low', 'Low', 'Medium Low', 'Medium',
    'Medium High', 'High', 'Very High', 'Extremely High',
]

# Open-ended first and last bins so every value falls into some interval.
bin_edges = [-float('inf'), *thresholds, float('inf')]

df_totals['enrollment_category'] = pd.cut(
    df_totals['enrollments'], bins=bin_edges, labels=category_labels)

# Show the re-categorised table, largest enrollments first.
(
    df_totals[['country', 'enrollments', 'enrollment_category']]
    .sort_values('enrollments', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,country,enrollments,enrollment_category
0,European Union,17803433.0,Extremely High
1,Turkey,8296959.0,Extremely High
2,Germany,3362739.0,Very High
3,France,2883412.0,High
4,Spain,2309272.0,High
5,Italy,2145733.0,High
6,Poland,1365740.0,High
7,Greece,872828.0,Medium High
8,Romania,554007.0,Medium High
9,Belgium,549399.0,Medium High


In [15]:
# Write the per-country totals and categories as a JSON array of records.
output_path = Path("clean/enrollmentQuantiles.json")
# Create the output folder on demand so a fresh checkout without `clean/`
# does not fail here.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_totals.to_json(output_path, orient='records')

In [16]:
# Echo the cut points used as bin edges for the enrollment categories; the
# `=` specifier prints the variable name alongside its value.
print(f"{thresholds=}")

thresholds=[20000, 50000, 100000, 300000, 500000, 1000000, 3000000, 8000000]


In [12]:
# Optional export of the cleaned table; uncomment to write it to disk.
# df.to_csv("./clean/estat_educ_uoe_enrt02.csv", index=False)