In [1]:
import pandas as pd 
import numpy as np

import warnings
# NOTE(review): with no category/module argument this filter ignores every
# warning raised from this point on; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the population-by-education-level table; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="population_by_education_level.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,unit,sex,age,isced11,geography,date,value
0,THS,F,Y15-19,ED0-2,AT,2020,149.8
1,THS,F,Y15-19,ED0-2,BE,2020,247.9
2,THS,F,Y15-19,ED0-2,BG,2020,128.4
3,THS,F,Y15-19,ED0-2,CH,2020,168.2
4,THS,F,Y15-19,ED0-2,CY,2020,16.8


In [4]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,unit,sex,age,isced11,geography,date,value
609971,THS,T,Y65-69,TOTAL,SE,1983,
609972,THS,T,Y65-69,TOTAL,SI,1983,
609973,THS,T,Y65-69,TOTAL,SK,1983,
609974,THS,T,Y65-69,TOTAL,TR,1983,
609975,THS,T,Y65-69,TOTAL,UK,1983,2532.5


In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(609976, 7)

In [6]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609976 entries, 0 to 609975
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   unit       609976 non-null  object 
 1   sex        609976 non-null  object 
 2   age        609976 non-null  object 
 3   isced11    609976 non-null  object 
 4   geography  609976 non-null  object 
 5   date       609976 non-null  int64  
 6   value      361370 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 32.6+ MB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,date,value
count,609976.0,361370.0
mean,2001.5,4437.056919
std,10.965865,16716.155159
min,1983.0,0.4
25%,1992.0,84.5
50%,2001.5,400.05
75%,2011.0,1947.575
max,2020.0,380593.3


In [8]:
# Number of missing entries in each column.
df.isna().sum()

unit              0
sex               0
age               0
isced11           0
geography         0
date              0
value        248606
dtype: int64

## Exploração Inicial dos Dados

Existem muitos registros com o valor "T" na variável `sex`, que representa o total de ambos os sexos (M + F). Essa é a interpretação mais comum em bases do Eurostat.

In [9]:
# Row count for each distinct value of the 'sex' column.
df['sex'].value_counts()

sex
T    203680
M    203376
F    202920
Name: count, dtype: int64

Existem valores "TOTAL" na variável `isced11`, que representam a soma de todas as faixas educacionais segundo a International Standard Classification of Education (ISCED 2011).
Os valores "NRP" significam "No response" (sem resposta): indicam que, para aquele registro, não houve uma resposta válida sobre o nível educacional. Pode ser considerado um dado faltante.

In [10]:
# Row count for each distinct value of the 'isced11' column.
df['isced11'].value_counts()

isced11
ED0-2    128934
ED3_4    128934
TOTAL    128934
ED5-8    128402
NRP       94772
Name: count, dtype: int64

Existem valores 'EU15', 'EA19', 'EU27_2020' e 'EU28', que representam dados agregados de países da União Europeia ou da Zona do Euro.

In [11]:
# Row count for each distinct value of the 'geography' column.
df['geography'].value_counts()

geography
IE           16530
EU28         16530
SI           16530
SE           16530
PT           16530
NO           16530
NL           16530
LU           16530
IT           16530
IS           16530
BE           16530
HU           16530
FR           16530
FI           16530
UK           16530
EU27_2020    16530
EU15         16530
CH           16530
ES           16530
EL           16530
CZ           16530
EA19         16530
DE           16530
DK           16530
LV           16492
AT           16340
EE           16112
HR           15960
MT           14896
LT           13908
TR           13680
RO           13642
MK           13224
CY           13224
BG           13224
SK           13224
ME           13110
PL           13110
RS           13110
Name: count, dtype: int64

### Dados "T" de sexo são apagados

In [12]:
# Keep only the rows whose 'sex' value differs from 'T', then list the
# values that remain in the column as a sanity check.
df = df.loc[df['sex'].ne('T')]
print(df['sex'].unique())

['F' 'M']


### Dados "TOTAL" de isced11 são apagados

In [13]:
# Keep only the rows whose 'isced11' value differs from 'TOTAL', then list
# the values that remain in the column as a sanity check.
df = df.loc[df['isced11'].ne('TOTAL')]
print(df['isced11'].unique())

['ED0-2' 'ED3_4' 'ED5-8' 'NRP']


### Os valores agregados da União Europeia e da Zona do Euro são apagados

In [14]:
# Remove the rows whose 'geography' is one of the four listed aggregate codes,
# then list the geography values that remain as a sanity check.
df = df.loc[
    ~df['geography'].isin(['EU15', 'EA19', 'EU27_2020', 'EU28'])
]
print(df['geography'].unique())

['AT' 'BE' 'BG' 'CH' 'CY' 'CZ' 'DE' 'DK' 'EE' 'EL' 'ES' 'FI' 'FR' 'HR'
 'HU' 'IE' 'IS' 'IT' 'LT' 'LU' 'LV' 'ME' 'MK' 'MT' 'NL' 'NO' 'PL' 'PT'
 'RO' 'RS' 'SE' 'SI' 'SK' 'TR' 'UK']


### Dados isced11 = "NRP" & value = NaN — remoção desativada (o código da célula abaixo está comentado)

In [15]:
# NOTE(review): this step is currently disabled — both statements below are
# commented out, so this cell removes nothing and leaves `df` unchanged.
# If re-enabled, the first line would drop the rows where isced11 == 'NRP' and
# value is null; the second would print any such rows still present.
#df = df[~((df['isced11'] == 'NRP') & (df['value'].isnull()))]
#print(df[(df['isced11'] == 'NRP') & (df['value'].isnull())])

In [16]:
# Re-inspect row count, dtypes and non-null counts after the filters applied above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 285076 entries, 0 to 604576
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   unit       285076 non-null  object 
 1   sex        285076 non-null  object 
 2   age        285076 non-null  object 
 3   isced11    285076 non-null  object 
 4   geography  285076 non-null  object 
 5   date       285076 non-null  int64  
 6   value      160846 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 17.4+ MB


In [17]:
# Row count for each distinct value of the 'age' column.
df['age'].value_counts()

age
Y15-39    10032
Y15-59    10032
Y15-64    10032
Y15-74    10032
Y15-24    10032
Y20-64     9918
Y25-74     9880
Y25-49     9880
Y25-64     9880
Y25-59     9880
Y25-54     9880
Y25-39     9880
Y25-29     9842
Y20-24     9804
Y50-74     9804
Y35-39     9804
Y40-44     9804
Y40-59     9804
Y40-64     9804
Y45-49     9804
Y50-59     9766
Y50-64     9766
Y65-69     9766
Y50-54     9766
Y30-34     9728
Y55-64     9690
Y55-59     9652
Y60-64     9652
Y15-19     9462
Name: count, dtype: int64

In [18]:
# Row count for each distinct value of the 'date' column.
df['date'].value_counts()

date
2020    7502
1992    7502
1999    7502
1998    7502
1997    7502
1996    7502
1995    7502
1994    7502
1993    7502
1991    7502
2019    7502
1990    7502
1989    7502
1988    7502
1987    7502
1986    7502
1985    7502
1984    7502
2000    7502
2001    7502
2002    7502
2003    7502
2018    7502
2017    7502
2016    7502
2015    7502
2014    7502
2013    7502
2012    7502
2011    7502
2010    7502
2009    7502
2008    7502
2007    7502
2006    7502
2005    7502
2004    7502
1983    7502
Name: count, dtype: int64