In [1]:
import pandas as pd 
import numpy as np

import warnings
# Suppress every warning from here on: the 'ignore' action is installed with no category or module filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the population-by-education-level table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="population_by_education_level.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,unit,sex,age,isced11,geography,date,value
0,THS,F,Y15-19,ED0-2,AT,2020,149.8
1,THS,F,Y15-19,ED0-2,BE,2020,247.9
2,THS,F,Y15-19,ED0-2,BG,2020,128.4
3,THS,F,Y15-19,ED0-2,CH,2020,168.2
4,THS,F,Y15-19,ED0-2,CY,2020,16.8


In [4]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,unit,sex,age,isced11,geography,date,value
609971,THS,T,Y65-69,TOTAL,SE,1983,
609972,THS,T,Y65-69,TOTAL,SI,1983,
609973,THS,T,Y65-69,TOTAL,SK,1983,
609974,THS,T,Y65-69,TOTAL,TR,1983,
609975,THS,T,Y65-69,TOTAL,UK,1983,2532.5


In [5]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(609976, 7)

In [6]:
# Column names, dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609976 entries, 0 to 609975
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   unit       609976 non-null  object 
 1   sex        609976 non-null  object 
 2   age        609976 non-null  object 
 3   isced11    609976 non-null  object 
 4   geography  609976 non-null  object 
 5   date       609976 non-null  int64  
 6   value      361370 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 32.6+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,date,value
count,609976.0,361370.0
mean,2001.5,4437.056919
std,10.965865,16716.155159
min,1983.0,0.4
25%,1992.0,84.5
50%,2001.5,400.05
75%,2011.0,1947.575
max,2020.0,380593.3


In [8]:
# Number of missing entries in each column.
df.isna().sum()

unit              0
sex               0
age               0
isced11           0
geography         0
date              0
value        248606
dtype: int64

## Exploração Inicial dos Dados

Existem muitos registros com o valor "T" na variável sex, que representa a soma dos dados de ambos os sexos (M + F). Essa é a explicação mais comum em bases do Eurostat.

In [None]:
# Number of rows for each code in the sex column.
df.loc[:, 'sex'].value_counts()

Existem valores "TOTAL" na variável isced11, que representam a soma de todas as faixas educacionais, segundo a International Standard Classification of Education (ISCED11).
Os valores "NRP" representam "No Response Provided". Isso indica que, para aquele registro, não houve uma resposta válida sobre o nível educacional; pode ser um dado faltante.

In [12]:
# Number of rows for each education-level code in the isced11 column.
df.loc[:, 'isced11'].value_counts()

isced11
ED0-2    128934
ED3_4    128934
TOTAL    128934
ED5-8    128402
NRP       94772
Name: count, dtype: int64

### Dados "T" de sexo são apagados

In [25]:
# Drop the rows whose sex code is "T" and keep every other row.
df = df.loc[df['sex'].ne('T')]
# Show which sex codes remain after the filter.
print(df['sex'].unique())

### Dados "TOTAL" de isced11 são apagados

In [28]:
# Drop the rows whose education-level code is "TOTAL" and keep every other row.
df = df[~df['isced11'].isin(['TOTAL'])]
# Show which education-level codes remain after the filter.
print(df['isced11'].unique())

['ED0-2' 'ED3_4' 'ED5-8' 'NRP']


### Dados isced11 = "NRP" & value = NaN são apagados

In [30]:
# Drop the rows whose education level is "NRP" and whose value is missing.
# Rows with a missing value at any other education level are not touched by this filter.
nrp_without_value = (df['isced11'] == 'NRP') & df['value'].isna()
df = df[~nrp_without_value]

# Verify the result with an assertion instead of eyeballing a printed empty frame,
# so the cell fails loudly if the filter above is ever edited incorrectly.
leftover = int(((df['isced11'] == 'NRP') & df['value'].isna()).sum())
assert leftover == 0, f"{leftover} NRP rows without a value are still present"

# One-line summary of what the filter did.
print(f"Removed {int(nrp_without_value.sum())} NRP rows without a value; {len(df)} rows remain")

In [32]:
# Re-inspect the row count, dtypes and non-null counts after the filters above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 279808 entries, 0 to 604576
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   unit       279808 non-null  object 
 1   sex        279808 non-null  object 
 2   age        279808 non-null  object 
 3   isced11    279808 non-null  object 
 4   geography  279808 non-null  object 
 5   date       279808 non-null  int64  
 6   value      180028 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 17.1+ MB


In [33]:
# Number of remaining rows for each age-band code.
df.loc[:, 'age'].value_counts()

age
Y15-74    9863
Y15-64    9846
Y15-59    9842
Y15-39    9803
Y25-74    9776
Y20-64    9766
Y25-64    9748
Y25-59    9742
Y25-54    9735
Y15-24    9727
Y25-49    9719
Y40-64    9705
Y50-74    9700
Y40-59    9695
Y25-39    9684
Y50-64    9656
Y50-59    9626
Y55-64    9616
Y25-29    9579
Y20-24    9577
Y35-39    9565
Y30-34    9563
Y40-44    9549
Y45-49    9540
Y50-54    9533
Y55-59    9497
Y65-69    9479
Y60-64    9458
Y15-19    9219
Name: count, dtype: int64

In [34]:
# Number of remaining rows for each geography code.
df.loc[:, 'geography'].value_counts()

geography
UK           8577
NL           8572
DE           8497
IE           8348
EU15         8062
DK           8020
LU           7891
EA19         7783
EU27_2020    7693
FR           7683
EU28         7656
SE           7592
IT           7308
CH           7301
NO           7276
ES           7236
EL           7134
BE           7134
PT           7019
IS           6999
CZ           6980
SI           6705
HU           6700
FI           6698
HR           6667
LV           6627
SK           6612
TR           6612
CY           6612
BG           6612
MT           6612
MK           6612
LT           6612
AT           6612
EE           6610
PL           6536
RO           6536
RS           6536
ME           6536
Name: count, dtype: int64

In [35]:
# Number of remaining rows for each year in the date column.
df.loc[:, 'date'].value_counts()

date
2013    7532
2015    7509
2017    7505
2010    7501
2018    7499
2016    7493
2012    7493
2014    7491
2011    7486
1987    7472
1989    7472
1988    7472
1990    7472
1991    7472
2002    7456
2019    7452
2000    7426
1986    7414
2009    7411
1998    7398
1999    7395
2008    7391
2001    7375
1985    7356
1983    7356
2003    7318
2007    7302
1984    7298
2004    7292
2006    7279
2005    7274
2020    7251
1997    7191
1996    7173
1995    7172
1992    7017
1993    6992
1994    6950
Name: count, dtype: int64