In [77]:
import pandas as pd
import pycountry

In [78]:
# Load the raw ESTAT SDG_07_11 export (path is relative to this notebook)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../estat_sdg_07_11_en.csv")
df.head()

Unnamed: 0,DATAFLOW,LAST UPDATE,freq,unit,geo,TIME_PERIOD,OBS_VALUE,Unnamed: 7
0,ESTAT:SDG_07_11(1.0),18/04/24 23:00:00,A,I05,AL,2000,80.6,
1,ESTAT:SDG_07_11(1.0),18/04/24 23:00:00,A,I05,AL,2001,82.6,
2,ESTAT:SDG_07_11(1.0),18/04/24 23:00:00,A,I05,AL,2002,94.6,
3,ESTAT:SDG_07_11(1.0),18/04/24 23:00:00,A,I05,AL,2003,92.6,
4,ESTAT:SDG_07_11(1.0),18/04/24 23:00:00,A,I05,AL,2004,103.8,


In [79]:
# Remove the columns that are not used further on, then discard the rows
# whose geo value is EU27_2020.
df = df.drop(labels=["DATAFLOW", "LAST UPDATE", "freq", "Unnamed: 7"], axis="columns")
df = df.loc[df["geo"].ne("EU27_2020")]
df.head()

Unnamed: 0,unit,geo,TIME_PERIOD,OBS_VALUE
0,I05,AL,2000,80.6
1,I05,AL,2001,82.6
2,I05,AL,2002,94.6
3,I05,AL,2003,92.6
4,I05,AL,2004,103.8


In [80]:
# Reshape from long to wide: one row per (geo, TIME_PERIOD) and one column per
# unit holding its OBS_VALUE. The I05 column is dropped afterwards.
df = (
    df.pivot_table(values="OBS_VALUE", index=["geo", "TIME_PERIOD"], columns="unit")
    .reset_index()
    .drop(columns="I05")
)
df.head(5)

unit,geo,TIME_PERIOD,MTOE,TOE_HAB
0,AL,2000,1.5,0.5
1,AL,2001,1.6,0.51
2,AL,2002,1.8,0.59
3,AL,2003,1.8,0.58
4,AL,2004,2.0,0.65


In [81]:
# Change from two-letter geo codes to country names.
# Eurostat's geo codes follow ISO 3166-1 alpha-2 except for the codes below,
# which are translated to their ISO equivalents before the pycountry lookup.
# Without this, those countries resolve to None and are lost by a later dropna().
_EUROSTAT_TO_ISO_ALPHA2 = {
    "EL": "GR",  # Greece
    "UK": "GB",  # United Kingdom
}


def get_country_name(country):
    """Return the country name for a two-letter geo code.

    Args:
        country: Two-letter code as found in the ``geo`` column. Eurostat's
            ``EL`` and ``UK`` are accepted in addition to ISO 3166-1 alpha-2.

    Returns:
        The ``name`` of the matching pycountry record, or ``None`` when
        ``country`` is not a string or pycountry has no record for it.
    """
    # Non-string values (e.g. NaN) cannot be country codes.
    if not isinstance(country, str):
        return None

    # Fall back to the code itself when it needs no translation.
    iso_code = _EUROSTAT_TO_ISO_ALPHA2.get(country.upper(), country)
    record = pycountry.countries.get(alpha_2=iso_code)
    if record is None:
        return None
    return record.name


# Replace every geo code with the value returned by get_country_name.
df["geo"] = df["geo"].apply(get_country_name)
df.head()

unit,geo,TIME_PERIOD,MTOE,TOE_HAB
0,Albania,2000,1.5,0.5
1,Albania,2001,1.6,0.51
2,Albania,2002,1.8,0.59
3,Albania,2003,1.8,0.58
4,Albania,2004,2.0,0.65


In [82]:
# Standardize column names and clear the "unit" label left on the column axis
# by the pivot.
df = (
    df.rename(columns={"geo": "country", "TIME_PERIOD": "year"})
    .rename_axis(columns=None)
)
df.head(5)

Unnamed: 0,country,year,MTOE,TOE_HAB
0,Albania,2000,1.5,0.5
1,Albania,2001,1.6,0.51
2,Albania,2002,1.8,0.59
3,Albania,2003,1.8,0.58
4,Albania,2004,2.0,0.65


In [83]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")
df.head(5)

Unnamed: 0,country,year,MTOE,TOE_HAB
0,Albania,2000,1.5,0.5
1,Albania,2001,1.6,0.51
2,Albania,2002,1.8,0.59
3,Albania,2003,1.8,0.58
4,Albania,2004,2.0,0.65


In [None]:
# Write the cleaned dataframe to disk, leaving out the row index.
df.to_csv(path_or_buf="toe_hab.csv", index=False)