In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path().resolve().parent))

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from utils.preprocessing import preprocessing
from utils.display import couleurs_vives

# 1/ Dataset upload/overview

In [3]:
df = pd.read_csv("../data/kaya_dataset.csv")
print(df.info())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7745 entries, 0 to 7744
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country              7745 non-null   object 
 1   year                 7745 non-null   int64  
 2   iso_code             7745 non-null   object 
 3   co2                  7745 non-null   float64
 4   energy               7745 non-null   float64
 5   gdp                  7745 non-null   float64
 6   population           7745 non-null   float64
 7   co2_per_unit_energy  7745 non-null   float64
 8   energy_per_gdp       7745 non-null   float64
 9   gdp_per_capita       7745 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 605.2+ KB
None


Unnamed: 0,year,co2,energy,gdp,population,co2_per_unit_energy,energy_per_gdp,gdp_per_capita
count,7745.0,7745.0,7745.0,7745.0,7745.0,7745.0,7745.0,7745.0
mean,1997.914009,169.43036,740778200000.0,425915100000.0,40026760.0,0.239455,1.674191,13025.601709
std,15.104324,683.710379,2786040000000.0,1488989000000.0,135093500.0,0.200058,1.631089,14718.564983
min,1965.0,0.022,97659230.0,164206000.0,64082.0,0.018,0.078,361.188725
25%,1986.0,3.691,18504530000.0,18033690000.0,3593782.0,0.186,0.767,2808.878751
50%,1999.0,20.87,88900390000.0,64346110000.0,9754398.0,0.221,1.245,7828.856656
75%,2011.0,84.838,412821000000.0,254289100000.0,28101180.0,0.261,2.053,18309.742796
max,2022.0,11711.808,44518690000000.0,26966020000000.0,1426437000.0,10.689,25.253,163531.400281


# 2/ Preprocessing & Training

In [4]:
# countries selection :
countries = df.country.unique()
# year selection :
# years = df.year.unique()
years = [2022]  # only 2022
# common columns :
# cols = df.columns.tolist() # select all
cols = ['country', 'year', 'iso_code']
# feature selection
features_1 = ['co2', 'energy', 'gdp', 'population']
features_2 = ['co2_per_unit_energy', 'energy_per_gdp', 'gdp_per_capita'] 
features_3 = ['co2_per_unit_energy', 'energy_per_gdp', 'gdp_per_capita', 'population']
features = features_2

# preprocessing
df_2022, X =  preprocessing(df, years, countries, cols, features)

# training
n_clusters = 5
gamma = 0.1
assign_labels = "cluster_qr"
rs = 42 # seed for random functions (center first position)
sc = SpectralClustering(
    n_clusters=n_clusters, affinity='rbf', assign_labels=assign_labels, random_state=rs
    )
cluster_labels = sc.fit_predict(X)


# 3/ Dataset pour animation

In [5]:
df_2022["cluster"] = cluster_labels.astype(str)
df_anim = pd.merge(
    df,
    df_2022[["iso_code", "cluster"]],
    on="iso_code",
    how='left',
)
df_anim.head()

Unnamed: 0,country,year,iso_code,co2,energy,gdp,population,co2_per_unit_energy,energy_per_gdp,gdp_per_capita,cluster
0,Afghanistan,1980,AFG,1.756,6334207000.0,15329840000.0,13169313.0,0.277,0.413,1164.057381,3
1,Afghanistan,1981,AFG,1.978,7290143000.0,15645340000.0,11937586.0,0.271,0.466,1310.595291,3
2,Afghanistan,1982,AFG,2.095,7891781000.0,15980410000.0,10991380.0,0.265,0.494,1453.903877,3
3,Afghanistan,1983,AFG,2.52,9883905000.0,16755330000.0,10917985.0,0.255,0.59,1534.654334,3
4,Afghanistan,1984,AFG,2.822,9932936000.0,17072150000.0,11190222.0,0.284,0.582,1525.631306,3


In [6]:
# v√©rification affectation clusters
for k in range(n_clusters):
    print(set(df_2022[df_2022["cluster"]==str(k)]["country"]))

{'Congo', 'Mongolia', 'Lesotho'}
{'North Korea', 'Trinidad and Tobago', 'Bahrain', 'Iceland', 'Turkmenistan', 'Venezuela'}
{'Armenia', 'Slovakia', 'Italy', 'United Kingdom', 'Mauritius', 'Luxembourg', 'Switzerland', 'New Zealand', 'Portugal', 'Singapore', 'Norway', 'Slovenia', 'Finland', 'Chile', 'Romania', 'Belgium', 'United States', 'Qatar', 'Malta', 'Germany', 'Spain', 'Hungary', 'Croatia', 'Australia', 'Israel', 'Montenegro', 'Ireland', 'Uruguay', 'Georgia', 'Lithuania', 'Taiwan', 'Netherlands', 'Latvia', 'Sweden', 'Denmark', 'Paraguay', 'Brazil', 'Cyprus', 'United Arab Emirates', 'Greece', 'Austria', 'Colombia', 'Costa Rica', 'France', 'Hong Kong', 'Japan', 'Albania', 'Canada', 'South Korea', 'Panama', 'Seychelles', 'Turkey', 'Argentina'}
{'Cameroon', 'Kenya', 'Chad', 'Guinea-Bissau', 'Pakistan', 'Liberia', 'Yemen', 'Democratic Republic of Congo', 'Malawi', 'Sri Lanka', 'Bangladesh', 'Angola', 'Nepal', 'Uganda', 'Niger', 'Madagascar', 'Philippines', 'Guinea', 'Myanmar', 'Senegal',

In [7]:
print(features)
for f in features:
    print(f"{f}: [{min(set(df[f])):.2f}, {max(set(df[f])):2f}]")

['co2_per_unit_energy', 'energy_per_gdp', 'gdp_per_capita']
co2_per_unit_energy: [0.02, 10.689000]
energy_per_gdp: [0.08, 25.253000]
gdp_per_capita: [361.19, 163531.400281]


In [8]:
color_map = dict(zip([str(k) for k in range(n_clusters)], couleurs_vives))
color_map

{'0': '#E53935',
 '1': '#9C27B0',
 '2': '#A7C7E7',
 '3': '#B2DFDB',
 '4': '#D87C5B'}

In [10]:
x = features[1]
y = features[2]
min_x = min(set(df[x]))
max_x = max(set(df[x]))
min_x -= (max_x + min_x)/10
max_x += (max_x + min_x)/10
min_y = min(set(df[y]))
max_y = max(set(df[y]))
min_y -= (max_y + min_y)/10
max_y += (max_y + min_y)/10

min_x = 0
max_x = 5
min_y = -5000
max_y = 50000


px.scatter(data_frame=df_anim[df_anim["year"]>=1980],
           x=x,
           y=y,
           size='population',
           color='cluster',
           color_discrete_map=color_map,
           title='Carbon emissions, Energy and Wealth 1980 - 2022',
        #    log_x=True,
           range_x=[min_x, max_x],
           range_y=[min_y, max_y],
           hover_name='country',
           animation_frame='year',
           height=600,
           size_max=100)