In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from plotly import graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px

In [2]:
# Load the OWID CO2 workbook into a DataFrame and preview its first rows.
df = pd.read_excel(io='owid_co2_data.xlsx')
df.head(5)

Unnamed: 0,Name,Value.iso_code,Value.data.year,Value.data.population,Value.data.cumulative_luc_co2,Value.data.land_use_change_co2,Value.data.land_use_change_co2_per_capita,Value.data.share_global_cumulative_luc_co2,Value.data.share_global_luc_co2,Value.data.share_of_temperature_change_from_ghg,Value.data.temperature_change_from_ch4,Value.data.temperature_change_from_co2,Value.data.temperature_change_from_ghg,Value.data.temperature_change_from_n2o
0,Afghanistan,AFG,1850,3752993.0,2.979601,2.979601,0.793927,0.112725,0.112725,,,,,
1,Afghanistan,AFG,1851,3767956.0,5.981443,3.001842,0.796677,0.111259,0.109841,0.164799,1e-06,2e-06,3e-06,1.028669e-07
2,Afghanistan,AFG,1852,3783940.0,9.002998,3.021554,0.798521,0.11038,0.10868,0.164375,2e-06,3e-06,6e-06,2.077781e-07
3,Afghanistan,AFG,1853,3800954.0,12.041333,3.038336,0.799361,0.109507,0.107001,0.163732,4e-06,5e-06,9e-06,3.146464e-07
4,Afghanistan,AFG,1854,3818038.0,15.094068,3.052735,0.799556,0.108817,0.106176,0.162811,5e-06,6e-06,1.2e-05,4.233508e-07


In [32]:
# Report the dataset dimensions (rows, columns).
n_rows, n_cols = df.shape
print('Le jeu de données possède', n_rows, 'lignes et', n_cols, 'colonnes')

The data has 48058 rows and 14 columns


In [3]:
# Column-by-column summary: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48058 entries, 0 to 48057
Data columns (total 14 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Name                                             48058 non-null  object 
 1   Value.iso_code                                   39717 non-null  object 
 2   Value.data.year                                  48058 non-null  int64  
 3   Value.data.population                            39495 non-null  float64
 4   Value.data.cumulative_luc_co2                    37022 non-null  float64
 5   Value.data.land_use_change_co2                   37022 non-null  float64
 6   Value.data.land_use_change_co2_per_capita        36313 non-null  float64
 7   Value.data.share_global_cumulative_luc_co2       37022 non-null  float64
 8   Value.data.share_global_luc_co2                  37022 non-null  float64
 9   Value.data.share_of_temperat

In [4]:
# Names of the entities that have no ISO code, selected in a single .loc call.
missing_iso = df['Value.iso_code'].isna()
df.loc[missing_iso, 'Name'].unique()

array(['Africa', 'Africa (GCP)', 'Asia', 'Asia (GCP)',
       'Asia (excl. China and India)', 'Central America (GCP)', 'Europe',
       'Europe (GCP)', 'Europe (excl. EU-27)', 'Europe (excl. EU-28)',
       'European Union (27)', 'European Union (28)',
       'French Equatorial Africa (Jones et al. 2023)',
       'French West Africa (Jones et al. 2023)', 'High-income countries',
       'International aviation', 'International shipping',
       'International transport', 'Kosovo', 'Kuwaiti Oil Fires (GCP)',
       'Kuwaiti Oil Fires (Jones et al. 2023)',
       'Least developed countries (Jones et al. 2023)',
       'Leeward Islands (GCP)', 'Leeward Islands (Jones et al. 2023)',
       'Low-income countries', 'Lower-middle-income countries',
       'Middle East (GCP)', 'Non-OECD (GCP)', 'North America',
       'North America (GCP)', 'North America (excl. USA)', 'OECD (GCP)',
       'OECD (Jones et al. 2023)', 'Oceania', 'Oceania (GCP)',
       'Panama Canal Zone (GCP)', 'Panama Canal Zo

In [10]:
# Count the rows dated before 1880, to stay consistent with the periods studied elsewhere.
before_1880 = df['Value.data.year'] < 1880
df.loc[before_1880].shape[0]

11090

In [13]:
# Share (in %) of the dataset dated before 1880: dropping those rows removes
# roughly 23% of the data (i.e. about 77% is kept).
before_1880 = df['Value.data.year'] < 1880
df.loc[before_1880].shape[0] / df.shape[0] * 100

23.07628282491989

In [15]:
# Next question: is the country level useful at all, given that our other files
# are expressed in terms of world regions?
#
# List the distinct values of the Name column to see what it contains and to
# decide what we want to keep.
df.loc[:, 'Name'].unique()

array(['Afghanistan', 'Africa', 'Africa (GCP)', 'Albania', 'Algeria',
       'Andorra', 'Angola', 'Anguilla', 'Antarctica',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia',
       'Asia (GCP)', 'Asia (excl. China and India)', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda',
       'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Republic', 'Central America (GCP)', 'Chad',
       'Chile', 'China', 'Christmas Island', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Domini

In [16]:
# Preliminary list of Name values to keep when focusing on world regions
# rather than on individual countries.
# NOTE(review): 'Australia' is kept here for backward compatibility, but the
# dataset also lists an 'Oceania' aggregate — confirm which one is intended.
not_country_list = ['Africa', 'Africa (GCP)', 'Antarctica', 'Asia',
               'Asia (GCP)', 'Asia (excl. China and India)', 'Australia',
               'Central America (GCP)', 'Europe', 'Europe (GCP)',
               'Europe (excl. EU-27)', 'Europe (excl. EU-28)',
               'European Union (27)', 'European Union (28)', 'North America',
               'North America (GCP)', 'North America (excl. USA)', 'Oceania', 
               'Oceania (GCP)', 'South America', 'South America (GCP)', 'World']

# The seven continents, spelled exactly as in the Name column
# ('Antarctica', not 'Antartica', otherwise isin() can never match it).
continents_list = ['Europe', 'Africa', 'North America', 'South America', 'Antarctica', 'Australia','Asia']

In [24]:
# Restricting the data to that list alone would shrink the dataset drastically,
# to only about 10% of its initial size.
region_rows = df.loc[df['Name'].isin(not_country_list)]

display(len(region_rows))

display((len(region_rows)/len(df))*100)

5062

10.533105830454867

In [31]:
# Combining the world-region filter with "year >= 1880" leaves 3145 rows,
# i.e. about 6% of the initial dataset.
is_region = df['Name'].isin(not_country_list)
since_1880 = df['Value.data.year'] >= 1880
region_rows_since_1880 = df.loc[is_region & since_1880]

display(len(region_rows_since_1880))

display((len(region_rows_since_1880)/len(df))*100)

3145

6.544175787589995

In [69]:
# Another approach:
#
# Pick the essential variables among the 14 available and build a new DataFrame.
important_variables = ['Value.data.year', 
                       'Value.iso_code',
                       'Name', 
                       'Value.data.population',  
                       'Value.data.cumulative_luc_co2']

# Keep only rows from 1880 onwards, to line up with our other datasets.
# Row filter and column selection are done in one .loc call and the derived
# column is added with .assign, so df2 is an independent frame: no
# SettingWithCopyWarning and no risk of writing back into df.
#
# co2_per_capita = cumulative_luc_co2 / population. The source file only ships
# a per-capita figure for the non-cumulative land_use_change_co2 column, so this
# cumulative ratio has to be derived here.
df2 = (
    df.loc[df['Value.data.year'] >= 1880, important_variables]
      .assign(co2_per_capita=lambda d: d['Value.data.cumulative_luc_co2'] / d['Value.data.population'])
)

In [70]:
# Give the columns shorter, more convenient names.
short_names = {
    'Value.data.year': 'Year',
    'Value.iso_code': "Iso_code",
    'Value.data.population': 'Population',
    'Value.data.cumulative_luc_co2': 'co2',
}
df2 = df2.rename(columns=short_names)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36968 entries, 30 to 48057
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            36968 non-null  int64  
 1   Iso_code        31314 non-null  object 
 2   Name            36968 non-null  object 
 3   Population      31500 non-null  float64
 4   co2             30602 non-null  float64
 5   co2_per_capita  29734 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 2.0+ MB


In [71]:
# Number of distinct values per column of the filtered frame; per the original
# author's reading of the chart: 143 years, as expected, and 264 countries/regions.
unique_counts = df2.nunique().sort_values(ascending=False)
fig = px.bar(
    unique_counts,
    text_auto=True,
    color_discrete_sequence=['lightblue'],
    title='Valeurs uniques de notre Dataframe filtré',
)
fig.update_layout(showlegend=False)

In [72]:
# Missing values per column.
df2.isna().sum()

Year                 0
Iso_code          5654
Name                 0
Population        5468
co2               6366
co2_per_capita    7234
dtype: int64

In [73]:
# Back to the earlier idea of splitting by continent rather than by country.
# 'Antarctica' must be spelled as in the Name column, otherwise isin() silently
# never matches it.
# NOTE(review): 'Australia' is used as a continent here although the dataset
# also has an 'Oceania' aggregate — confirm which one is intended.
continent = ['Europe', 'Africa', 'North America', 'South America', 'Antarctica', 'Australia','Asia']

# The mask is built from df2 itself (not from the unfiltered df) so it lines up
# with df2's rows; .copy() makes `continents` independent of df2.
continents = df2.loc[df2['Name'].isin(continent)].copy()
continents

Unnamed: 0,Year,Iso_code,Name,Population,co2,co2_per_capita
303,1880,,Africa,128189317.0,10602.686523,0.000083
304,1881,,Africa,128683453.0,10837.153320,0.000084
305,1882,,Africa,129364909.0,11072.800781,0.000086
306,1883,,Africa,130067281.0,11305.886719,0.000087
307,1884,,Africa,130775459.0,11541.186523,0.000088
...,...,...,...,...,...,...
40616,2018,,South America,424740741.0,186532.125000,0.000439
40617,2019,,South America,428318218.0,187949.171875,0.000439
40618,2020,,South America,431530105.0,189122.953125,0.000438
40619,2021,,South America,434254167.0,190411.171875,0.000438


In [74]:
# As expected (and as already seen above), the continent aggregates have no iso_code.
# NOTE(review): 'Australia' is an exception — it still shows up in the per-country
# ranking computed after the Iso_code filter, so it does carry an ISO code.
continents['Name'].unique()

array(['Africa', 'Asia', 'Australia', 'Europe', 'North America',
       'South America'], dtype=object)

In [75]:
print("Description statistique des colonnes numériques disposant d'un Iso_Code")
# Keep only rows that carry an ISO code, which takes the ISO-less aggregates
# (continents and other groupings) out of the picture.
has_iso_code = df2['Iso_code'].notna()
df2 = df2.loc[has_iso_code]
df2.describe().transpose()

Description statistique des colonnes numériques disposant d'un Iso_Code


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,31314.0,1950.993,41.27632,1880.0,1915.0,1951.0,1987.0,2022.0
Population,29711.0,16786450.0,77579270.0,238.0,332746.0,2488557.0,8583182.0,1425894000.0
co2,28171.0,2935.096,11459.91,-4409.774414,18.102487,340.2954,1570.265,128049.0
co2_per_capita,28018.0,0.0002310598,0.0003382155,-0.000252,2.8e-05,9.674547e-05,0.0002959127,0.003055225


In [76]:
# Top 20 names by mean co2, highest first.
mean_co2_by_name = df2.groupby('Name')['co2'].mean()
results = mean_co2_by_name.sort_values(ascending=False).head(20)
display(results)

Name
United States                   102448.845471
Russia                           78377.782998
Indonesia                        44269.640147
Brazil                           38892.153095
China                            24474.190788
India                            23558.318300
Canada                           22690.708535
Colombia                         12746.323793
Ukraine                          10542.877914
Thailand                          9137.941110
Myanmar                           8883.987477
Philippines                       8087.111524
Mexico                            7949.190245
Democratic Republic of Congo      7500.273877
Cote d'Ivoire                     6838.829102
Argentina                         6688.015899
Australia                         5893.991082
Malaysia                          5652.931405
Nigeria                           5155.480271
Angola                            4840.574443
Name: co2, dtype: float64

In [77]:
# Summary statistics of the continent subset, one row per numeric column.
continents.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,858.0,1951.0,41.30361,1880.0,1915.0,1951.0,1987.0,2022.0
Population,858.0,580566500.0,864226500.0,2201376.0,88829960.0,309010300.0,693327600.0,4721383000.0
co2,858.0,95903.99,75651.0,630.2333,22127.15,92020.77,154625.8,321034.9
co2_per_capita,858.0,0.0003329369,0.0002109798,3.119985e-05,0.000149929,0.0002889302,0.0004801799,0.0007994172


In [78]:
# Static world map of co2.
# df2 holds one row per country *and* per year; a map without animation can only
# show one value per country, so keep the most recent row that has a co2 value.
# This makes the displayed colour unambiguous and the figure much lighter.
latest_co2 = (
    df2.dropna(subset=['co2'])
       .sort_values('Year')
       .drop_duplicates(subset='Iso_code', keep='last')
)

co2_map = px.choropleth(latest_co2, locations='Iso_code',
                        color='co2', 
                        hover_name='Name',
                        hover_data=['Year'],  # shows which year the value comes from
                        title= 'Emission de Co2 dans le monde',
                        color_continuous_scale='RdYlGn_r',
                        projection='natural earth')
co2_map

In [79]:
# co2 over time, one line per continent, on a logarithmic y axis.
line = px.line(
    continents,
    x='Year',
    y='co2',
    color='Name',
    log_y=True,
    title='Evolution des émissions de co2 par continent au fil des ans',
)
line.update_xaxes(showgrid=False, showline=True)
line.update_yaxes(showgrid=False, showline=True)
line.update_layout(
    margin={'t': 100, 'b': 0, 'l': 70, 'r': 40},
    hovermode='closest',
    xaxis_title='',
    yaxis_title='Emission de co2',
    # Horizontal legend, right-aligned just above the plot area.
    legend={'orientation': 'h', 'yanchor': 'auto', 'y': 1.02,
            'xanchor': 'right', 'x': 1, 'title': 'Continents'},
)

In [80]:
# Animated world map: one frame per year.
# - hover_name='Name' replaces labels='Name': `labels` expects a dict mapping
#   column names to display labels, so a bare string had no useful effect.
# - rows are sorted by Year so the frames are guaranteed to play chronologically.
# - range_color is pinned to the overall min/max so colours stay comparable
#   from one frame to the next.
co2_range = [df2['co2'].min(), df2['co2'].max()]

co2_map_anim = px.choropleth(df2.sort_values('Year'), locations='Iso_code',
                        color='co2',
                        hover_name='Name',
                        animation_frame='Year',
                        animation_group='Name',
                        title= 'Evolution des émissions par pays et année',
                        projection="natural earth", 
                        color_continuous_scale='RdYlGn_r',
                        range_color=co2_range,
                        )


co2_map_anim

In [95]:
# Mean co2 per continent, drawn as horizontal bars (co2 on x, continent on y).
cont = continents.groupby('Name')['co2'].mean().reset_index()

bar = px.bar(cont, x='co2', y='Name', orientation='h',
             log_x=True, text_auto=True,
             title = 'Les continents les plus gros émetteurs de co2', 
             color_discrete_sequence = ['green']
            )
bar.update_yaxes(showgrid=False, categoryorder='total ascending')
bar.update_xaxes(showgrid=False)
# The co2 values are on the x axis, so the emissions label belongs there;
# the y axis only carries the continent names and needs no title.
bar.update_layout(margin=dict(t=100, b=0, l=70, r=40),
                  xaxis_title='Emissions de co2', yaxis_title='')


In [94]:
# Per-country totals, largest co2 first; the chart shows the top 10.
# numeric_only=True keeps the sum on numeric columns only: without it the
# Iso_code strings are concatenated year after year (pandas >= 2.0), which is
# wasteful and meaningless.
# NOTE(review): 'co2' comes from a cumulative column, so summing it across years
# counts earlier years repeatedly — consider the last/max value per country instead.
country = (
    df2.groupby('Name')
       .sum(numeric_only=True)
       .reset_index()
       .sort_values('co2', ascending=False)
)
bar2 =px.bar(country.head(10), 'co2', 'Name', log_x=True, 
             text_auto=True,
             orientation='h', title='Les pays les plus gros émetteurs de co2',
             color_discrete_sequence = ['green'])
bar2.update_yaxes(showgrid=False, categoryorder='total ascending', showline=True)
bar2.update_xaxes(showgrid=False, showline=True, showticklabels=False, ticks='')
bar2.update_layout(margin=dict(t=100, b=0, l=70, r=40),
                  xaxis_title='', yaxis_title='')

In [90]:
# Scatter of Population (y) against co2 (x), both axes logarithmic.
relation = px.scatter(
    df2,
    x='co2',
    y='Population',
    log_x=True,
    log_y=True,
    color_discrete_sequence=['blue'],
    title='La relation Population/Emission de co2',
)
relation.update_traces(textposition='top right')
relation.update_xaxes(showgrid=False, showline=True)
relation.update_yaxes(showgrid=False, showline=True)
relation.update_layout(
    hovermode='closest',
    margin={'t': 100, 'b': 0, 'l': 70, 'r': 40},
)