<h1 align="center"><u>Explainability AI Project</u></h1>
<h3 align="center"><u>Clément MOLLY-MITTON  | Diane VERBECQ  |  Raphaël VIGNAL</u></h3>
<h3 align="center"><u>Paul ESCALIER</u></h3>

# Table of Contents
1. [Introduction](#introduction)
2. [Imports](#imports)
3. [Data exploration and cleaning](#data-exploration-and-cleaning)
4. [Models and evaluation](#models-and-evaluation)     
5. [Conclusion](#conclusion)

# Introduction
***


# Imports
***

We import all the necessary libraries.

In [85]:
from copy import deepcopy
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import optuna

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay, f1_score

# Data Exploration and Cleaning
***

In [86]:
# Load the raw "World Energy Consumption" dataset.
# A forward-slash relative path is used so the notebook runs on Linux/macOS as
# well as on Windows (the backslash form only resolves on Windows).
original_data = pd.read_csv('dataset/World Energy Consumption.csv', delimiter=',')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only rendered a stray "None".
original_data.info()
display(original_data.head(5))
display(original_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22012 entries, 0 to 22011
Columns: 129 entries, country to wind_share_energy
dtypes: float64(126), int64(1), object(2)
memory usage: 21.7+ MB


None

Unnamed: 0,country,year,iso_code,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
0,ASEAN (Ember),2000,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
1,ASEAN (Ember),2001,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
2,ASEAN (Ember),2002,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
3,ASEAN (Ember),2003,,,,,,,,,...,0.0,,,,,,0.0,,0.0,
4,ASEAN (Ember),2004,,,,,,,,,...,0.0,,,,,,0.0,,0.0,


Unnamed: 0,year,population,gdp,biofuel_cons_change_pct,biofuel_cons_change_twh,biofuel_cons_per_capita,biofuel_consumption,biofuel_elec_per_capita,biofuel_electricity,biofuel_share_elec,...,solar_share_elec,solar_share_energy,wind_cons_change_pct,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_electricity,wind_energy_per_capita,wind_share_elec,wind_share_energy
count,22012.0,18123.0,11113.0,1747.0,2687.0,2302.0,2767.0,5275.0,5527.0,5492.0,...,6849.0,4035.0,2413.0,4639.0,4744.0,7065.0,7996.0,4065.0,6886.0,4101.0
mean,1974.213247,105309600.0,358511400000.0,46.091694,2.719074,133.558459,38.068209,64.7168,10.966685,1.954035,...,0.62275,0.210395,267.541206,7.314174,53.422938,62.811828,17.561153,232.108949,1.40491,0.651413
std,35.058653,464046000.0,2411179000000.0,274.616745,10.057018,264.707851,111.623148,201.043915,47.953561,5.274155,...,2.066186,0.643304,5940.94412,37.327922,273.255306,252.734217,101.693172,733.841324,4.489043,1.89382
min,1900.0,1833.0,164206000.0,-100.0,-50.843,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-100.0,-51.208,0.0,0.0,0.0,0.0,0.0,0.0
25%,1946.0,1712404.0,13658980000.0,-0.6165,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.029,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1984.0,6991543.0,41674110000.0,8.179,0.0,15.482,0.673,0.146,0.01,0.062,...,0.0,0.0,20.0,0.0,0.005,0.0,0.0,0.051,0.0,0.003
75%,2003.0,25576880.0,174429500000.0,26.5085,0.8615,165.26675,13.6295,34.289,0.69,1.5775,...,0.095,0.0185,47.971,0.315,2.35775,2.101,0.14,45.523,0.372,0.268
max,2022.0,7975105000.0,113630200000000.0,5659.328,141.131,2588.512,1199.207,2514.102,677.57,71.429,...,40.0,7.586,242384.844,665.413,5487.6,3219.852,2139.23,8422.012,56.84,26.157


Based on the codebook at https://github.com/owid/energy-data/blob/master/owid-energy-codebook.csv, we will keep only the columns that are relevant for us, and then we will clean the data.

In [96]:
# Columns retained from the raw dataset, organised by theme.
# The groups are flattened in declaration order, so the final column order is:
# basic info, total consumption, energy shares, consumption per energy,
# electricity production per energy, emissions.
_column_groups = {
    # Basic info per country ("gdp" = gross domestic product)
    "basic_info": ("country", "year", "iso_code", "population", "gdp"),
    # Total energy consumption
    "total_consumption": (
        "primary_energy_consumption",
        "energy_per_capita",
        "energy_per_gdp",
    ),
    # Part of each energy family in the total consumption
    "energy_shares": (
        "fossil_share_energy",
        "renewables_share_energy",
        "low_carbon_share_energy",
    ),
    # Consumption per energy
    "consumption_per_energy": (
        "coal_consumption",
        "oil_consumption",
        "gas_consumption",
        "biofuel_consumption",
        "hydro_consumption",
        "nuclear_consumption",
        "solar_consumption",
        "wind_consumption",
    ),
    # Electricity production per energy
    "electricity_per_energy": (
        "electricity_generation",
        "fossil_electricity",
        "renewables_electricity",
        "nuclear_electricity",
        "hydro_electricity",
        "solar_electricity",
        "wind_electricity",
    ),
    # Carbon and gas emission
    "emissions": ("carbon_intensity_elec", "greenhouse_gas_emissions"),
}

# Flat list of column names, in the order declared above.
column_to_keep = [name for group in _column_groups.values() for name in group]

In [98]:
# (Disabled) Alternative column selection based on the share of null values per column.
# Let's first check the percentage of null values per column
# null_per_column = (original_data.isnull().sum() / original_data.shape[0] ) * 100
# null_per_column
# We can see that a lot of columns have more than 50% null values. We will drop them.
# NOTE(review): if re-enabled, the assignment below would store a Series of percentages in
# column_to_keep rather than a list of column names — presumably `.index` is wanted; confirm.
# column_to_keep = null_per_column[null_per_column < 50]
# column_to_keep

In [100]:
# Restrict the raw dataset to the columns listed in `column_to_keep`.
# The explicit .copy() makes df_clean independent of original_data, so later
# cleaning steps that assign into df_clean neither raise pandas'
# SettingWithCopyWarning nor risk being mistaken for writes to the raw frame.
df_clean = original_data[column_to_keep].copy()
df_clean

Unnamed: 0,country,year,iso_code,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp,fossil_share_energy,renewables_share_energy,...,wind_consumption,electricity_generation,fossil_electricity,renewables_electricity,nuclear_electricity,hydro_electricity,solar_electricity,wind_electricity,carbon_intensity_elec,greenhouse_gas_emissions
0,ASEAN (Ember),2000,,,,,,,,,...,,368.65,295.75,72.90,0.0,50.37,0.00,0.0,500.231,184.41
1,ASEAN (Ember),2001,,,,,,,,,...,,397.19,320.51,76.68,0.0,54.26,0.00,0.0,499.358,198.34
2,ASEAN (Ember),2002,,,,,,,,,...,,422.82,346.83,75.99,0.0,53.32,0.00,0.0,505.652,213.80
3,ASEAN (Ember),2003,,,,,,,,,...,,447.15,371.44,75.71,0.0,53.28,0.00,0.0,513.698,229.70
4,ASEAN (Ember),2004,,,,,,,,,...,,484.94,407.92,77.02,0.0,52.88,0.00,0.0,520.910,252.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22007,Zimbabwe,2018,ZWE,15052191.0,2.271535e+10,51.809,3441.986,2.281,,,...,,9.19,3.73,5.46,0.0,5.05,0.02,0.0,355.822,3.27
22008,Zimbabwe,2019,ZWE,15354606.0,,46.120,3003.655,,,,...,,8.24,3.66,4.58,0.0,4.17,0.03,0.0,387.136,3.19
22009,Zimbabwe,2020,ZWE,15669663.0,,41.997,2680.132,,,,...,,7.59,3.40,4.19,0.0,3.81,0.03,0.0,389.987,2.96
22010,Zimbabwe,2021,ZWE,15993525.0,,42.145,2635.154,,,,...,,8.03,3.61,4.42,0.0,4.00,0.04,0.0,392.279,3.15


Now let's look at the number of null values per column, and then at the percentage of null values for each country.

In [105]:
# Number of missing values in each retained column (summed down the rows).
df_clean.isna().sum(axis=0)

country                           0
year                              0
iso_code                       5500
population                     3889
gdp                           10899
primary_energy_consumption     9424
energy_per_capita             11410
energy_per_gdp                14801
fossil_share_energy           17225
renewables_share_energy       17225
low_carbon_share_energy       17225
coal_consumption              16592
oil_consumption               16299
gas_consumption               16797
biofuel_consumption           19245
hydro_consumption             16533
nuclear_consumption           17615
solar_consumption             17329
wind_consumption              17268
electricity_generation        14821
fossil_electricity            15462
renewables_electricity        13989
nuclear_electricity           13496
hydro_electricity             13128
solar_electricity             14056
wind_electricity              14016
carbon_intensity_elec         16847
greenhouse_gas_emissions    

In [101]:
# Share of missing cells per country, over all retained columns.

# Total number of null cells per country: count nulls per column within each
# country group, then add the columns together.
nb_null = (
    df_clean.isna()
    .groupby(df_clean["country"])
    .sum()
    .sum(axis=1)
)

# Total number of cells per country: rows for that country x number of columns.
nb_element = df_clean.groupby("country").size() * len(df_clean.columns)

# Percentage of null cells per country.
null_percentage = nb_null.div(nb_element).mul(100)
null_percentage

country
ASEAN (Ember)     60.714286
Afghanistan       74.297424
Africa            56.736353
Africa (EI)       30.110837
Africa (Ember)    60.714286
                    ...    
World             54.819977
Yemen             55.813953
Yugoslavia        86.141304
Zambia            74.297424
Zimbabwe          74.390244
Length: 306, dtype: float64

In [103]:
# (Disabled) Row filter based on the share of null values per country.
# Again, we can see that some countries have a lot of null values, so we will keep only the
# countries with less than 50% null values.
# NOTE(review): the disabled filter below uses a 20% threshold, not the 50% stated above —
# confirm which value is intended before re-enabling.
# line_to_keep = null_percentage[null_percentage < 20]
# line_to_keep
# df_clean = df_clean[df_clean["country"].isin(line_to_keep.index)].reset_index().drop("index", axis=1)
# df_clean

# Models and Evaluation
***

# Conclusion
***