# Nettoyage

## Importation des bibliothèques

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

In [6]:
# Load the data; nrows=1000 limits the read to the first 1000 rows of the file
dataset = pd.read_csv('dataset.csv', nrows=1000)

In [7]:
# Preview the first rows of the loaded dataset
dataset.head()

Unnamed: 0,id_stat,nom_region,nom_espece,date_plantation,superficie,pluviometrie,temperature_min,temperature_max,taux_survie_plants,rendement_moyen,annee
0,1,Abidjan,Cacao,2023-04-26,311.35,221.88,23.68,27.66,81.44,9.4,2019
1,1,Abidjan,Cacao,2023-08-20,141.35,221.88,23.68,27.66,81.44,9.4,2019
2,1,Abidjan,Cacao,2024-03-27,301.74,221.88,23.68,27.66,81.44,9.4,2019
3,2,Abidjan,Café,2019-06-26,114.35,92.11,23.94,34.81,75.21,6.29,2019
4,2,Abidjan,Café,2019-12-01,253.65,92.11,23.94,34.81,75.21,6.29,2019


Ajout de nouvelles colonnes

In [8]:
# Derive the new columns in a single assign() call:
#   - date_plantation: parsed as datetime (values that cannot be parsed become NaT
#     because of errors='coerce')
#   - temperature_moyenne: midpoint of temperature_min and temperature_max
#   - mois_plantation: month number taken from the parsed planting date
dataset = dataset.assign(
    date_plantation=pd.to_datetime(dataset['date_plantation'], errors='coerce'),
    temperature_moyenne=lambda df: (df['temperature_min'] + df['temperature_max']) / 2,
    mois_plantation=lambda df: df['date_plantation'].dt.month,
)

# Show the first rows of the enriched dataset
dataset.head()

Unnamed: 0,id_stat,nom_region,nom_espece,date_plantation,superficie,pluviometrie,temperature_min,temperature_max,taux_survie_plants,rendement_moyen,annee,temperature_moyenne,mois_plantation
0,1,Abidjan,Cacao,2023-04-26,311.35,221.88,23.68,27.66,81.44,9.4,2019,25.67,4
1,1,Abidjan,Cacao,2023-08-20,141.35,221.88,23.68,27.66,81.44,9.4,2019,25.67,8
2,1,Abidjan,Cacao,2024-03-27,301.74,221.88,23.68,27.66,81.44,9.4,2019,25.67,3
3,2,Abidjan,Café,2019-06-26,114.35,92.11,23.94,34.81,75.21,6.29,2019,29.375,6
4,2,Abidjan,Café,2019-12-01,253.65,92.11,23.94,34.81,75.21,6.29,2019,29.375,12


In [9]:
# Drop the columns deemed unnecessary for the rest of the notebook
colonnes_a_supprimer = [
    'id_stat',
    'date_plantation',
    'temperature_min',
    'temperature_max',
    'taux_survie_plants',
]
dataset = dataset.drop(colonnes_a_supprimer, axis='columns')

# Show the first rows of the trimmed dataset
dataset.head()


Unnamed: 0,nom_region,nom_espece,superficie,pluviometrie,rendement_moyen,annee,temperature_moyenne,mois_plantation
0,Abidjan,Cacao,311.35,221.88,9.4,2019,25.67,4
1,Abidjan,Cacao,141.35,221.88,9.4,2019,25.67,8
2,Abidjan,Cacao,301.74,221.88,9.4,2019,25.67,3
3,Abidjan,Café,114.35,92.11,6.29,2019,29.375,6
4,Abidjan,Café,253.65,92.11,6.29,2019,29.375,12


In [10]:
import numpy as np

# Remove outliers from a column using the IQR (interquartile range) method
def remove_outliers_IQR(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 0.25 and 0.75 quantiles of
    ``data[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, keeping their original index
        labels. Rows whose ``column`` value is NaN are not retained because they
        fail the range comparison.
    """
    q1, q3 = data[column].quantile([0.25, 0.75])
    spread = q3 - q1  # interquartile range
    in_range = data[column].between(q1 - factor * spread, q3 + factor * spread)
    # Explicit copy: callers can add or modify columns on the result without
    # triggering pandas' SettingWithCopyWarning.
    return data.loc[in_range].copy()

# Drop the rainfall outliers, then add the log-transformed rainfall column
# (log1p(x) = log(1 + x)). assign() builds a new DataFrame instead of writing
# into the filtered result, which avoids pandas' SettingWithCopyWarning.
dataset_cleaned = remove_outliers_IQR(dataset, 'pluviometrie').assign(
    log_pluviometrie=lambda df: np.log1p(df['pluviometrie'])
)

# Show the first rows of the updated dataset
dataset_cleaned.head()


Unnamed: 0,nom_region,nom_espece,superficie,pluviometrie,rendement_moyen,annee,temperature_moyenne,mois_plantation,log_pluviometrie
0,Abidjan,Cacao,311.35,221.88,9.4,2019,25.67,4,5.406634
1,Abidjan,Cacao,141.35,221.88,9.4,2019,25.67,8,5.406634
2,Abidjan,Cacao,301.74,221.88,9.4,2019,25.67,3,5.406634
3,Abidjan,Café,114.35,92.11,6.29,2019,29.375,6,4.533782
4,Abidjan,Café,253.65,92.11,6.29,2019,29.375,12,4.533782


Analyses et Visualisation
