# Nettoyage

## Importation des bibliothèques

In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

In [27]:
dataset = pd.read_csv('dataset.csv', nrows=1000)

In [28]:
dataset.head()

Unnamed: 0,id_stat,nom_region,nom_espece,date_plantation,superficie,pluviometrie,temperature_min,temperature_max,taux_survie_plants,rendement_moyen,annee
0,1,Abidjan,Cacao,2023-04-26,311.35,221.88,23.68,27.66,81.44,9.4,2019
1,1,Abidjan,Cacao,2023-08-20,141.35,221.88,23.68,27.66,81.44,9.4,2019
2,1,Abidjan,Cacao,2024-03-27,301.74,221.88,23.68,27.66,81.44,9.4,2019
3,2,Abidjan,Café,2019-06-26,114.35,92.11,23.94,34.81,75.21,6.29,2019
4,2,Abidjan,Café,2019-12-01,253.65,92.11,23.94,34.81,75.21,6.29,2019


Ajout de nouvelle colonne

In [29]:
# Convertir la colonne 'date' en type datetime
dataset['date_plantation'] = pd.to_datetime(dataset['date_plantation'], errors='coerce')  # Gestion des erreurs pour les dates non valides

# Ajouter la colonne température moyenne
dataset['temperature_moyenne'] = (dataset['temperature_min'] + dataset['temperature_max']) / 2

# Ajouter la colonne mois de plantation
dataset['mois_plantation'] = dataset['date_plantation'].dt.month

# Afficher les premières lignes du dataset enrichi
dataset.head()

Unnamed: 0,id_stat,nom_region,nom_espece,date_plantation,superficie,pluviometrie,temperature_min,temperature_max,taux_survie_plants,rendement_moyen,annee,temperature_moyenne,mois_plantation
0,1,Abidjan,Cacao,2023-04-26,311.35,221.88,23.68,27.66,81.44,9.4,2019,25.67,4
1,1,Abidjan,Cacao,2023-08-20,141.35,221.88,23.68,27.66,81.44,9.4,2019,25.67,8
2,1,Abidjan,Cacao,2024-03-27,301.74,221.88,23.68,27.66,81.44,9.4,2019,25.67,3
3,2,Abidjan,Café,2019-06-26,114.35,92.11,23.94,34.81,75.21,6.29,2019,29.375,6
4,2,Abidjan,Café,2019-12-01,253.65,92.11,23.94,34.81,75.21,6.29,2019,29.375,12


Division du dataset et création du pipeline

In [63]:
# Diviser le dataset en X et y
X = dataset.drop(columns=['rendement_moyen','date_plantation','annee','id_stat'])
y = dataset['rendement_moyen']

X.head()



Unnamed: 0,nom_region,nom_espece,superficie,pluviometrie,temperature_min,temperature_max,taux_survie_plants,temperature_moyenne,mois_plantation
0,Abidjan,Cacao,311.35,221.88,23.68,27.66,81.44,25.67,4
1,Abidjan,Cacao,141.35,221.88,23.68,27.66,81.44,25.67,8
2,Abidjan,Cacao,301.74,221.88,23.68,27.66,81.44,25.67,3
3,Abidjan,Café,114.35,92.11,23.94,34.81,75.21,29.375,6
4,Abidjan,Café,253.65,92.11,23.94,34.81,75.21,29.375,12


In [64]:
# Afficher les colonnes numériques
numeric_columns = X.select_dtypes(include=['number']).columns
print("Colonnes numériques :")
print(numeric_columns)

# Afficher les colonnes non numériques
non_numeric_columns = X.select_dtypes(exclude=['number']).columns
print("Colonnes non numériques :")
print(non_numeric_columns)

Colonnes numériques :
Index(['superficie', 'pluviometrie', 'temperature_min', 'temperature_max',
       'taux_survie_plants', 'temperature_moyenne', 'mois_plantation'],
      dtype='object')
Colonnes non numériques :
Index(['nom_region', 'nom_espece'], dtype='object')


In [74]:
# Créer le pipeline
pipeline = Pipeline([
    ('preprocessor', make_column_transformer(
        (OneHotEncoder(), make_column_selector(dtype_exclude=np.number)),
        (StandardScaler(), make_column_selector(dtype_include=np.number))
    )),
])

# Appliquer le pipeline sur X
X_transformed = pipeline.fit_transform(X, y)

# Afficher X transformé
print(X_transformed)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9000 stored elements and shape (1000, 35)>
  Coords	Values
  (0, 0)	1.0
  (0, 16)	1.0
  (0, 28)	0.4227556860419999
  (0, 29)	-0.30352099973683155
  (0, 30)	1.1430244679854307
  (0, 31)	-0.9939732339348394
  (0, 32)	0.13506843681980737
  (0, 33)	-0.16492737787358164
  (0, 34)	-0.7265834047920436
  (1, 0)	1.0
  (1, 16)	1.0
  (1, 28)	-0.7660268540327524
  (1, 29)	-0.30352099973683155
  (1, 30)	1.1430244679854307
  (1, 31)	-0.9939732339348394
  (1, 32)	0.13506843681980737
  (1, 33)	-0.16492737787358164
  (1, 34)	0.42947610959466054
  (2, 0)	1.0
  (2, 16)	1.0
  (2, 28)	0.3555545083354212
  (2, 29)	-0.30352099973683155
  (2, 30)	1.1430244679854307
  (2, 31)	-0.9939732339348394
  (2, 32)	0.13506843681980737
  :	:
  (997, 28)	-1.594188714156593
  (997, 29)	0.25968183671591216
  (997, 30)	1.2818762201364355
  (997, 31)	-0.8838259790540057
  (997, 32)	-1.3801581124235962
  (997, 33)	0.009275369697818768
  (997, 34)	-1.3046131619853958

In [88]:
# Appliquer SelectKBest avec k=20
selector = SelectKBest(score_func=f_regression, k=20)
X_selected = selector.fit_transform(X_transformed, y)

# Afficher les lignes sélectionnées
print(X_selected)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6069 stored elements and shape (1000, 20)>
  Coords	Values
  (0, 15)	-0.30352099973683155
  (0, 16)	1.1430244679854307
  (0, 17)	-0.9939732339348394
  (0, 18)	-0.16492737787358164
  (0, 19)	-0.7265834047920436
  (1, 15)	-0.30352099973683155
  (1, 16)	1.1430244679854307
  (1, 17)	-0.9939732339348394
  (1, 18)	-0.16492737787358164
  (1, 19)	0.42947610959466054
  (2, 15)	-0.30352099973683155
  (2, 16)	1.1430244679854307
  (2, 17)	-0.9939732339348394
  (2, 18)	-0.16492737787358164
  (2, 19)	-1.0155982833887196
  (3, 15)	-1.2871930332021415
  (3, 16)	1.2250732306201155
  (3, 17)	0.5811325108610832
  (3, 18)	1.2083091747903327
  (3, 19)	-0.14855364759869155
  (4, 15)	-1.2871930332021415
  (4, 16)	1.2250732306201155
  (4, 17)	0.5811325108610832
  (4, 18)	1.2083091747903327
  (4, 19)	1.5855356239813647
  :	:
  (995, 15)	0.25968183671591216
  (995, 16)	1.2818762201364355
  (995, 17)	-0.8838259790540057
  (995, 18)	0.00927536969781876

In [89]:
# Obtenir les indices des colonnes sélectionnées
selected_indices = selector.get_support(indices=True)

# Obtenir les noms des colonnes sélectionnées
selected_columns = [transformed_columns[i] for i in selected_indices]

# Afficher les noms des colonnes sélectionnées
print("Colonnes sélectionnées :")
print(selected_columns)

Colonnes sélectionnées :
['pluviometrie', 'temperature_min', 'temperature_max', 'taux_survie_plants', 'nom_region_Abidjan', 'nom_region_Bas-Sassandra', 'nom_region_Denguélé', 'nom_region_Gôh-Djiboua', 'nom_region_Lacs', 'nom_region_Lagunes', 'nom_region_Zanzan', 'nom_espece_Café', 'nom_espece_Cocotier', 'nom_espece_Eucalyptus', 'nom_espece_Goyave', 'nom_espece_Iroko', 'nom_espece_Mangue', 'nom_espece_Palmier à huile', 'nom_espece_Samba', 'nom_espece_Teck']
