In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from scipy.stats.mstats import winsorize

In [None]:
# Load the dataset (replace the placeholder path with your own file).
csv_path = 'votre_dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Tunable settings for the outlier detector.
CONTAMINATION = 0.05  # proportion of samples the detector should flag as outliers
RANDOM_STATE = 42     # fixed seed so a fresh kernel reproduces the same detector

# Split the columns by dtype. 'number' selects every numeric dtype (int32, float32, ...)
# rather than only float64/int64, so no numeric column is silently left unprocessed.
numerical_features = data.select_dtypes(include='number').columns
categorical_features = data.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the column mean, then standardise
# (zero mean, unit variance).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category.
# NOTE: values stay as strings (no encoding step), so this branch's output cannot be
# fed directly to a numeric-only estimator.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Apply each branch to its own columns; the outputs are concatenated in the order the
# transformers are listed (numeric first, then categorical).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# IsolationForest is randomised; seeding it keeps results stable across re-runs.
outlier_detector = IsolationForest(contamination=CONTAMINATION, random_state=RANDOM_STATE)


In [None]:
# Assemble the full pipeline: preprocessing first, then the outlier detector.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('outlier_detector', outlier_detector),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Run only the preprocessing stage of the pipeline. The final step (IsolationForest)
# implements no `transform`, so `pipeline.fit_transform(data)` raises AttributeError,
# and the detector cannot be fitted on the string columns the 'cat' branch passes through.
preprocessing_step = pipeline.named_steps['preprocessor']
processed_data = preprocessing_step.fit_transform(data)

# ColumnTransformer concatenates its outputs in the order the transformers are listed,
# so the imputed + standardised numeric columns are the leading block of the result.
n_numeric = len(numerical_features)
numeric_scaled = np.asarray(processed_data[:, :n_numeric], dtype=float)

# Fit the pipeline's detector on the numeric features only.
# fit_predict returns -1 for outliers and 1 for inliers.
outlier_labels = pipeline.named_steps['outlier_detector'].fit_predict(numeric_scaled)

# Method 1: drop outlier rows.
# Keep a row only if every standardised numeric feature lies within 3 standard
# deviations of its mean (identical to checking columns 0 and 1 when there are
# exactly two numeric features).
within_three_sigma = (np.abs(numeric_scaled) < 3).all(axis=1)
data_no_outliers = data[within_three_sigma]

# Method 2: replace outliers with the median.
# Each feature is compared against its OWN raw median and standard deviation; values
# further than 3 standard deviations from the median are replaced by the median.
data_median_replaced = data.copy()
for feature in numerical_features:
    column = data[feature]
    median_val = column.median()
    is_outlier = (column - median_val).abs() > 3 * column.std()
    data_median_replaced[feature] = np.where(is_outlier, median_val, column)

# Method 3: winsorization.
# Clip the lowest and highest 5% of each feature to the nearest retained value.
# nan_policy='omit' ignores missing values when locating the cut-offs; the default
# ('propagate') may overwrite the upper tail with NaN.
data_winsorized = data.copy()
for feature in numerical_features:
    data_winsorized[feature] = winsorize(data[feature], limits=[0.05, 0.05], nan_policy='omit')

# Wrap the standardised numeric features in a DataFrame aligned with the original rows.
# Only the numeric block is used: the full preprocessor output also contains the
# categorical columns and would not match `numerical_features` in width.
processed_df = pd.DataFrame(numeric_scaled, columns=numerical_features, index=data.index)

In [None]:
# Every scatter plot uses the first two numeric features as its axes.
x_col = numerical_features[0]
y_col = numerical_features[1]

# Interactive scatter plot of the preprocessed (standardised) data.
fig = px.scatter(processed_df, x=x_col, y=y_col, title='Scatter plot of processed data')
fig.show()

# One scatter plot per outlier-treatment method, built from (frame, title) pairs.
fig_no_outliers, fig_median_replaced, fig_winsorized = (
    px.scatter(frame, x=x_col, y=y_col, title=title)
    for frame, title in (
        (data_no_outliers, 'Scatter plot after removing outliers'),
        (data_median_replaced, 'Scatter plot after median replacement'),
        (data_winsorized, 'Scatter plot after winsorization'),
    )
)

# Display the three treated-data plots in the same order they were created.
for figure in (fig_no_outliers, fig_median_replaced, fig_winsorized):
    figure.show()
