In [None]:
#
# ! Select evidently_no_docker virtual env 
# conda install numpy pandas evidently -c conda-forge -y   (just in case)
# 


In [None]:
# -----------------------------------------------------------------------------
# prelude

import random
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
pd.options.display.max_columns = None
# pd.reset_option('display.max_columns')

# import evidently
# print(evidently.__version__)

from evidently.report import Report
from evidently.test_suite import TestSuite
from evidently.metric_preset import DataDriftPreset
from evidently.test_preset import DataStabilityTestPreset
from evidently.pipeline.column_mapping import ColumnMapping


# !!!!!!!!!!!!!!!!!
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [None]:
# -----------------------------------------------------------------------------
# Notebook configuration: sample size and input file locations.

# Number of rows drawn from each CSV.  ! Must be an even number >= 30
k_Number_of_samples = 1024

# Data folder, expressed relative to the current working directory.
k_Data_Dir = Path.cwd() / ".." / ".." / "04_data"

# CSV file names, looked up inside k_Data_Dir.
k_Fraud_Test_csv = "fraud_test.csv"
k_Production_csv = "production.csv"




In [None]:
# -----------------------------------------------------------------------------
# df = pd.read_csv(k_Data_Dir/k_Fraud_Test_csv)
# df.rename(columns={df.columns[0]: 'id'}, inplace=True)
# df.columns = df.columns.str.lower()
# df.head(3)

In [None]:
# -----------------------------------------------------------------------------
# Build reference_df and production_df: k_Number_of_samples random rows taken
# from each CSV, with the date columns parsed and the target cast to bool.

# Fixed seed so that "Restart & Run All" draws the same rows every time.
# Set it to None to get a different sample on each run.
k_Random_Seed = 42


def _load_random_sample(csv_path: Path, n_samples: int, rng: random.Random) -> tuple:
    """Read a CSV file and return ``n_samples`` randomly chosen rows.

    Args:
        csv_path: Path of the CSV file to read.
        n_samples: Number of distinct rows to draw.
        rng: Random generator used to pick the row positions.

    Returns:
        A tuple ``(sample_df, positions)``: ``sample_df`` is an independent copy
        of the drawn rows (in draw order) and ``positions`` is the list of row
        positions that were drawn.

    Raises:
        ValueError: If the file holds fewer than ``n_samples`` rows.
    """
    raw_df = pd.read_csv(csv_path)
    if n_samples > len(raw_df):
        raise ValueError(
            f"{csv_path} holds {len(raw_df)} rows, cannot sample {n_samples} of them"
        )
    positions = rng.sample(range(len(raw_df)), n_samples)
    # The drawn values are row positions, hence .iloc; .copy() makes the sample
    # independent of raw_df so later column changes never touch the raw frame.
    return raw_df.iloc[positions].copy(), positions


def _parse_transaction_dtypes(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with the date columns parsed and ``is_fraud`` as bool."""
    return frame.assign(
        trans_date_trans_time=pd.to_datetime(frame["trans_date_trans_time"], format='%Y-%m-%d %H:%M:%S'),
        dob=pd.to_datetime(frame["dob"], format='%Y-%m-%d'),
        is_fraud=frame["is_fraud"].astype(bool),
    )


_sampling_rng = random.Random(k_Random_Seed)

# Reference sample: the first CSV column is renamed to 'id'.
reference_df, reference_indices = _load_random_sample(
    k_Data_Dir / k_Fraud_Test_csv, int(k_Number_of_samples), _sampling_rng
)
reference_df = _parse_transaction_dtypes(
    reference_df.rename(columns={reference_df.columns[0]: 'id'})
)

# Production sample: a fresh 'id' column (1..n) is inserted at position 0.
production_df, production_indices = _load_random_sample(
    Path(k_Data_Dir) / k_Production_csv, int(k_Number_of_samples), _sampling_rng
)
production_df.insert(0, 'id', range(1, len(production_df) + 1))
production_df = _parse_transaction_dtypes(production_df)




In [None]:
# Keep only the numeric columns of both DataFrames: the model uses numeric data
# only and, in demo mode, bias is introduced on some numeric columns only.

reference_numeric_df = reference_df.select_dtypes(include="number")
production_numeric_df = production_df.select_dtypes(include="number")

numeric_columns = reference_numeric_df.columns
object_columns = reference_df.select_dtypes(include="object").columns

# NOTE(review): this mapping is not passed to the report/test-suite runs in this
# notebook (they run without a mapping or with column_mapping=None), and it names
# datetime/target columns that select_dtypes(include="number") removes from the
# numeric-only frames. Confirm which frames it is meant for before using it.
column_mapping = ColumnMapping(
    # Pass a plain list of names rather than a pandas Index.
    numerical_features=list(numeric_columns),
    # categorical_features=["gender", "category"],
    datetime_features=["trans_date_trans_time", "dob"],
    target="is_fraud",
    id="id"
)


# Create the drift report (it is run in the following cells).
data_drift_report = Report(metrics=[DataDriftPreset()])








In [None]:
# Run the drift report on the numeric-only frames (no column mapping is passed).
data_drift_report.run(reference_data=reference_numeric_df, current_data=production_numeric_df)

# Extract the per-column drift results.
results = data_drift_report.as_dict()

# Find the metric that carries 'drift_by_columns' by its content rather than by
# a hard-coded position, so the lookup does not depend on the order in which the
# preset lists its metrics.
drift_by_columns = next(
    (
        metric["result"]["drift_by_columns"]
        for metric in results["metrics"]
        if "drift_by_columns" in (metric.get("result") or {})
    ),
    None,
)
if drift_by_columns is None:
    raise KeyError("'drift_by_columns' not found in the data drift report results")

# Columns flagged as drifted; a missing 'drift_detected' key counts as False.
drifted_features = [
    col for col, details in drift_by_columns.items()
    if details.get('drift_detected', False)
]
print("Features with drift:", drifted_features)



In [None]:
# column_mapping=None => automatic inference
data_drift_report.run(
    current_data=production_numeric_df,
    reference_data=reference_numeric_df,
    column_mapping=None,
)
data_drift_report.show("inline")

# Save an HTML copy of the report under a timestamped, relative file name.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_file = f"data_drift_report_{timestamp}.html"
data_drift_report.save_html(output_file)



In [None]:
# Data stability checks: a test suite holding a single stability preset.
data_stability = TestSuite(tests=[
    DataStabilityTestPreset(),
])

# column_mapping=None => automatic inference
data_stability.run(reference_data=reference_numeric_df, current_data=production_numeric_df, column_mapping=None)
data_stability.show("inline")

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"data_stability_report_{timestamp}.html"
# Save the stability suite itself: the previous code called
# data_drift_report.save_html() here, which wrote the drift report under the
# stability file name.
data_stability.save_html(output_file)

