In [36]:
import pandas as pd
import numpy as np
from pathlib import Path
import random
import json

import evidently
print(evidently.__version__)

# from evidently.test_suite import TestSuite
# from evidently.test_preset import DataStabilityTestPreset
from evidently.metric_preset import DataDriftPreset

from evidently.report import Report
from evidently.pipeline.column_mapping import ColumnMapping



0.4.40


In [37]:
# In DEMO_MODE we force 25% of the records to have a non-NULL 'fraud_confirmed' feature
# Otherwise all records having a non-NULL 'fraud_confirmed' feature are used to create the CSV file
k_DEMO_MODE = True

# In DEBUG_MODE we force 1 record have a non-NULL 'fraud_confirmed' feature
k_DEBUG_MODE = True



# k_Current_dir   = Path.cwd()
# print(k_Current_dir)
k_AssetsDir     = "../../04_data"
k_FileName      = "fraud_test.csv"
k_AssetsURL     = "https://lead-program-assets.s3.eu-west-3.amazonaws.com/M05-Projects/fraudTest.csv"

k_Reference_Sample = "reference_sample.csv"
k_Production_Sample = 'production_sample.csv'
k_Production_Drifted_Sample = 'production_drifted_sample.csv'




In [38]:
# -----------------------------------------------------------------------------
filename_in = Path(k_AssetsDir)/k_FileName
df = pd.read_csv(filename_in)

# Alternative (AWS S3 bucket)
# df = pd.read_csv(k_AssetURL)

df.columns = df.columns.str.lower()
df.head(3)

Unnamed: 0,unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0


In [39]:
def generate_datasets() -> None:
    """
    Create two random samples if there are sufficient rows in the dataset.
    """
    # Define the data directory and file paths
    # data_dir = Path('../../data')
    fraud_test_file = Path(k_AssetsDir)/k_FileName
    reference_sample_file = Path(k_AssetsDir) / k_Reference_Sample
    production_sample_file = Path(k_AssetsDir) / k_Production_Sample 

    # Check if the input file exists
    if not fraud_test_file.exists():
        print(f"File not found: {fraud_test_file}")
        return

    # Load the dataset
    df = pd.read_csv(fraud_test_file)

    # Determine the number of rows
    num_rows: int = len(df)
    print(f"Number of rows in the dataset: {num_rows}")

    # Stop if there are less than 2000 rows
    if num_rows < 2000:
        print("Not enough rows in the dataset. Exiting.")
        return

    # Take 1_000 random rows for the reference sample
    reference_indices = random.sample(range(num_rows), 1000)
    reference_sample = df.iloc[reference_indices]

    # Take another 1_000 random rows for the production sample (different from the first set)
    remaining_indices = list(set(range(num_rows)) - set(reference_indices))
    production_indices = random.sample(remaining_indices, 1000)
    production_sample = df.iloc[production_indices]

    # Save the samples to their respective files
    reference_sample.to_csv(reference_sample_file, index=False)
    production_sample.to_csv(production_sample_file, index=False)

    print(f"Saved reference sample to: {reference_sample_file}")
    print(f"Saved production sample to: {production_sample_file}")

generate_datasets()


Number of rows in the dataset: 555719
Saved reference sample to: ..\..\04_data\reference_sample.csv
Saved production sample to: ..\..\04_data\production_sample.csv


In [40]:
reference = pd.read_csv(Path(k_AssetsDir) / k_Reference_Sample)
reference.rename(columns={reference.columns[0]: 'id'}, inplace=True)
production = pd.read_csv(Path(k_AssetsDir) / k_Production_Sample)
production.rename(columns={production.columns[0]: 'id'}, inplace=True)
reference.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,393491,2020-11-22 20:10:33,4476840372112,fraud_Volkman PLC,food_dining,23.77,Steven,Walters,M,3206 Hall Divide Suite 282,...,34.6689,-86.2296,3395,"Editor, commissioning",1979-01-21,655b87da8bf09f398a570007bd5c9c6f,1385151033,35.477862,-86.01894,0
1,185495,2020-08-25 03:47:49,4025612008285111,"fraud_Rodriguez, Yost and Jenkins",misc_net,2.03,Krystal,Key,F,23142 Montoya Island Apt. 742,...,41.1762,-79.1976,1274,Maintenance engineer,1949-03-20,0cd10385dcf7c08e550fa8cfca3527ce,1377402469,41.849417,-79.161127,0
2,202942,2020-08-31 15:42:48,376944481517097,fraud_Jacobi Inc,health_fitness,5.73,Tiffany,Stephens,F,1447 Ryan Lodge,...,30.4983,-88.3282,15647,"Scientist, physiological",1957-12-26,5de8c222605151d916eb8cf380f1d7f0,1377963768,29.598724,-88.96416,0
3,46363,2020-07-07 01:34:47,340951438290556,"fraud_Cremin, Hamill and Reichel",misc_pos,28.41,Maria,Garcia,F,865 Thomas Village,...,41.0442,-73.9609,5950,Records manager,1971-07-02,9ce85231df6ed0a60dfc08603a40c932,1373160887,41.877566,-73.594759,0
4,91635,2020-07-22 22:32:46,4560004149983868183,fraud_Schmidt and Sons,shopping_net,60.7,Stacy,Villegas,F,20581 Pena Walks,...,38.8881,-104.6556,525713,Museum/gallery exhibitions officer,1992-05-09,21c4e8784585b6b2210c91f9deb37015,1374532366,38.489233,-104.949516,0


In [None]:
reference.describe(include='all')


In [None]:
# Montants (amt) : 30 % des transactions ont été augmentées de 50 %.
# Latitudes et longitudes (merch_lat, merch_long) : 20 % des transactions ont subi un léger décalage.
# Catégories (category) : 30 % des transactions ont vu leur catégorie changée en faveur d'un biais.


# Créer une copie des données pour introduire un drift
drifted_data = production.copy()

# Drift 1: Modifier les montants des transactions ('amt')
# Augmenter les transactions de 50% pour un échantillon aléatoire
drifted_data['amt'] = drifted_data['amt'] * np.random.choice([1, 1.5], size=len(production), p=[0.7, 0.3])

# Drift 2: Modifier les latitudes et longitudes ('merch_lat', 'merch_long')
# Introduire un décalage artificiel pour certaines transactions
drifted_data['merch_lat'] += np.random.choice([0, 0.01], size=len(production), p=[0.8, 0.2])
drifted_data['merch_long'] += np.random.choice([0, -0.01], size=len(production), p=[0.8, 0.2])

# Drift 3: Modifier la répartition des catégories ('category')
# Introduire un biais pour favoriser certaines catégories
categories = drifted_data['category'].unique()
biased_categories = np.random.choice(categories, size=len(production), p=[0.5] + [0.5 / (len(categories) - 1)] * (len(categories) - 1))
# drifted_data['category'] = np.random.choice([drifted_data['category'], biased_categories], size=len(production), p=[0.7, 0.3])
categories = drifted_data['category'].unique()
biased_categories = np.random.choice(categories, size=len(production), p=[0.5] + [0.5 / (len(categories) - 1)] * (len(categories) - 1))

# Modifier les catégories avec une probabilité de 30%
drifted_data['category'] = np.where(
    np.random.rand(len(production)) < 0.3,  # 30% des lignes seront modifiées
    biased_categories,  # Nouvelle catégorie biaisée
    drifted_data['category']  # Catégorie actuelle
)

# Enregistrer le dataset drifté pour les tests
drifted_data.to_csv(Path(k_AssetsDir) / k_Production_Drifted_Sample, index=False)






In [44]:
drifted_data.describe(include='all')


Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
count,1000.0,1000,1000.0,1000,1000,1000.0,1000,1000,1000,1000,...,1000.0,1000.0,1000.0,1000,1000,1000,1000.0,1000.0,1000.0,1000.0
unique,,1000,,509,14,,259,338,2,567,...,,,,375,560,1000,,,,
top,,2020-12-17 22:42:06,,"fraud_Tillman, Dickinson and Labadie",misc_pos,,Jessica,Williams,F,6296 John Keys Suite 858,...,,,,"Surveyor, land/geomatics",1955-05-06,056f097a842ffb086de9c004a30322e3,,,,
freq,,1,,8,213,,21,23,550,7,...,,,,11,7,1,,,,
mean,279897.164,,4.600931e+17,,,71.30869,,,,,...,38.43612,-90.489645,87166.73,,,,1380735000.0,38.429281,-90.536996,0.004
std,161721.994019,,1.361094e+18,,,93.694474,,,,,...,5.033686,14.16723,275833.8,,,,5237118.0,5.088277,14.190561,0.063151
min,99.0,,60416210000.0,,,1.04,,,,,...,20.0271,-165.6723,23.0,,,,1371819000.0,19.22235,-166.257153,0.0
25%,140507.0,,180056200000000.0,,,11.71875,,,,,...,34.64455,-97.2893,742.5,,,,1376069000.0,34.815069,-97.389318,0.0
50%,281403.5,,3525083000000000.0,,,54.56,,,,,...,39.3764,-87.5917,2464.0,,,,1380904000.0,39.306857,-87.356796,0.0
75%,427677.0,,4661522000000000.0,,,92.56875,,,,,...,41.5744,-80.1275,19638.0,,,,1386095000.0,41.935965,-80.314446,0.0


In [30]:
# Fait le liste des colonnes num qu'on veut surveiller
numeric_columns = production.select_dtypes(include="number").columns
# ColumnMapping(numerical_features=numeric_columns)

object_columns = production.select_dtypes(include="object").columns
print(object_columns)
# ColumnMapping()

column_mapping = ColumnMapping(
    numerical_features=numeric_columns,
    categorical_features=["gender"],
    datetime_features=["trans_date_trans_time"],
    target="is_fraud",
    prediction="predicted_fraud",
    id="id"
)


Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num'],
      dtype='object')


In [32]:
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=reference, current_data=production)

results = report.as_dict()

# print(json.dumps(results, indent=4))

# features = results['metrics'][0]['result']['details']['features']
# drifted_features = [feature for feature, details in features.items() if details['drift_detected']]

# print("Features with drift:", drifted_features)


results = report.as_dict()

# Extraire les détails sur les colonnes
drift_by_columns = results['metrics'][1]['result']['drift_by_columns']

# Récupérer les features ayant drifté
drifted_features = [
    col for col, details in drift_by_columns.items()
    if details.get('p_value', 1) < details.get('stattest_threshold', 0.05)  # Vérifie la p-value
]

print("Features with drift:", drifted_features)

  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp
  terms = (f_obs - f_exp)**2 / f_exp


Features with drift: []
