In [2]:
import pandas as pd
import numpy as np

from sklearn import datasets, ensemble, model_selection

In [None]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.test_suite import TestSuite

from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset import RegressionPreset
from evidently.metric_preset import ClassificationPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import TextOverviewPreset

from evidently.metrics import *

from evidently.test_preset import NoTargetPerformanceTestPreset
from evidently.test_preset import DataStabilityTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.test_preset import DataDriftTestPreset

from evidently.tests import *

from evidently.tests.base_test import generate_column_tests
from evidently.metrics.base_metric import generate_column_metrics

from sklearn.preprocessing import LabelEncoder 

In [None]:
import nltk
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
reviews = pd.read_csv('spam.csv',encoding = "latin-1")

In [None]:
reviews.shape

In [None]:
reviews = reviews.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)
reviews = reviews.rename(columns={'v1': 'type', 'v2' : 'text'})

#Suppression of duplicate lines
reviews = reviews.drop_duplicates()

In [None]:
reviews.shape

In [None]:
reviews.head()

In [None]:
lb_encod = LabelEncoder()
reviews['type'] = lb_encod.fit_transform(reviews['type'])

In [None]:
reviews.head()

In [5]:
reviews2 = pd.read_csv(r'C:\Users\cesar\Documents\Dos\Entrega final de la formacion IA\E1\E1_Projet-Spam\derniere_prédiction_de_messages.csv',encoding = "latin-1")


In [6]:
reviews2

Unnamed: 0,text,type,prediction,probabilite_spam
0,he IRS is trying to reach you regarding a tax ...,?,ham,18.3
1,Hurry â your IRS tax refund is ready to be a...,?,spam,85.5
2,"You have an overdue tax refund of $1,000 waiti...",?,spam,62.7
3,Our records indicate that you have overpaid fo...,?,spam,71.3
4,We have a package for you. Click this link to ...,?,spam,88.0
5,"Myke, we have recently discovered an awaiting ...",?,spam,91.1
6,Your new billing statement for April is now av...,?,spam,73.1
7,"Greetings Clark, your AT&T billing invoice is ...",?,ham,26.7
8,Congratulations! Youâve won a free iPhone 12...,?,spam,99.8
9,You have been chosen to receive a one-time $50...,?,spam,98.8


In [None]:
# #Dataset for Data Quality and Integrity
# reviews_data = datasets.fetch_openml(name='Womens-E-Commerce-Clothing-Reviews', version=2, as_frame='auto')
# reviews = reviews_data.frame

In [None]:
reviews['prediction'] = reviews['type']

In [None]:
reviews_ref = reviews.sample(n=2000, replace=True, ignore_index=True, random_state=42) #.dropna()
reviews_cur = reviews.sample(n=2000, replace=True, ignore_index=True, random_state=81) #.dropna()

In [None]:
reviews.head()

In [None]:
column_mapping = ColumnMapping(
    target='type',
    text_features=['text']
)

In [None]:
data_drift_report = Report(metrics=[
    DataDriftPreset(num_stattest='ks', cat_stattest='psi', num_stattest_threshold=0.2, cat_stattest_threshold=0.2),
])

data_drift_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_drift_report.show(mode='inline')
# data_drift_report

In [None]:
data_drift_report.json()


In [None]:
data_quality_report = Report(metrics=[
    DataQualityPreset()
])

data_quality_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_quality_report.show(mode='inline')

In [None]:
data_quality_report.as_dict()


In [None]:
data_quality_report.json()


In [None]:
target_drift_report = Report(metrics=[
    TargetDriftPreset()
])

target_drift_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
target_drift_report.show(mode='inline')

In [None]:
target_drift_report.json()


In [None]:
regression_report = Report(metrics=[
    RegressionPreset()
])

regression_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=ColumnMapping(
    target='Rating',
    prediction='prediction',
    numerical_features=['Age', 'Positive_Feedback_Count'],
    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],
    text_features=['Review_Text', 'Title'],
    task='regression'
  )
)
regression_report.show(mode='inline')

In [None]:
regression_report.json()



In [None]:
classification_report = Report(metrics=[
    ClassificationPreset()
])
classification_report.run(reference_data=reviews.sample(n=5000, replace=False), current_data=reviews.sample(n=5000, replace=False), column_mapping=ColumnMapping(
    target='Rating',
    prediction='prediction',
    numerical_features=['Age', 'Positive_Feedback_Count'],
    categorical_features=['Division_Name', 'Department_Name', 'Class_Name'],
    text_features=['Review_Text', 'Title'],
    task='classification'
  )
)

classification_report.show(mode='inline')

In [None]:
text_overview_report = Report(metrics=[
    TextOverviewPreset(column_name="text")
])

text_overview_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
text_overview_report.show(mode='inline')

In [None]:
dataset_metrics_with_text_report = Report(metrics=[
    DatasetSummaryMetric(), 
    DatasetMissingValuesMetric(),
    DatasetCorrelationsMetric(), 
    ConflictTargetMetric(),
    ConflictPredictionMetric(),
    DatasetDriftMetric(),
    DataDriftTable(),
    TargetByFeaturesTable(columns=['Review_Text', 'Title']),
    ClassificationQualityByFeatureTable(), 
])

dataset_metrics_with_text_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
dataset_metrics_with_text_report.show(mode='inline')

In [None]:
column_metrics_with_text_report = Report(metrics=[
    ColumnSummaryMetric(column_name="Review_Text"),
    ColumnMissingValuesMetric(column_name="Review_Text"), 
    ColumnRegExpMetric(column_name="Review_Text", reg_exp=r'.*love*.'),
    ColumnDriftMetric(column_name="Review_Text"),
    ColumnSummaryMetric(column_name="Title"),
    ColumnMissingValuesMetric(column_name="Title"), 
    ColumnRegExpMetric(column_name="Title", reg_exp=r".*love*."),
    ColumnDriftMetric(column_name="Title"),
])

column_metrics_with_text_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
column_metrics_with_text_report.show(mode='inline')

In [None]:
text_specific_metrics_report = Report(metrics=[
    TextDescriptorsDriftMetric(column_name="Review_Text"),
    TextDescriptorsDistribution(column_name="Review_Text"),
    TextDescriptorsCorrelationMetric(column_name="Review_Text"),
])

text_specific_metrics_report.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
text_specific_metrics_report.show(mode='inline')

In [None]:
no_target_performance_suite = TestSuite(tests=[
    NoTargetPerformanceTestPreset()
])

no_target_performance_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
no_target_performance_suite.show(mode='inline')

In [None]:
data_stability_suite = TestSuite(tests=[
    DataStabilityTestPreset()
])

data_stability_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_stability_suite.show(mode='inline')

In [None]:
data_quality_suite = TestSuite(tests=[
    DataQualityTestPreset()
])

data_quality_suite.run(reference_data=reviews_ref, current_data=reviews_cur, column_mapping=column_mapping)
data_quality_suite.show(mode='inline')