In [3]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import evidently
#from evidently.pipeline.column_mapping import ColumnMapping
from evidently import ColumnMapping

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset,DataQualityPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset, DataQualityTestPreset
from evidently.tests import *

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
# Reference: https://www.kaggle.com/rinnqd/reduce-memory-usage

def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Signed and unsigned integer columns are narrowed within their own family
    (int8/16/32/64 or uint8/16/32/64). Float columns are narrowed to float16
    or float32 when their min/max fit, otherwise stored as float64. Columns
    of any other dtype (object, bool, datetime, category, pandas extension
    dtypes) are left untouched. Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    float16 keeps only about three significant decimal digits, so float
    columns whose range fits float16 lose precision when narrowed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are downcast. Bool, datetime,
        # object and extension dtypes either cannot be compared against the
        # numeric limits or would be silently turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Fall back to float64 when the range (or NaN/inf) fits nothing smaller.
            target = np.float64
            for candidate in narrow_floats:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            # An empty column has NaN min/max: no candidate matches and the
            # column keeps its dtype.
            target = None
            candidates = signed_ints if col_type.kind == 'i' else unsigned_ints
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a frame that reports zero bytes (no columns, empty index).
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# application_test=reduce_memory_usage(pd.read_csv('./Projet+Mise+en+prod+-+home-credit-default-risk/application_test.csv', sep=',')) 
# application_train= reduce_memory_usage(pd.read_csv('./Projet+Mise+en+prod+-+home-credit-default-risk/application_train.csv', sep=','))

In [7]:
# Data directory. Set the HOME_CREDIT_DATA_DIR environment variable to point at
# a local copy of the dataset; when it is unset the original local folder is used.
# os.path.join(..., "") guarantees the trailing separator PATH has always had.
PATH = os.path.join(
    os.environ.get(
        "HOME_CREDIT_DATA_DIR",
        "/Users/innakonar/Desktop/PythonScripts/Projet+Mise+en+prod+-+home-credit-default-risk/",
    ),
    "",
)
application_train = pd.read_csv(os.path.join(PATH, "application_train.csv"))
application_test = pd.read_csv(os.path.join(PATH, "application_test.csv"))

In [8]:
# Load the saved predictions from predictions.csv (path is relative to the
# current working directory); they are attached to application_test below.
df_loaded_predictions = pd.read_csv("predictions.csv")

In [9]:
# Predictions are attached positionally, so the file must contain exactly one
# column with one row per test application; fail with a clear message otherwise.
if df_loaded_predictions.shape[1] != 1:
    raise ValueError(
        "predictions.csv must contain exactly one column, found {}: {}".format(
            df_loaded_predictions.shape[1], list(df_loaded_predictions.columns)
        )
    )
if len(df_loaded_predictions) != len(application_test):
    raise ValueError(
        "predictions.csv has {} rows but application_test has {}".format(
            len(df_loaded_predictions), len(application_test)
        )
    )
# Assign a 1-D array rather than the 2-D DataFrame.values.
application_test['PREDICTION'] = df_loaded_predictions.iloc[:, 0].to_numpy()

In [11]:
# Show the full per-column summary; max_cols follows the frame's own width so
# the listing is never truncated if the number of columns changes.
application_train.info(max_cols=application_train.shape[1])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 122 columns):
 #    Column                        Non-Null Count   Dtype  
---   ------                        --------------   -----  
 0    SK_ID_CURR                    307511 non-null  int64  
 1    TARGET                        307511 non-null  int64  
 2    NAME_CONTRACT_TYPE            307511 non-null  object 
 3    CODE_GENDER                   307511 non-null  object 
 4    FLAG_OWN_CAR                  307511 non-null  object 
 5    FLAG_OWN_REALTY               307511 non-null  object 
 6    CNT_CHILDREN                  307511 non-null  int64  
 7    AMT_INCOME_TOTAL              307511 non-null  float64
 8    AMT_CREDIT                    307511 non-null  float64
 9    AMT_ANNUITY                   307499 non-null  float64
 10   AMT_GOODS_PRICE               307233 non-null  float64
 11   NAME_TYPE_SUITE               306219 non-null  object 
 12   NAME_INCOME_TYPE            

In [12]:
# Show the full per-column summary; max_cols follows the frame's own width so
# the listing is never truncated if the number of columns changes.
application_test.info(max_cols=application_test.shape[1])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 122 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    SK_ID_CURR                    48744 non-null  int64  
 1    NAME_CONTRACT_TYPE            48744 non-null  object 
 2    CODE_GENDER                   48744 non-null  object 
 3    FLAG_OWN_CAR                  48744 non-null  object 
 4    FLAG_OWN_REALTY               48744 non-null  object 
 5    CNT_CHILDREN                  48744 non-null  int64  
 6    AMT_INCOME_TOTAL              48744 non-null  float64
 7    AMT_CREDIT                    48744 non-null  float64
 8    AMT_ANNUITY                   48720 non-null  float64
 9    AMT_GOODS_PRICE               48744 non-null  float64
 10   NAME_TYPE_SUITE               47833 non-null  object 
 11   NAME_INCOME_TYPE              48744 non-null  object 
 12   NAME_EDUCATION_TYPE           48744 non-null

In [13]:
# Columns that identify a row or hold the label/prediction are not model features.
non_feature_columns = ['TARGET', 'PREDICTION', 'SK_ID_CURR']

# 'number' matches every integer and float width (int8 and unsigned types
# included), so the selection still works on a frame whose columns were downcast.
numerical_features = [
    col
    for col in application_train.select_dtypes(include='number').columns
    if col not in non_feature_columns
]
# Treat both raw string columns and pandas categoricals as categorical features.
categorical_features = application_train.select_dtypes(include=['object', 'category']).columns.tolist()


In [14]:
# Evidently expects a ColumnMapping instance (not a plain dict) wherever a
# `column_mapping` argument is accepted.
# NOTE(review): the report / test-suite cells in this notebook pass
# column_mapping=None or omit it, so this mapping is currently not applied.
# Also, 'PREDICTION' is renamed to 'TARGET' in application_test further down —
# confirm the intended target/prediction roles before wiring this in.
column_mapping = ColumnMapping(
    target="TARGET",
    prediction="PREDICTION",
    datetime=None,
    id="SK_ID_CURR",
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    text_features=None,
)



In [15]:
# Expose the predictions under the 'TARGET' column name, which is the name
# application_train uses, so both frames share that column.
application_test = application_test.rename(columns={"PREDICTION": "TARGET"})


In [None]:
# Reference data: application_train; current data: application_test.
# Build the data drift report.
report = Report(
    metrics=[
        DataDriftPreset(),
    ],
)
# Compute the metrics on both datasets.
report.run(
    reference_data=application_train,
    current_data=application_test,
)


In [None]:
# Render the report inline in the notebook output.
report.show(mode='inline')

In [None]:
# Build a report from the data quality preset.
data_quality_report = Report(
    metrics=[
        DataQualityPreset(),
    ],
)

# Run it with application_test as the dataset being analysed and
# application_train as the reference; no explicit column mapping is supplied.
data_quality_report.run(
    reference_data=application_train,
    current_data=application_test,
    column_mapping=None,
)

# Render the result inline in the notebook.
data_quality_report.show(mode='inline')

In [None]:
# Test suite combining the data stability and data quality presets.
stability_and_quality_presets = [DataStabilityTestPreset(), DataQualityTestPreset()]
test_suite = TestSuite(tests=stability_and_quality_presets)


In [None]:
# Evaluate the suite: application_train is the reference, application_test the
# current data; no explicit column mapping is supplied.
test_suite.run(
    reference_data=application_train,
    current_data=application_test,
    column_mapping=None,
)


In [None]:
# Render the test suite results inline in the notebook output.
test_suite.show(mode='inline')


In [None]:
# Save the test suite results as an HTML file (path is relative to the
# current working directory).
test_suite.save_html("stability_quality_report.html")


In [None]:
# Save the `report` object as an HTML file (path is relative to the current
# working directory).
report.save_html("file.html")

In [None]:
# Test suite built from the no-target performance preset.
suite = TestSuite(tests=[NoTargetPerformanceTestPreset()])

# application_train is the reference, application_test the current data.
suite.run(
    reference_data=application_train,
    current_data=application_test,
)
# Last expression of the cell: displays the suite via its notebook repr.
suite

In [None]:
# Print the current working directory: the relative paths used in this notebook
# ("predictions.csv" and the saved HTML reports) resolve against it.
import os
print(os.getcwd())
