# EazyML Data Quality Template

## Install Dependencies and Define Imports

In [None]:
!pip install --upgrade eazyml-data-quality
!pip install --upgrade eazyml-automl
!pip install gdown python-dotenv

In [2]:
import os
from eazyml_data_quality import (
    ez_init,
    ez_data_quality
)

from eazyml import ez_display_df, ez_display_json
import gdown
import pandas as pd

from dotenv import load_dotenv
# Load environment variables from a .env file, if one is found, so that
# os.getenv('EAZYML_ACCESS_KEY') below can pick up the key without it being
# hardcoded in the notebook. The call's return value is shown as the cell output.
load_dotenv()

True

## 1. Initialize EazyML

In [3]:
# Take the access key from the process environment; it is None when the
# EAZYML_ACCESS_KEY variable is not set.
eazyml_access_key = os.environ.get('EAZYML_ACCESS_KEY')
ez_init(access_key=eazyml_access_key)

{'success': True,
 'message': 'Initialized successfully. You may revoke your consent to sharing usage stats anytime. You have exclusive paid access.'}

## 2. Define Dataset Files and Outcome Variable

In [None]:
# Google Drive folder to download.
# NOTE(review): presumably this folder provides the 'data' directory with the
# Heart_Attack CSV files referenced in the next cell - confirm.
dataset_folder_id = '1EobxYR3pg_Z3Sd4sETfe4aJLAsT98fL2'
gdown.download_folder(id=dataset_folder_id)

In [5]:
# Directory (relative to the working directory) that holds the CSV files
data_dir = 'data'

# Train and test files that will be passed to the EazyML APIs
eazyml_train_file, eazyml_test_file = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("Heart_Attack_traindata.csv", "Heart_Attack_testdata.csv")
)

# Name of the column holding the outcome of interest
outcome = "class"

## 3. EazyML Data Quality Assessment

### 3.1 Call ez_data_quality API, Perform All Checks

In [6]:
# Every check is switched on with the same string flag; "prediction_data"
# instead carries the path of the test file.
enabled = "yes"
options = dict(
    data_shape=enabled,
    data_emptiness=enabled,
    data_balance=enabled,
    impute=enabled,
    data_outliers=enabled,
    remove_outliers=enabled,
    outcome_correlation=enabled,
    data_drift=enabled,
    model_drift=enabled,
    prediction_data=eazyml_test_file,
    data_completeness=enabled,
    data_correctness=enabled,
)

# Run all checks on the training file; the result sections are read from
# `res` by the cells that follow.
res = ez_data_quality(eazyml_train_file, outcome, options)

### 3.2 Data Quality Assessment Results

#### 3.2.1 Data Quality Alerts: Check if Any Alerts Are True

In [7]:
# Keep the 'data_bad_quality_alerts' section of the result in `alerts`
# and render it as JSON.
alerts = res['data_bad_quality_alerts']
ez_display_json(alerts)

<IPython.core.display.JSON object>

#### 3.2.2 Data Completeness?

In [8]:
# Render the 'data_completeness_quality' section of the result.
completeness_report = res['data_completeness_quality']
ez_display_json(completeness_report)

<IPython.core.display.JSON object>

#### 3.2.3 Data Balanced?

In [9]:
# Render the 'data_balance_quality' section of the result.
balance_report = res['data_balance_quality']
ez_display_json(balance_report)

<IPython.core.display.JSON object>

#### 3.2.4 Data Correctness?

In [10]:
# Render the 'data_correctness_quality' section of the result.
correctness_report = res['data_correctness_quality']
ez_display_json(correctness_report)

<IPython.core.display.JSON object>

#### 3.2.5 Data Correlations? Look for Strongly Correlated Features

In [11]:
# Threshold on |correlation| above which a feature pair is reported as strong.
STRONG_CORR_THRESHOLD = 0.90

# Mapping feature -> {other feature: correlation} taken from the result.
correlations = res['data_correlation_quality']['data_correlation']
feat_list = list(correlations.keys())

corr_rows = []       # one {feature: correlation} row per feature, in feat_list order
corr_dict = dict()   # feature -> {other feature: correlation}, strong pairs only

for feat, related in correlations.items():
    # Pairs that the report does not list default to 0.0; the diagonal is 1.0.
    corr_row = dict.fromkeys(feat_list, 0.0)
    corr_row[feat] = 1.0
    corr_val = dict()
    for another_feat, corr in related.items():
        corr_row[another_feat] = corr
        # Compare the magnitude so strong negative correlations are flagged too.
        if abs(corr) > STRONG_CORR_THRESHOLD:
            corr_val[another_feat] = corr
    corr_rows.append(corr_row)
    if corr_val:
        corr_dict[feat] = corr_val

# Build the square matrix once instead of growing the frame row by row.
# `columns=feat_list` keeps only the features that have a row of their own.
df_corr = pd.DataFrame(corr_rows, index=feat_list, columns=feat_list)

In [12]:
# Render corr_dict, which holds only the feature pairs that exceeded the
# correlation threshold in the previous cell (empty when there were none).
ez_display_json(corr_dict)

<IPython.core.display.JSON object>

#### 3.2.6 Data Emptiness?

In [13]:
# Render the 'data_emptiness_quality' section of the result.
emptiness_report = res['data_emptiness_quality']
ez_display_json(emptiness_report)

<IPython.core.display.JSON object>

#### 3.2.7 Data Dimension? Is it Adequate?

In [14]:
# Render the 'data_shape_quality' section of the result.
shape_report = res['data_shape_quality']
ez_display_json(shape_report)

<IPython.core.display.JSON object>

#### 3.2.8 Data Outliers? 

In [15]:
# Build a table of the reported outlier rows and preview the first few.
# Only the failures that indicate "nothing to show" are handled: a missing
# key or index (LookupError), a payload that cannot be subscripted
# (TypeError), or data that cannot be shaped into a table (ValueError).
# Anything else, including a failure while displaying, is left to surface
# instead of being reported as "no outlier".
try:
    outliers = res['data_outliers_quality']['outliers']
    outlier_df = pd.DataFrame(data=outliers['data'],
                              columns=outliers['columns'],
                              index=outliers['indices'])
except (LookupError, TypeError, ValueError):
    print("no outlier")
else:
    ez_display_df(outlier_df.head())

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
1028,68,1,89,145,68,134.0,0.706,10.0,positive
7,63,1,60,214,82,87.0,300.0,2.37,positive
12,64,1,60,199,99,92.0,3.43,5.37,positive
530,31,0,64,130,70,263.0,142.6,0.003,positive
1047,55,0,96,105,70,66.0,300.0,0.003,positive


#### 3.2.9 Data Drift (Between Train and Test Datasets)

In [16]:
# Render the 'data_drift_analysis' part of the drift section of the result.
data_drift_report = res['drift_quality']['data_drift_analysis']
ez_display_json(data_drift_report)

<IPython.core.display.JSON object>

In [17]:
# Per-feature values stored under 'feature : p_value' in the
# 'ks_data_drift_analysis' part of the result.
ks_drift = res['drift_quality']['data_drift_analysis']['ks_data_drift_analysis']['feature : p_value']

# Collect and print every feature whose value is below 0.05.
drift_columns = []
for feature, p_value in ks_drift.items():
    if not p_value < 0.05:
        continue
    drift_columns.append(feature)
    print(feature, p_value)

impluse 0.009


#### 3.2.10 Model Drift (Between Train and Test Datasets)

In [18]:
# Render the 'model_drift_analysis' part of the drift section of the result.
model_drift_report = res['drift_quality']['model_drift_analysis']
ez_display_json(model_drift_report)

<IPython.core.display.JSON object>

In [19]:
# Per-feature values stored under 'feature : OF_I' in the
# 'interval_model_drift_analysis' part of the result.
interval_drift = res['drift_quality']['model_drift_analysis']['interval_model_drift_analysis']['feature : OF_I']

# Collect and print every feature whose value is below 0.05.
model_drift_columns = []
for feature, drift_score in interval_drift.items():
    if not drift_score < 0.05:
        continue
    model_drift_columns.append(feature)
    print(feature, drift_score)