In [1]:
import pandas as pd
from scipy.stats import ks_2samp


In [5]:
training_file_path= "artifact\\07_29_2023_11_59_24\\data_ingestion\\ingested\\train.csv"

In [6]:
testing_file_path= 'artifact\\07_29_2023_11_59_24\data_ingestion\\ingested\\test.csv'

In [7]:
train_df= pd.read_csv(training_file_path)
test_df= pd.read_csv(testing_file_path)

In [8]:
train_df.columns

Index(['class', 'aa_000', 'ac_000', 'ad_000', 'ae_000', 'af_000', 'ag_000',
       'ag_001', 'ag_002', 'ag_003',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=164)

In [9]:
test_df.columns

Index(['class', 'aa_000', 'ac_000', 'ad_000', 'ae_000', 'af_000', 'ag_000',
       'ag_001', 'ag_002', 'ag_003',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=164)

In [10]:
def get_data_drift_report(base_df, current_df, threshold= 0.5):
    report= {}
    for column in base_df.columns:
        b_df= base_df[column]
        c_df= current_df[column]
        is_same_dist= ks_2samp(b_df, c_df)
        if threshold<=is_same_dist.pvalue:
            is_found= False
        else:   
            is_found= True
        report.update({
        column: {
        'p_value': float(is_same_dist.pvalue),
        'drift_status': is_found}
            })
    return report

In [11]:
drift_report= get_data_drift_report(train_df, test_df)
drift_report

{'class': {'p_value': 1.0, 'drift_status': False},
 'aa_000': {'p_value': 0.5014534999567812, 'drift_status': False},
 'ac_000': {'p_value': 0.45178907553769865, 'drift_status': True},
 'ad_000': {'p_value': 0.8340209128807592, 'drift_status': False},
 'ae_000': {'p_value': 0.9999999983385853, 'drift_status': False},
 'af_000': {'p_value': 0.9999999999999993, 'drift_status': False},
 'ag_000': {'p_value': 1.0, 'drift_status': False},
 'ag_001': {'p_value': 0.9999999999998179, 'drift_status': False},
 'ag_002': {'p_value': 0.9998409112061302, 'drift_status': False},
 'ag_003': {'p_value': 0.6852818900773574, 'drift_status': False},
 'ag_004': {'p_value': 0.32534608191746717, 'drift_status': True},
 'ag_005': {'p_value': 0.15055189019108572, 'drift_status': True},
 'ag_006': {'p_value': 0.4722416625279099, 'drift_status': True},
 'ag_007': {'p_value': 0.3601172260069937, 'drift_status': True},
 'ag_008': {'p_value': 0.5548427987360498, 'drift_status': False},
 'ag_009': {'p_value': 0.946

In [12]:
from sensor.utils.main_utils import write_yaml_file
import os, sys

In [13]:
write_yaml_file(file_path= os.path.join(os.getcwd(), 'report.yaml'), content= drift_report, replace= True)

In [14]:
df= pd.read_csv(training_file_path)

In [16]:
df.head(20)

Unnamed: 0,class,aa_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,12,48.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,...,216.0,92.0,64.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0
1,neg,58254,2130706000.0,1472.0,0.0,0.0,0.0,0.0,0.0,0.0,...,741986.0,341960.0,579446.0,491444.0,319394.0,112148.0,55792.0,1434.0,0.0,0.0
2,neg,8,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.0,16.0,20.0,46.0,4.0,0.0,0.0,0.0,0.0,0.0
3,neg,2618,92.0,60.0,0.0,0.0,0.0,0.0,24.0,108250.0,...,5360.0,3452.0,9544.0,16656.0,19664.0,53026.0,678.0,0.0,0.0,0.0
4,neg,32282,1140.0,1004.0,0.0,0.0,0.0,0.0,0.0,0.0,...,322130.0,146460.0,285444.0,269008.0,151928.0,87216.0,129828.0,3566.0,0.0,0.0
5,neg,32810,,,0.0,0.0,0.0,0.0,0.0,0.0,...,274144.0,126084.0,252082.0,270834.0,205688.0,117554.0,188814.0,8606.0,0.0,0.0
6,neg,254,18.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,544.0,204.0,860.0,1376.0,1246.0,6322.0,0.0,0.0,0.0,0.0
7,neg,2168,56.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4198.0,1850.0,4404.0,86032.0,978.0,84.0,50.0,10.0,0.0,0.0
8,neg,38024,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,289008.0,150976.0,318968.0,206590.0,165736.0,136416.0,347438.0,22926.0,0.0,0.0
9,neg,39886,2130706000.0,250.0,0.0,0.0,0.0,0.0,0.0,0.0,...,150368.0,77882.0,170118.0,244138.0,717966.0,337942.0,30006.0,48.0,0.0,0.0
