In [1]:
import os
import pandas
# Move the working directory one level up so the relative paths used in the
# cells below (e.g. './raw_data/...', 'schemas/...') resolve from there.
# NOTE(review): the path is relative, so every re-run of this cell moves up
# one more level -- run it exactly once per kernel session.
os.chdir('../')
# Last expression of the cell: echoes the resulting working directory.
os.getcwd()

'c:\\Users\\Marina\\Desktop\\cicd-project'

### Data Schema Validation

Checking that the data about to be passed through the pipeline has the same structure (value ranges, category names, column names, non-null values) as the data on which the model and the pipeline were trained. This prevents downstream problems and ensures that new data fits both the preprocessing pipeline and the schema the model expects.

In [4]:
import pandera as pa
from pandera import Column, DataFrameSchema

# Schema for the student-performance data, written with pandera's
# DataFrameSchema / Column object syntax.

def _categorical_column(allowed_values):
    """Build a non-nullable string column restricted to ``allowed_values``."""
    return Column(
        str,
        nullable=False,
        checks=pa.Check.isin(allowed_values)
    )


def _score_column():
    """Build a non-nullable integer column limited to the inclusive range 0-100."""
    return Column(
        int,
        nullable=False,
        checks=[
            pa.Check.greater_than_or_equal_to(0),
            pa.Check.less_than_or_equal_to(100)
        ]
    )


student_performance_schema = DataFrameSchema({
    # Categorical columns: each one accepts only the listed labels
    "gender": _categorical_column(['female', 'male']),
    "race_ethnicity": _categorical_column(
        ['group A', 'group B', 'group C', 'group D', 'group E']
    ),
    "parental_level_of_education": _categorical_column([
        "bachelor's degree",
        "master's degree",
        "associate's degree",
        "some college",
        "high school",
        "some high school"
    ]),
    "lunch": _categorical_column(['standard', 'free/reduced']),
    "test_preparation_course": _categorical_column(['none', 'completed']),

    # Numeric score columns: all share the same 0-100 bounds
    "math_score": _score_column(),
    "reading_score": _score_column(),
    "writing_score": _score_column(),
})


#### Positive Testcase

In [32]:
import pandas as pd
# Load the student dataset that serves as the positive (expected-to-pass)
# input for the schema validation in the next cell.
df = pd.read_csv('./raw_data/students.csv')

In [23]:
# Run the schema check on the reference data; a SchemaError means at least
# one column violated its declared type, nullability or value constraints.
try:
    validated_schema_df = student_performance_schema.validate(df)
except pa.errors.SchemaError as schema_error:
    print("Validation failed!")
    print(schema_error)
else:
    print("Validation successful!")

Validation successful!


#### Negative Testcase

In [14]:
import pandas as pd
# Load the negative-case fixture; the next cell expects the schema
# validation of this frame to fail.
wrong_df = pd.read_csv('./research/data_validation_wrong_schema.csv')

In [15]:
# Run the same schema check on the negative-case frame and report the
# first violation pandera raises.
try:
    validated_df_wrong_df = student_performance_schema.validate(wrong_df)
except pa.errors.SchemaError as schema_error:
    print("Validation failed!")
    print(schema_error)
else:
    print("Validation successful!")

Validation failed!
Column 'race_ethnicity' failed element-wise validator number 0: isin(['group A', 'group B', 'group C', 'group D', 'group E']) failure cases: group F, group G


### Statistical Checking

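Checking for data drift, i.e. whether the statistical distribution of new data has shifted away from the data the model was trained on.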

#### Precomputing Reference Statistics


In [21]:
from pathlib import Path

In [71]:
import pandas.api.types as pd_types

# Load the training data; its column dtypes decide how each feature is profiled.
df = pd.read_csv("raw_data/students.csv")

# Numeric features: every column pandas reports as a numeric dtype.
numeric_features = [col for col in df.columns if pd_types.is_numeric_dtype(df[col])]

# Categorical features: text columns. Both checks are needed -- text read from
# CSV is stored as `object` by default, but a dedicated pandas string dtype is
# not an object dtype and would otherwise be left out silently.
categorical_features = [
    col
    for col in df.columns
    if pd_types.is_object_dtype(df[col]) or pd_types.is_string_dtype(df[col])
]



In [36]:
import pandas as pd
import json

# Reference (training) data from which the baseline statistics are derived
reference_data = pd.read_csv("raw_data/students.csv")

# Maps feature name -> summary statistics used later for drift detection
reference_stats = {}

# Numerical features: centre, spread and quartiles
for feature in numeric_features:
    feature_values = reference_data[feature]
    reference_stats[feature] = {
        "mean": feature_values.mean(),
        "std": feature_values.std(),
        "percentiles": feature_values.quantile([0.25, 0.5, 0.75]).to_dict()
    }

# Categorical features: relative frequency of each category
for feature in categorical_features:
    reference_stats[feature] = {
        "value_counts": reference_data[feature].value_counts(normalize=True).to_dict(),
    }

# Save to a JSON file. The target directory is created first so that the cell
# also works on a fresh checkout where `schemas/` does not exist yet.
reference_stats_file = Path("schemas/reference_stats.json")
reference_stats_file.parent.mkdir(parents=True, exist_ok=True)
with open(reference_stats_file, "w") as f:
    json.dump(reference_stats, f)


#### Drift detection

In [63]:
import json
import numpy as np
import pandas as pd
from typing import Union, Dict, Any

def _numeric_feature_is_stable(feature_data, stats, numerical_tolerance):
    """Return True when a numeric column's mean and spread stay close to the reference."""
    current_mean = feature_data.mean()
    # An empty or all-missing column yields a NaN mean. Every comparison with
    # NaN is False, so without this guard such a column would silently pass.
    if pd.isna(current_mean):
        return False

    reference_std = stats['std']

    # Mean check: allowed shift is expressed in reference standard deviations.
    if abs(current_mean - stats['mean']) > numerical_tolerance * reference_std:
        return False

    current_std = feature_data.std()

    # A constant reference column has no meaningful ratio; any measurable
    # spread in the new data is then treated as a change.
    if reference_std == 0:
        return not current_std > 0

    # Spread check: the standard deviation may shrink or grow by at most 1.5x.
    # A NaN ratio (e.g. fewer than two observations) fails both comparisons,
    # so the spread check is effectively skipped in that case.
    max_std_ratio = 1.5
    std_ratio = current_std / reference_std
    if std_ratio < 1 / max_std_ratio or std_ratio > max_std_ratio:
        return False

    return True


def _categorical_feature_is_stable(feature_data, stats, categorical_tolerance):
    """Return True when every category's share stays within the tolerance."""
    current_proportions = feature_data.value_counts(normalize=True).to_dict()
    reference_proportions = stats['value_counts']

    # Categories that only appear in the new data count as drift as well, so
    # they are checked against an implicit reference share of 0.
    unseen_categories = [
        category for category in current_proportions
        if category not in reference_proportions
    ]

    for category in list(reference_proportions) + unseen_categories:
        current_share = current_proportions.get(category, 0)
        reference_share = reference_proportions.get(category, 0)
        if abs(current_share - reference_share) > categorical_tolerance:
            return False

    return True


def statistical_dataframe_validation(
    data_frame_to_validate: pd.DataFrame, 
    reference_stats_path: Union[str, Dict[str, Any]],
    numerical_tolerance: float = 3.0,
    categorical_tolerance: float = 0.1
) -> bool:
    """
    Validate a DataFrame against reference statistics for data drift detection.

    Features listed in the reference statistics but missing from the DataFrame
    are skipped. Reference percentiles, if present, are not compared.
    
    Parameters:
    -----------
    data_frame_to_validate : pd.DataFrame
        The DataFrame to be validated
    reference_stats_path : str or dict
        Path to JSON file or dictionary containing reference statistics.
        Numeric features carry 'mean' and 'std'; categorical features carry
        'value_counts' (category -> proportion).
    numerical_tolerance : float, optional
        Maximum allowed shift of the mean, in reference standard deviations
        (default: 3.0)
    categorical_tolerance : float, optional
        Maximum allowed absolute difference in a category's proportion
        (default: 0.1)
    
    Returns:
    --------
    bool
        True if DataFrame passes validation, False otherwise. A numeric
        feature with no valid values, or a category unseen in the reference
        whose share exceeds the tolerance, fails validation.
    """
    # Load reference statistics
    if isinstance(reference_stats_path, str):
        with open(reference_stats_path, 'r') as f:
            reference_stats = json.load(f)
    else:
        reference_stats = reference_stats_path
    
    # Validate each feature
    for feature, stats in reference_stats.items():
        # Skip if feature not in dataframe
        if feature not in data_frame_to_validate.columns:
            continue

        feature_data = data_frame_to_validate[feature]

        # The keys present in the reference entry decide the kind of check.
        if 'mean' in stats:
            if not _numeric_feature_is_stable(feature_data, stats, numerical_tolerance):
                return False
        elif 'value_counts' in stats:
            if not _categorical_feature_is_stable(feature_data, stats, categorical_tolerance):
                return False
    
    return True

#### Testing:

In [64]:
# Completely different data distribution: drift should be reported
wrong_dist_df = pd.read_excel('research/data_validation_wrong_distribution.xlsx')
stats_path = 'schemas/reference_stats.json'
validated = statistical_dataframe_validation(
    data_frame_to_validate=wrong_dist_df,
    reference_stats_path=stats_path,
    numerical_tolerance=3,
    categorical_tolerance=0.1,
)
validated

False

In [None]:
# Slightly different data distribution, checked with the default-sized tolerances
sightly_diff_dist_df = pd.read_excel('research/data_validation_sightly_different_dist.xlsx')
stats_path = 'schemas/reference_stats.json'
validated = statistical_dataframe_validation(
    data_frame_to_validate=sightly_diff_dist_df,
    reference_stats_path=stats_path,
    numerical_tolerance=3,
    categorical_tolerance=0.1,
)
validated

True

In [None]:
# Slightly different data distribution, with a lower numerical threshold
sightly_diff_dist_df = pd.read_excel('research/data_validation_sightly_different_dist.xlsx')
stats_path = 'schemas/reference_stats.json'
validated = statistical_dataframe_validation(
    data_frame_to_validate=sightly_diff_dist_df,
    reference_stats_path=stats_path,
    numerical_tolerance=0.3,
    categorical_tolerance=0.1,
)
validated

False