In [1]:
import os
import sys
import pandas as pd

import logging 

logging.getLogger("snowflake").setLevel(logging.WARNING)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
logging.getLogger("snowflake.snowpark").setLevel(logging.WARNING)


%pwd
os.chdir("../")

# Add the absolute path to src/ so Python can find automatch
src_path = os.path.abspath("src")
if src_path not in sys.path:
    sys.path.append(src_path)
    
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [2]:
# Display the current working directory (same check as the last line of the setup cell).
%pwd

'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [3]:
from autoMatch.utils.snowflake_utils import get_snowpark_session
# Create the session object that is later passed to DataValidation.validate_all_columns.
# Per the captured output, this call starts a browser-based identity-provider login.
session = get_snowpark_session()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/e2ba81b8-03fe-407c-96a1-f4bc0f512e7d/saml2?SAMLRequest=nZJNc5swEIb%2FCqOeAUHwBxrbGcduxu64KbVx%2BnETsDhqQKJaEZz%2B%2Bso4nkkPyaE3Rjy7erTvTq6PdeU8gUah5JQEHiUOyFwVQh6mZJ%2FeumPioOGy4JWSMCXPgOR6NkFeVw2bt%2BZBbuF3C2gc20gi639MSaslUxwFMslrQGZytpt%2F3rDQo6zRyqhcVeRVyfsVHBG0sYaXkgKF1XswpmG%2B33Wd1115Sh%2F8kFLq09i31An5cOGP9k1v8IFPoxNvCYsnL243Qp5H8J5WdoaQrdI0cZMvu5Q484vqQklsa9A70E8ih%2F12cxZAa3CziyMajr3Ozs2FVqsGPP6n1eChVF1Z8UfIVd20xnb37JdfQuFX6iDsANbLKWkeRaHNt032KeOr77Aa7WuM7trjV9x85NtwEWdLnhz2afqr2OT3P3Li3F8SDk8JrxFbWMtTrsYe0XDgBtSlcRpELBiyaOjF0eAncZbWT0hu%2BsqLfO%2Fh1SLXClVplKyEhN4SwoyPg2zs0qsS3IiOcjce8sAtoyyn5SAIYVT4p7RDct4g1ovo2f%2FOZeK%2F7vKylHc2p%2FUyUZXIn51bpWtu3o4x8IL%2BRBRu2aMMai6qeVFoQLRxVpXqFhq4sbtvdAvEn51v%2

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage."""
    # Working directory of the stage; created by ConfigurationManager.get_data_validation_config.
    root_dir: str
    # Database and schema used to build the fully qualified "<database>.<schema>.<table>" name.
    database: str
    schema: str
    # Maps a key of `table_schema` to the name of the table that is queried.
    input_table: dict
    # Expected schema per table key; each entry exposes `columns`
    # (column name -> text that must appear in the column's datatype).
    table_schema: dict
    # Path of the file the "Validation status: ..." line is written to.
    status_file: str


In [None]:
from autoMatch.constants import *
from autoMatch.utils.common import read_yaml, create_directories
from autoMatch import logger

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root is created up front, before any stage config is requested.
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings and create the stage directory.

        Returns:
            A DataValidationConfig filled from the `data_validation` section of
            the configuration file and of the schema file.
        """
        stage_cfg = self.config.data_validation
        stage_schema = self.schema.data_validation

        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=stage_cfg.root_dir,
            database=stage_cfg.database,
            schema=stage_cfg.schema,
            input_table=stage_cfg.input_table,
            table_schema=stage_schema.table_schema,
            status_file=stage_cfg.STATUS_FILE,
        )

In [None]:
class DataValidation:
    """Check the configured tables against the column schema declared in the config."""

    def __init__(self, config: DataValidationConfig):
        """Store the stage settings.

        Args:
            config: Settings naming the database, schema, tables, expected
                columns and status-file path.
        """
        self.config = config

    def validate_all_columns(self, session) -> bool:
        """Validate every configured table and record the overall result.

        A table fails when a schema column is missing from it, or when a
        column's expected type text does not occur in the lower-cased datatype
        of the matching dataframe column. Columns present in the table but
        absent from the schema are only logged.

        The status file is written exactly once, after all tables have been
        processed. If a table raises, the file records ``False`` before the
        exception propagates, so it never reports success for an aborted run.

        Args:
            session: Object exposing ``table(name)``; the returned dataframe
                must provide ``columns`` and ``schema.fields``.

        Returns:
            True when every table passed (also when no table is configured),
            False otherwise.

        Raises:
            Exception: Whatever the session or file access raises is propagated
                unchanged.
        """
        # Pessimistic default: only flipped once every table has been checked.
        validation_status = False
        try:
            # A list (not a generator) so that all() cannot short-circuit:
            # every table is checked and logged even after a failure.
            results = [
                self._validate_table(session, table_key, snowflake_table)
                for table_key, snowflake_table in self.config.input_table.items()
            ]
            validation_status = all(results)
        finally:
            with open(self.config.status_file, 'w') as f:
                f.write(f"Validation status: {validation_status}")

        return validation_status

    def _validate_table(self, session, table_key, snowflake_table) -> bool:
        """Validate one table; return True when no column is missing or mistyped."""
        cfg = self.config
        df = session.table(f"{cfg.database}.{cfg.schema}.{snowflake_table}")

        # Key the expected columns by lower-cased name so that the presence
        # check and the type check use the same case-insensitive comparison.
        expected = {
            col_name.lower(): (col_name, col_type)
            for col_name, col_type in cfg.table_schema[table_key].columns.items()
        }
        actual = {col.lower() for col in df.columns}

        table_ok = True

        # Sorted only to make the log order deterministic.
        for col in sorted(expected.keys() - actual):
            table_ok = False
            logger.info(f"Column {col} from schema is missing in the dataframe")

        for col in sorted(actual - expected.keys()):
            logger.info(f"Column {col} is present in the dataframe but not specified in the schema")

        for field in df.schema.fields:
            match = expected.get(field.name.lower())
            if match is None:
                continue
            col_name, col_type = match
            # Substring match: the expected type text must occur in the
            # lower-cased datatype representation.
            if str(col_type).lower() not in str(field.datatype).lower():
                table_ok = False
                logger.info(f"Schema column {col_name} with type ({col_type}) does not match dataframe column {field.name} ({field.datatype})")

        logger.info(f"Validation check for table {snowflake_table}: {'SUCCESS' if table_ok else 'FAIL'}")
        return table_ok

In [7]:
# Run the data-validation stage end to end. Exceptions propagate as they are;
# the former try/except only re-raised them.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
# NOTE: despite its name, `df` holds the boolean returned by validate_all_columns.
df = data_validation.validate_all_columns(session)

[2025-10-09 16:16:49,854: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-09 16:16:49,859: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-09 16:16:49,867: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-10-09 16:16:49,869: INFO: common: created directory at: artifacts]
[2025-10-09 16:16:49,871: INFO: common: created directory at: artifacts/data_validation]
[2025-10-09 16:16:49,961: INFO: 3478419935: Validation check for table candidates: SUCCESS]
[2025-10-09 16:16:50,149: INFO: 3478419935: Validation check for table italian_cities: SUCCESS]
