In [1]:
import os
import sys
import pandas as pd


%pwd
os.chdir("../")

# Add the absolute path to src/ so Python can find automatch
src_path = os.path.abspath("src")
if src_path not in sys.path:
    sys.path.append(src_path)
    
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [2]:
# Show the current working directory to confirm the setup done in the first cell.
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [3]:
from autoMatch.utils.snowflake_utils import get_snowpark_session
# Open the Snowpark session that the ingestion cells below receive as `session`.
# NOTE(review): per the stored output, this starts a browser-based identity-provider
# login and prints the SAML request URL; consider clearing this cell's output
# before sharing or committing the notebook.
session = get_snowpark_session()

[2025-10-09 14:35:25,448: INFO: connection: Snowflake Connector for Python Version: 3.7.0, Python Version: 3.10.11, Platform: Windows-10-10.0.26100-SP0]
[2025-10-09 14:35:25,448: INFO: connection: This connection is in OCSP Fail Open Mode. TLS Certificates would be checked for validity and revocation status. Any other Certificate Revocation related exceptions or OCSP Responder failures would be disregarded in favor of connectivity.]
Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/e2ba81b8-03fe-407c-96a1-f4bc0f512e7d/saml2?SAMLRequest=nZJPb%2BIwEMW%2FSuQ9x3HCfwuoKKgq2rbLklCpezPJBKw6dmo7BPj06wSQuof2sDfLfjO%2F53kzvjsWwjuANlzJCQoxQR7IVGVc7iZokzz4Q%2BQZy2TGhJIwQScw6G46NqwQJZ1Vdi%2FX8FGBsZ5rJA1tHiao0pIqZrihkhVgqE1pPHt%2BohEmlBkD2jocupZkhjvW3tqSBkFd17juYKV3QUQIC

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    Annotations are informational only; the dataclass does not validate them.
    """
    # Working directory for this stage's artifacts.
    root_dir: str
    # Database, schema and table name joined into the fully qualified input table.
    database: str
    schema: str
    input_table: str
    # Name of the table the filtered input data is written to.
    output_table: str
    # Path of the spreadsheet listing Italian cities.
    italian_cities_file: str
    # Name of the table the cities data is written to.
    output_table_italian_cities: str
    # Columns to select from the input table; only iterated by the reader, so a
    # mapping contributes its keys. NOTE(review): confirm whether the schema
    # file really provides a mapping here or a plain list.
    columns: dict
    # Inclusive bounds applied to the "date_added" column of the input table.
    start_date: str
    end_date: str
    # Column names of the cities spreadsheet, after header normalisation.
    # These are lists: the reader concatenates them with "+" and uses the
    # result to select columns, which a dict would not support.
    italian_cities_string_columns: list
    italian_cities_numeric_columns: list


In [5]:
from autoMatch.constants import *
from autoMatch.utils.common import read_yaml, create_directories
from autoMatch import logger

class ConfigurationManager:
    """Loads the project's YAML settings and builds stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """
        Reads the config, params and schema YAML files (in that order) and
        creates the artifacts root directory named in the config file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Builds the data-ingestion settings from the "data_ingestion" sections
        of the config and schema files, creating the stage directory first.
        Function returns a DataIngestionConfig
        """
        stage_settings = self.config.data_ingestion
        stage_schema = self.schema.data_ingestion

        create_directories([stage_settings.root_dir])

        # Values are looked up in the same order as the dataclass fields.
        field_values = {
            "root_dir": stage_settings.root_dir,
            "database": stage_settings.database,
            "schema": stage_settings.schema,
            "input_table": stage_settings.input_table,
            "output_table": stage_settings.output_table,
            "italian_cities_file": stage_settings.italian_cities_file,
            "output_table_italian_cities": stage_settings.output_table_italian_cities,
            "columns": stage_schema.columns,
            "start_date": stage_schema.date_range.start_date,
            "end_date": stage_schema.date_range.end_date,
            "italian_cities_string_columns": stage_schema.cities_file_columns.string_columns,
            "italian_cities_numeric_columns": stage_schema.cities_file_columns.numeric_columns,
        }

        return DataIngestionConfig(**field_values)

In [10]:
from snowflake.snowpark.functions import col

class DataIngestion:
    """
    Data-ingestion stage: reads the input table, loads the Italian cities
    spreadsheet and writes dataframes to tables, driven by a DataIngestionConfig.
    """

    def __init__(self, config: "DataIngestionConfig"):
        """Stores the stage configuration used by the methods below."""
        self.config = config

    def read_table(self, session):
        """
        Reads input table
        Selects the configured columns and keeps the rows whose "date_added"
        lies between start_date and end_date (both inclusive).
        Function returns Snowflake dataframe
        """
        database = self.config.database
        schema = self.config.schema
        input_table = self.config.input_table
        columns = self.config.columns
        start_date = self.config.start_date
        end_date = self.config.end_date

        df = session.table(f"{database}.{schema}.{input_table}")
        df = df.select([col(c) for c in columns])
        df = df.filter((col("date_added") >= start_date) & (col("date_added") <= end_date))
        # Note: df.count() runs a query just to report the row count in the log.
        logger.info(f"Table {input_table} successfully read. Number of rows: {df.count()}")

        return df

    def read_cities_file(self, session):
        """
        Reads XLSX file containing italian cities
        Headers are normalised (stripped, spaces to underscores, lower case),
        only the configured string and numeric columns are kept, and missing
        values stay missing instead of becoming the text "nan"/"None".
        Function returns Snowflake dataframe
        """
        italian_cities_file = self.config.italian_cities_file
        # list() accepts any list-like value coming from the configuration.
        string_columns = list(self.config.italian_cities_string_columns)
        numeric_columns = list(self.config.italian_cities_numeric_columns)

        df = pd.read_excel(italian_cities_file, header=0)

        # Rename columns for consistency (optional but recommended)
        df.columns = [name.strip().replace(" ", "_").lower() for name in df.columns]
        # Work on a copy so the column assignments below never target a view.
        df = df[string_columns + numeric_columns].copy()

        # Convert ZIP to string; Italian postal codes have 5 digits, so pad
        # back the leading zeros that a numeric spreadsheet cell has lost.
        if "zip" in df.columns:
            df["zip"] = df["zip"].apply(
                lambda value: str(int(value)).zfill(5) if pd.notnull(value) else None
            )

        # Convert string columns, keeping missing values as None
        for name in string_columns:
            df[name] = df[name].apply(
                lambda value: str(value).strip() if pd.notnull(value) else None
            )

        # Convert latitude and longitude to float, handle NaNs
        for name in numeric_columns:
            df[name] = pd.to_numeric(df[name], errors="coerce")

        logger.info("XLSX file containing italian cities successfully read")
        print(df.head(3))
        # info() prints its report itself and returns None, so it is not wrapped in print().
        df.info()

        return session.create_dataframe(df)

    def write_table(self, df, table_name = 'output_table'):
        """
        Writes table
        Saves df as table_name, replacing the table if it already exists.
        NOTE(review): table_name is passed through as given, while read_table
        fully qualifies its table with database and schema - confirm that an
        unqualified name resolves to the intended database/schema.
        Function returns nothing
        """

        df.write.save_as_table(table_name, mode="overwrite")
        logger.info(f"Table {table_name} successfully written")

  


In [None]:
# Run the ingestion stage end to end: copy the filtered input table to its
# output table, then load the Italian cities spreadsheet into its own table.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    df = data_ingestion.read_table(session)
    data_ingestion.write_table(df, data_ingestion_config.output_table)
    df = data_ingestion.read_cities_file(session)
    data_ingestion.write_table(df, data_ingestion_config.output_table_italian_cities)
except Exception:
    # Record the failure in the project log, then re-raise the same exception
    # with its original traceback.
    logger.exception("Data ingestion failed")
    raise

[2025-10-09 14:55:39,878: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-09 14:55:39,881: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-09 14:55:39,888: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-10-09 14:55:39,890: INFO: common: created directory at: artifacts]
[2025-10-09 14:55:39,892: INFO: common: created directory at: artifacts/data_ingestion]
[2025-10-09 14:55:43,382: INFO: 626174755: XLSX file containing italian cities successfully read]
                      unique_identifier          city_name province  \
0  02f27cc1-ab58-4115-93b1-3cde2ff3754e  Abano Terme Bagni       PD   
1  b38d8576-e4a6-49b1-8d52-4e2ae0559cf2         Abatemarco       SA   
2  dd0097c0-51c2-408e-aee0-7a1c98d85428  Abazia Di Sulmona       AQ   

  province_ext    zip   latitude  longitude  
0       Padova  35031        NaN        NaN  
1      Salerno  84040  40.144668  15.355906  
2     L'Aquila  67030        NaN        NaN  
<class 'panda