In [1]:
import os
import sys
import pandas as pd

import logging

logging.getLogger("snowflake").setLevel(logging.WARNING)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
logging.getLogger("snowflake.snowpark").setLevel(logging.WARNING)


%pwd
os.chdir("../")

# Add the absolute path to src/ so Python can find automatch
src_path = os.path.abspath("src")
if src_path not in sys.path:
    sys.path.append(src_path)
    
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [2]:
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [3]:
from autoMatch.utils.snowflake_utils import get_snowpark_session
session = get_snowpark_session()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/e2ba81b8-03fe-407c-96a1-f4bc0f512e7d/saml2?SAMLRequest=nZJbb%2BIwEIX%2FSuR9TuIYuoAFVEChi9QLKmml7ZtJJmDVsVOP09D%2B%2BjpcpO5D%2B7Bvln1mvuM5M7zclyp4A4vS6BFJIkoC0JnJpd6OyGO6CPskQCd0LpTRMCLvgORyPERRqopParfTD%2FBaA7rAN9LI24cRqa3mRqBErkUJyF3G15PbG84iygUiWOdx5FSSo%2FSsnXMVj%2BOmaaKmExm7jRmlNKaD2KtayS%2FyBVH9zKiscSYz6lyy93%2F6BpHEtNsivMITVqfCqdTHEfxE2RxFyP%2Bk6Spc3a9TEkzOv5sZjXUJdg32TWbw%2BHBzNIDewXQ96FLWjxo%2FtxBqayqIxEdtIUJtmkKJF8hMWdXOd4%2F8KS4gj5XZSj%2Bz5dWIVC8yVx%2BL%2FfU9lrfL17mbd%2F66efp8lz5Ni0JukJYpXO96YjZZsO48I8HTOWHWJrxErGGp21ydv6LsIkxoSHspHXBGeacTsR57JsGV9ye1cIfKs%2FmDj6iUmTVoCme0khoOLoFtRD%2FZ9EPaKSDs0l4WDn6LJCy6m4wWFwmDXh636TFy3CB%2BMGLH%2FzuXYfy1y2kp73xOy6uVUTJ7DxbGlsJ9H2MSJYcbmYfFQcqhFFJN8twCoo9TKdPMLA

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: str
    database: str
    schema: str
    input_table: str
    input_table_cleaned: str
    output_table: str


In [5]:
from autoMatch.constants import *
from autoMatch.utils.common import read_yaml, create_directories
from autoMatch import logger

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_ingestion_config = DataTransformationConfig(
            root_dir=config.root_dir,
            database=config.database,
            schema=config.schema,
            input_table=config.input_table,
            input_table_cleaned=config.input_table_cleaned,
            output_table = config.output_table,
        )

        return data_ingestion_config

In [None]:
from snowflake.snowpark.functions import col, trim, lower, length, expr
from snowflake.snowpark.types import StringType

from snowflake.snowpark.functions import udf


class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def clean_description_old(self, session):
        """
        Reads input table
        Cleans description column:
            - removes rows with empty description
            - replaces multiple consecutive whitespaces with a single whitespace (preserves newlines)
            - removes all html tags
            - lowercases all text
        Performs Named Entity Recognition
        Function returns Snowflake dataframe
        """
        database = self.config.database
        schema = self.config.schema
        input_table = self.config.input_table
        input_table_cleaned = self.config.input_table_cleaned

        df = session.table(f"{database}.{schema}.{input_table}").limit(100)
        df = df.filter((col("description").is_not_null()) & (trim(col("description")) != ""))


        def normalize_whitespace(text: str) -> str:
            import re
            return re.sub(r'[ \t]+', ' ', text).strip()

        normalize_whitespace_udf = udf(
            normalize_whitespace, 
            return_type=StringType(), 
            input_types=[StringType()]
            )

        df = df.with_column("description", normalize_whitespace_udf(df["description"]))


        def clean_html(text: str) -> str:
            from bs4 import BeautifulSoup
            if not text:
                return ""
            return BeautifulSoup(text, "html.parser").get_text()

        clean_html_udf = udf(
            clean_html,
            return_type=StringType(),
            input_types=[StringType()],
            packages=["beautifulsoup4"]
            )

        df = df.with_column("description", clean_html_udf(df["description"]))

        df = df.with_column("description", lower(df["description"]))

        df = df.filter((col("description").is_not_null()) & 
                       (trim(col("description")) != "") &
                       (length(trim(col("description"))) > 5) &
                       (~col("description").like("%None%")) &
                       (~col("description").like("%null%"))
                       )

        df = df.with_column("description", col("description").cast("STRING"))
        
        def remove_special_chars(text: str) -> str:
            import re
            if not text:
                return ""
            return re.sub(r'[^a-zA-Z0-9\s]', '', text)
        remove_special_chars_udf = udf(
            remove_special_chars,
            return_type=StringType(),
            input_types=[StringType()]
        )
        df = df.with_column("description", remove_special_chars_udf(df["description"]))

        df.write.save_as_table(
            input_table_cleaned,
            mode="overwrite",
        )
        logger.info(f"Table {input_table} successfully cleaned")

        query = f"""
            SELECT
                *,
                SNOWFLAKE.CORTEX.COMPLETE(
                    'claude-4-sonnet',
                    CONCAT(
                        'Estrai dal seguente testo i campi: age (numero), date_of_birth (YYYY-MM-DD), location (stringa), zip_code (numero), last_job, second_last_job, third_last_job, skills (stringa). ',
                        'Rispondi in formato JSON, senza testo extra, attieniti a questo esempio: {{"age": 30, "date_of_birth": "1993-05-12", "location": "Milano", "zip_code": 20100, "last_job": "Data Engineer", "second_last_job": "Developer", "third_last_job": "Intern", "skills": "Python, SQL"}}. ',
                        'Testo: ', description
                    )
                ) AS parsed_json
            FROM {database}.{schema}.{input_table_cleaned}

            """
         
        import json

        response_format = {
            "età": "Qual è l età del candidato? (numero)",
            "data_di_nascita": "Qual e la data di nascita del candidato? Restituisci solo uno in formato stringa ANNO-MESE-GIORNO",
            "località": "Qual è la località del candidato? Restituisci solo uno in formato stringa",
            "cap": "Qual è il codice postale del candidato? Restituisci solo uno in formato stringa",
            "ultimo_lavoro": "Qual è l ultimo lavoro svolto dal candidato? Restituisci solo uno in formato stringa, il tipo di lavoro, non l azienda o il datore di lavoro",
            "penultimo_lavoro": "Qual è il penultimo lavoro svolto dal candidato? Restituisci solo uno in formato stringa, il tipo di lavoro, non l azienda o il datore di lavoro",
            "terzultimo_lavoro": "Qual è il terzultimo lavoro svolto dal candidato? Restituisci solo uno in formato stringa, il tipo di lavoro, non l azienda o il datore di lavoro",
            "competenze": "Qual è l elenco delle competenze del candidato? (stringa di competenze separate da virgola)"
        }

        response_json = json.dumps(response_format)

        query = f"""
            SELECT 
                *, 
                AI_EXTRACT(
                text => TO_VARCHAR(description),
                responseFormat => PARSE_JSON('{response_json}')
                ) AS ner_json
                FROM {database}.{schema}.{input_table_cleaned}
        """
        
        qq = f"""
                AI_EXTRACT(
                text => description,
                responseFormat => PARSE_JSON('{response_json}')
                )
        """
        qq2 = """ 
                SNOWFLAKE.CORTEX.COMPLETE(
                    'claude-4-sonnet',
                    CONCAT(
                        'Estrai dal seguente testo i campi: age (numero), date_of_birth (YYYY-MM-DD), location (stringa), zip_code (numero), last_job, second_last_job, third_last_job, skills (stringa). ',
                        'Rispondi in formato JSON, senza testo extra, attieniti a questo esempio: {{"age": 30, "date_of_birth": "1993-05-12", "location": "Milano", "zip_code": 20100, "last_job": "Data Engineer", "second_last_job": "Developer", "third_last_job": "Intern", "skills": "Python, SQL"}}. ',
                        'Testo: ', description
                    )
                )
                """

        '''
        df = session.sql(f"SELECT * FROM {database}.{schema}.{input_table_cleaned}")
        descriptions = df.select(col("description").alias("description")).collect()
        rcount = ecount = 0
        for row in descriptions:
            desc = row[0]  # or row["description"] if aliased
            try:
                single_df = df.filter(col("description") == desc)
                single_df = single_df.with_column("ner_json", expr(qq))
                single_df.select("ner_json").show()
                rcount += 1
            except Exception as e:
                ecount += 1
                print(f"Errore {ecount} su: {desc}")
        print(f"Numero totale di errori: {ecount}")
        '''

        df_with_ner = session.sql(query).collect()
        df_with_ner = df_with_ner.filter(df_with_ner["ner_json"].is_not_null())
        df_with_ner = df_with_ner.filter(
            (col("ner_json")["response"].is_not_null()) &
            (trim(col("ner_json")["response"]) != "")
            )

        df_with_ner = df_with_ner.with_columns(
            ["age", "date_of_birth", "location", "zip_code", "last_job", "second_last_job", "third_last_job", "skills"],
            [
                col("ner_json")["response"]["età"].cast("STRING"),
                col("ner_json")["response"]["data_di_nascita"].cast("STRING"),
                col("ner_json")["response"]["località"].cast("STRING"),
                col("ner_json")["response"]["cap"].cast("STRING"), 
                col("ner_json")["response"]["ultimo_lavoro"].cast("STRING"),
                col("ner_json")["response"]["penultimo_lavoro"].cast("STRING"),
                col("ner_json")["response"]["terzultimo_lavoro"].cast("STRING"),
                col("ner_json")["response"]["competenze"].cast("STRING")
            ]
        )

        df_with_ner = df_with_ner.drop("ner_json")

        #print(df_with_ner.show(100))
        logger.info(f"NER on {input_table} table successful")


        return df_with_ner


    def clean_description(self, session):
        """
        Reads input table
        Cleans description column:
            - removes rows with empty description
            - replaces multiple consecutive whitespaces with a single whitespace (preserves newlines)
            - removes all html tags
            - lowercases all text
        Function returns Snowflake dataframe
        """
        database = self.config.database
        schema = self.config.schema
        input_table = self.config.input_table
        input_table_cleaned = self.config.input_table_cleaned

        df = session.table(f"{database}.{schema}.{input_table}").limit(100)
        df = df.filter((col("description").is_not_null()) & (trim(col("description")) != ""))

        def build_normalize_whitespace_udf():
            def normalize(text: str) -> str:
                import re
                if text is None:
                    return ''
                return re.sub(r'[ \t]+', ' ', text).strip()

            return udf(normalize, return_type=StringType(), input_types=[StringType()])
        normalize_udf = build_normalize_whitespace_udf()
        df = df.with_column("description", normalize_udf(df["description"]))
        

        def build_clean_html_udf():
            from bs4 import BeautifulSoup
            def clean_html(text: str) -> str:
                if not text:
                    return ""
                return BeautifulSoup(text, "html.parser").get_text()

            return udf(
                clean_html,
                return_type=StringType(),
                input_types=[StringType()],
                packages=["beautifulsoup4"]
            )

        clean_html_udf = build_clean_html_udf()
        df = df.with_column("description", clean_html_udf(df["description"]))

        df = df.with_column("description", lower(df["description"]))

        df = df.filter((col("description").is_not_null()) & 
                       (trim(col("description")) != "") &
                       (length(trim(col("description"))) > 5) &
                       (~col("description").like("%None%")) &
                       (~col("description").like("%null%"))
                       )

        df = df.with_column("description", col("description").cast("STRING"))

        def build_remove_special_chars_udf():
            import re
            def remove_special_chars(text: str) -> str:
                if not text:
                    return ""
                return re.sub(r'[^a-zA-Z0-9\s]', '', text)

            return udf(
                remove_special_chars,
                return_type=StringType(),
                input_types=[StringType()]
            )
        remove_special_chars_udf = build_remove_special_chars_udf()
        df = df.with_column("description", remove_special_chars_udf(df["description"]))


        df.write.save_as_table(
            input_table_cleaned,
            mode="overwrite",
        )
        logger.info(f"Table {input_table} successfully cleaned")

        return df

    def apply_ner_cortexai(self, session):
        """
        Reads input table

        Performs Named Entity Recognition
        Function returns Snowflake dataframe
        """

        from snowflake.snowpark.functions import parse_json

        database = self.config.database
        schema = self.config.schema
        input_table_cleaned = self.config.input_table_cleaned

        query = f"""
            SELECT
                *,
                SNOWFLAKE.CORTEX.COMPLETE(
                    'claude-4-sonnet',
                    CONCAT(
                        'Estrai dal seguente testo i campi: age (numero), date_of_birth (YYYY-MM-DD), location (stringa), zip_code (numero), last_job, second_last_job, third_last_job, skills (stringa). ',
                        'Rispondi in formato JSON, senza testo extra, attieniti a questo esempio: {{"age": 30, "date_of_birth": "1993-05-12", "location": "Milano", "zip_code": 20100, "last_job": "Data Engineer", "second_last_job": "Developer", "third_last_job": "Intern", "skills": "Python, SQL"}}. ',
                        'Testo: ', description
                    )
                ) AS ner_json
            FROM {database}.{schema}.{input_table_cleaned}

            """

        df = session.sql(query)

        def build_clean_parsing_udf():
            def clean(x: str) -> str:
                if x is None:
                    return ''
                x = x.lower().lstrip()
                if x.startswith("```json"):
                    x = x[8:].lstrip()
                x = x.replace('\n', ' ').replace('\t', ' ').replace('\\', '').strip()
                x = ' '.join(x.split())
                if x.endswith("```"):
                    x = x[:-3].rstrip()
                if x.endswith("'") or x.endswith('"'):
                    x = x[:-1].rstrip()
                return x

            return udf(clean, return_type=StringType(), input_types=[StringType()])
        clean_udf = build_clean_parsing_udf()

        df = df.with_column("ner_json", clean_udf(df["ner_json"]))

        def build_is_valid_json_udf():
            import json
            def is_valid(text: str) -> bool:
                if not text:
                    return False
                try:
                    json.loads(text)
                    return True
                except Exception:
                    return False

            return udf(is_valid, return_type=BooleanType(), input_types=[StringType()])
        is_valid_json_udf = build_is_valid_json_udf()
        df = df.with_column("is_valid_json", is_valid_json_udf(df["ner_json"]))
        df = df.filter(col("is_valid_json") == True)
        df = df.drop("is_valid_json")

        df = df.with_column("ner_json", parse_json(col("ner_json")))

        df = df.filter(df["ner_json"].is_not_null())

        df = df.with_columns(
            ["age", "date_of_birth", "location", "zip_code", "last_job", "second_last_job", "third_last_job", "skills"],
            [
                col("ner_json")["age"].cast("STRING"),
                col("ner_json")["data_of_birth"].cast("STRING"),
                col("ner_json")["location"].cast("STRING"),
                col("ner_json")["zip_code"].cast("STRING"), 
                col("ner_json")["last_job"].cast("STRING"),
                col("ner_json")["second_last_job"].cast("STRING"),
                col("ner_json")["third_last_job"].cast("STRING"),
                col("ner_json")["skills"].cast("STRING")
            ]
        )

        df = df.drop("ner_json")

        logger.info(f"NER on {input_table_cleaned} table successful")


        return df

    def write_table(self, session, df):
        """
        Writes input table
        Function returns nothing
        """
        output_table = self.config.output_table

        df.write.save_as_table(output_table, mode="overwrite")
        logger.info(f"Table {output_table} successfully written")

  


In [94]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.clean_description(session)
    df = data_transformation.apply_ner_cortexai(session)
    data_transformation.write_table(session, df)

except Exception as e:
    raise e

[2025-10-07 12:42:13,823: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-07 12:42:13,826: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-07 12:42:13,830: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-10-07 12:42:13,834: INFO: common: created directory at: artifacts]
[2025-10-07 12:42:13,836: INFO: common: created directory at: artifacts/data_transformation]
[2025-10-07 12:42:23,977: INFO: 1697729232: Table MPG_IT_AUTOMATCH_CANDIDATE successfully cleaned]
[2025-10-07 12:42:25,080: INFO: 1697729232: NER on MPG_IT_AUTOMATCH_CANDIDATE_CLEANED table successful]
[2025-10-07 12:42:40,524: INFO: 1697729232: Table MPG_IT_AUTOMATCH_CANDIDATE_FEATURES successfully written]


In [95]:
df_a = session.sql("SELECT * FROM MPG_IT_AUTOMATCH_CANDIDATE_FEATURES").to_pandas()
df_b = session.sql("SELECT * FROM AUTOMATCH_FINAL_TABLE").to_pandas()

df_a["CANDIDATEID"] = df_a["CANDIDATEID"].astype(str)
df_b["CANDIDATEID"] = df_b["CANDIDATEID"].astype(str)

df_merged = pd.merge(df_a, df_b, on="CANDIDATEID", suffixes=("_a", "_b"))
df_merged = df_merged[sorted(df_merged.columns)]
df_merged

Unnamed: 0,AGE_a,AGE_b,CANDIDATEID,DATE_ADDED,DATE_OF_BIRTH_a,DATE_OF_BIRTH_b,DESCRIPTION,LAST_JOB_a,LAST_JOB_b,LOCATION_a,LOCATION_b,SECOND_LAST_JOB_a,SECOND_LAST_JOB_b,SKILLS_a,SKILLS_b,THIRD_LAST_JOB_a,THIRD_LAST_JOB_b,ZIP_CODE_a,ZIP_CODE_b
0,58,58.0,5537727,2025-07-10 09:55:23.473,,1966-08-24,\n\n\n\n\n\nernesta \nseghetti \nistruzione ...,operatore pulizie uffici,operatore pulizie uffici,bracciano,bracciano,assistente educativa culturale,assistente educativa culturale aec/oepa,"capacità organizzative e gestionali, leadershi...","capacità organizzative e gestionali, leadershi...",receptionist front desk,receptionist front desk,,
1,,,5539300,2025-07-11 07:48:14.140,,,\n\n\n\n\n\nmarco broccio \nitaly available ...,transaction monitoring analyst,transaction monitoring analyst - freelance,italy,italy,market risk and behavioural analyst,market risk and behavioural analyst - freelance,"investigation and analysis, screening and tran...","investigation and analysis, screening and tran...",,,,
2,44,44.0,5525349,2025-07-03 04:31:18.100,,1980-05-18,\n\n\n\n\n\ncurriculum vitae \ndati personali...,socio in azienda global srl,socio in azienda global srl pulimentatura metalli,san giovanni valdarno,san giovanni valdarno (ar),,,"gestione amministrazione, vendite, back office...","gestione amministrazione, ufficio vendite, bac...",,,,
3,30,31.0,5547420,2025-07-16 09:16:40.633,,1993-04-05,\n simone facchini\n\n 63072 via borgo garibal...,programmatore cad/cam programmatore cnc,"programmatore cad/cam, programmatore cnc",castignano,castignano,programmatore cad/cam programmatore cnc,"programmatore cad/cam, programmatore cnc","programmazione cad/cam, cnc, fanuc, selca, hei...","programmazione fanuc, selca, heidenhain, progr...",agente assicurativo,agente assicurativo,63072,63072.0
4,16,16.0,5546372,2025-07-16 03:56:41.203,,2008-08-16,\n\n\n\n\n\nbruno gabriele cristiano \nnazion...,stagista,stagista fc impianti,vedano,"vedano, italia",stagista,stagista fc impianti,"infilaggio cavi, cablaggio, posa tubi, install...","installazione elettrica, cablaggio, posa tubi,...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,30,30.0,5532601,2025-07-08 05:06:07.990,,1994-08-05,\n\n\n\n\n\nilaria abate \nnazionalit italian...,ispettore ambientale,ispettore ambientale,grosseto,grosseto,barista,barista,"elaborazione delle informazioni, pensare in mo...","elaborazione delle informazioni, pensare in mo...",babysitter,babysitter,58100,58100.0
89,53,53.0,5568356,2025-07-30 12:48:18.103,,1971-01-31,\n\n\n\n\n\nmaria angela cirasola \nnazionali...,assistente familiare,assistente familiare,bolano sp,bolano (sp),cameriera e lavapiatti,cameriera e lavapiatti,"assistenza personale, servizio tavoli, pulizie...","assistenza personale, somministrazione terapie...",addetta alle pulizie,addetta alle pulizie,,
90,18,18.0,5561500,2025-07-25 08:37:11.100,,2006-07-20,\n\n\n\n\n\ncurriculum vitae europass \ncurri...,,,siena,siena,,,"elettronica, elettrotecnica, automazione","elettronica e elettrotecnica, automazione",,,53100,53100.0
91,31,31.0,5558157,2025-07-23 10:16:22.037,,1993-01-14,\n\n\n\n\n\nnina nicoleta movila \ndata di na...,badante,badante,roma,roma,,,"social, windows, gestione posta elettronica, u...","social, windows, gestione posta elettronica, u...",,,166,166.0


In [96]:
result = session.sql(f"""SELECT AI_EXTRACT(
  text => 'Antonio Cavalli Software Engineer Location: Milano',
  responseFormat => PARSE_JSON('{{"name": "What is the first name of the employee?", "address": "What is the address of the employee?"}}')
 
);""").collect()
print(result[0][0])



{
  "response": {
    "address": "Milano",
    "name": "Antonio"
  }
}
