In [1]:
import os
import sys

import logging

logging.getLogger("snowflake").setLevel(logging.WARNING)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
logging.getLogger("snowflake.snowpark").setLevel(logging.WARNING)


%pwd
os.chdir("../")

# Add the absolute path to src/ so Python can find automatch
src_path = os.path.abspath("src")
if src_path not in sys.path:
    sys.path.append(src_path)
    
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [2]:
# Display the current working directory (it is changed by the setup cell above).
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [3]:
from autoMatch.utils.snowflake_utils import get_snowpark_session
# Create the Snowpark session that the query cells below pass around as `session`.
# NOTE(review): per the saved output, this call starts a browser-based
# identity-provider login — confirm before running the notebook unattended.
session = get_snowpark_session()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/e2ba81b8-03fe-407c-96a1-f4bc0f512e7d/saml2?SAMLRequest=nZJPb%2BIwEMW%2FSuQ9J3FSoGABFW0WNVK3i0q62uXmJJPUwrFT%2FyG0n36dAFL30B72Ztlv5vc8b%2BY3x4Z7B1CaSbFAUYCRB6KQJRP1Aj1na3%2BKPG2oKCmXAhboDTS6Wc41bXhLVta8iCd4taCN5xoJTfqHBbJKEEk100TQBjQxBdmufjyQOMCEag3KOBw6l5SaOdaLMS0Jw67rgu4qkKoOY4xxiGehU%2FWSb%2BgDov2a0SppZCH5peTo%2FvQJIgrxqEc4hSNszoW3TJxG8BUlP4k0uc%2Byjb%2F5uc2Qt7r87k4KbRtQW1AHVsDz08PJgHYObrezEY6nQefm5oNVsoWAvlsFgRayqzjdQyGb1hrXPXCnsIIy5LJmbmZpskDtnpW4zHc2eU3eowme2cN6J22e7DGv%2F9Qprb%2Bb4%2F3vKMkea5rWBfJ%2BXRKO%2B4RTrS2kos%2FVuCscj%2F0I%2BzHOopiMp2SMg8k03iEvcf6YoGaovJgffAQNK5TUsjJScCZgcAlxTqdRPvXxVQX%2BCF8X%2FmxCI78a5QWuxlEM12XYpxej0waRwYha%2Fu9c5uHHLuelfHQ5pclGcla8eWupGmo%2BjzEKouGGlX41SAk0lPFVWSrQ2sXJuezuFF

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class SearchEngineConfig:
    """Immutable bundle of settings consumed by the search-engine stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    The three column collections are annotated as ``dict``; the code in this
    notebook only iterates over them to obtain column names.
    """

    # Directory created for the search-engine stage.
    root_dir: str
    # Database and schema used to qualify the input table and the service.
    database: str
    schema: str
    # Table the search service is built on.
    input_table: str
    # Columns concatenated into the searchable text.
    search_columns: dict
    # Columns listed in the service's ATTRIBUTES clause.
    attributes_columns: dict
    # Columns selected into the service and requested back from searches.
    columns: dict
    # Name of the Cortex Search Service.
    search_service: str


In [5]:
from autoMatch.constants import *
from autoMatch.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML files and turns them into stage configurations.

    The loaded documents are kept on ``self.config``, ``self.params`` and
    ``self.schema`` and are accessed with attribute syntax.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The root directory must exist before any stage creates its own folder.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_search_engine_config(self) -> SearchEngineConfig:
        """Assemble the ``SearchEngineConfig`` from the ``search_engine`` sections.

        Connection and naming settings are read from the config file, the
        column collections from the schema file. The stage's ``root_dir`` is
        created as a side effect.

        Returns:
            The populated ``SearchEngineConfig``.
        """
        engine_settings = self.config.search_engine
        engine_schema = self.schema.search_engine

        create_directories([engine_settings.root_dir])

        return SearchEngineConfig(
            root_dir=engine_settings.root_dir,
            database=engine_settings.database,
            schema=engine_settings.schema,
            input_table=engine_settings.input_table,
            search_service=engine_settings.search_service,
            search_columns=engine_schema.search_columns,
            attributes_columns=engine_schema.attributes_columns,
            columns=engine_schema.columns,
        )

In [6]:
from autoMatch import logger
from dotenv import load_dotenv
from snowflake.core import Root

        
class SearchEngine:
    """Creates, inspects and queries a Snowflake Cortex Search Service.

    All object names (database, schema, table, service, columns) come from the
    ``SearchEngineConfig`` passed to the constructor; the Snowpark session is
    supplied per call.
    """

    def __init__(self, config: "SearchEngineConfig"):
        """Store the configuration used by every method.

        Args:
            config: Settings naming the database, schema, input table, search
                service and the column collections.
        """
        self.config = config

    @staticmethod
    def _build_description_expr(search_columns):
        r"""Build the SQL expression that merges the search columns into one text.

        Every column contributes ``'<Label>: ' || <column>``, where the label is
        the stripped column name with underscores replaced by spaces and only
        its first letter upper-cased. The pieces are joined with the SQL string
        literal ``'\n'``.

        Args:
            search_columns: Iterable of column names.

        Returns:
            The concatenation expression as a string (empty for no columns).
        """
        labelled_parts = []
        for raw_name in search_columns:
            column = raw_name.strip()
            label = column.replace("_", " ").capitalize()
            labelled_parts.append(f"'{label}: ' || {column}")
        return " || '\\n' || ".join(labelled_parts)

    def create_semantic_search_engine(self, session):
        """Create (or replace) the Cortex Search Service on the input table.

        The service indexes a computed ``description`` column built from the
        configured search columns and is created under the configured
        database and schema, i.e. the same location the other methods of this
        class look it up in.

        Args:
            session: Session object exposing ``sql(query).collect()``.

        Raises:
            ValueError: If the SNOWFLAKE_WAREHOUSE environment variable is not
                set (after loading the .env file).

        Returns:
            None.
        """
        database = self.config.database
        schema = self.config.schema
        input_table = self.config.input_table
        search_service = self.config.search_service
        attributes_columns = self.config.attributes_columns
        columns = self.config.columns

        load_dotenv()
        warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")
        if not warehouse:
            # Without this check the literal text "None" would be rendered
            # into the WAREHOUSE clause of the statement below.
            raise ValueError(
                "SNOWFLAKE_WAREHOUSE is not set; it is required to create the search service"
            )

        description_expr = self._build_description_expr(self.config.search_columns)

        # Fully qualify the service so it is created where
        # get_column_specification and query_cortex_search_service expect it,
        # regardless of the session's current database/schema.
        query = f"""
            CREATE OR REPLACE CORTEX SEARCH SERVICE {database}.{schema}.{search_service}
            ON description
            ATTRIBUTES {", ".join(attributes_columns)}
            WAREHOUSE = {warehouse}
            TARGET_LAG = '1 hour'
            AS (
                SELECT {", ".join(columns)},
                ({description_expr}) as description
                FROM {database}.{schema}.{input_table}
                );
            """

        session.sql(query).collect()

        # Log only after the statement has run, so a failure is never
        # preceded by a success message.
        logger.info(f"Cortex search index {search_service} successfully defined on table {input_table}")

    def get_column_specification(self, session):
        """Describe the search service and report its column layout.

        Runs ``DESC CORTEX SEARCH SERVICE`` on the configured service, logs the
        columns it reports and returns them.

        Args:
            session: Session object exposing ``sql(query).collect()``.

        Returns:
            A tuple ``(search_service, attribute_columns, search_columns,
            columns)``: the service name, the attribute columns as a list, the
            search column exactly as reported by the service, and all columns
            as a list. The lists are obtained by splitting the reported values
            on commas.
        """
        database = self.config.database
        schema = self.config.schema
        search_service = self.config.search_service

        search_service_result = session.sql(f"DESC CORTEX SEARCH SERVICE {database}.{schema}.{search_service}").collect()[0]
        attribute_columns = search_service_result.attribute_columns.split(",")
        search_columns = search_service_result.search_column
        columns = search_service_result.columns.split(",")

        logger.info(f"Column specifications: \nSearch columns: {search_columns} \nAttribute columns: {attribute_columns} \nAll columns: {columns}")

        return search_service, attribute_columns, search_columns, columns

    def query_cortex_search_service(self, session, query, filter=None, limit=5):
        """Run a query against the configured Cortex Search Service.

        Args:
            session: Session handed to ``snowflake.core.Root``.
            query: Free-text search query.
            filter: Optional filter object (for example the result of
                ``create_filter``). ``None`` means no filtering and is sent to
                the service as an empty dict.
            limit: Maximum number of results requested.

        Returns:
            The ``results`` attribute of the search response.
        """
        # A fresh dict per call: a mutable default would be shared between calls.
        if filter is None:
            filter = {}

        database = self.config.database
        schema = self.config.schema
        search_service = self.config.search_service
        columns = self.config.columns

        cortex_search_service = (
            Root(session)
            .databases[database]
            .schemas[schema]
            .cortex_search_services[search_service]
        )
        context_documents = cortex_search_service.search(
            query,
            columns=columns,
            filter=filter,
            limit=limit)

        return context_documents.results

    def create_filter(self, max_age):
        """Create a filter object that keeps only candidates with age <= max_age.

        The clause is wrapped in an ``@and`` list. Skill-based clauses are not
        implemented yet, so the list currently holds the age clause only.

        Args:
            max_age: Upper bound (inclusive) for the ``age`` attribute.

        Returns:
            A dict of the form ``{"@and": [{"@lte": {"age": max_age}}]}``.
        """
        filter_clauses = []

        # Age clause (directly append the valid @lte clause)
        age_clause = { "@lte": { "age": max_age } }
        filter_clauses.append(age_clause)

        return { "@and": filter_clauses }


In [None]:
# Build the search engine from the YAML configuration and run one sample
# semantic query. Errors are left to propagate with their original traceback.
config = ConfigurationManager()
search_engine_config = config.get_search_engine_config()
search_engine = SearchEngine(config=search_engine_config)

# One-off steps, kept commented out so a normal run only queries the service:
# uncomment to (re)create the service or to inspect its columns.
#search_engine.create_semantic_search_engine(session)
#search_engine.get_column_specification(session)

# Sample query, restricted to candidates aged 40 or younger.
results = search_engine.query_cortex_search_service(
    session,
    query='Data Scientist con esperienza in Kafka',
    filter=search_engine.create_filter(40),
    limit=5)

for result in results:
    print(result)
    print("")

# TODO
#
# 1. search service as is, filter on age. then create app
# 2. add distance_km column in data_transformation and include it when creating
#    the search service, then compute it based on location inserted in the app,
#    and use it as filter
# 3. add hard skills in the filter (must convert skills column from a string of
#    comma-separated skills to a vector of skills)


[2025-10-20 14:58:56,051: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-20 14:58:56,055: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-20 14:58:56,061: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-10-20 14:58:56,061: INFO: common: created directory at: artifacts]
[2025-10-20 14:58:56,061: INFO: common: created directory at: artifacts/search_engine]
[2025-10-20 14:58:56,067: INFO: 3324900363: Cortex search index CANDIDATE_SEARCH_SERVICE successfully defined on table MPG_IT_AUTOMATCH_CANDIDATE_FEATURES]
[2025-10-20 15:00:12,821: INFO: 3324900363: Column specifications: 
Search columns: DESCRIPTION 
Attribute columns: ['AGE'] 
All columns: ['CANDIDATEID', 'AGE', 'LOCATION', 'LAST_JOB', 'SECOND_LAST_JOB', 'THIRD_LAST_JOB', 'SKILLS', 'DESCRIPTION']]
{'skills': 'oracle, informatica power center, kafka, powerbi, qliksense, apex, hadoop, hdfs, hive, impala, hbase, solr, splunk, sql server, java, sql, xml, python', '@scores': {

'\nTODO\n\n1. search service as is, filter on age. then crate app\n2. add hard skills in the filter (must convert skills column from a string of comma-separated skills to a vector of skills)\n3. add distance_km column in data_transformation and include it when creating the search service, then compute it based on location inserted in the app. and use it as filter\n'

In [12]:
# Ad-hoc inspection: fetch the DESCRIPTION text of one candidate from the
# cleaned-candidates table.
# NOTE(review): the saved output of this cell contains personal data (name,
# phone number, e-mail, date of birth) — clear the output before sharing or
# committing the notebook.
session.sql("""select description 
            from IT_DISCOVERY.CONSUMER_INT_MODEL.MPG_IT_AUTOMATCH_CANDIDATE_CLEANED
            where candidateid = '5544264'
            """).collect()

[Row(DESCRIPTION="\n\n\n\n\n\nurbano arotce \xa0\ninformazioni personali \xa0\nmi sono trasferito recentemente in italia con la mia compagna, in cerca di nuove \xa0\nopportunità professionali e di crescita personale. sono una persona attenta, \xa0\nproattiva e fessibile, con una forte predisposizione al lavoro di squadra e al \xa0\ncontatto con il pubblico. mi adatto facilmente a nuovi contesti e sono determinato a \xa0\ncostruire un percorso solido e signifcativo nel ámbito lavorativo. \xa0\nesperienza professionale \xa0\nvia pianezza 82 torino \xa0\n3447663563 \xa0\ncameriere \xa0\nla gringa pizza e pasta\xa0pilar, argentina \xa0\ngennaio 2008 - aprile 2009 \xa0\npcoutletderqui@gmail.com \xa0\ndata di nascita : 21-08-1989 \xa0\nnazionalità : italiana - argentina \xa0\nsesso : maschile \xa0\nho lavorato come cameriere occupandomi del servizio ai tavoli, della pulizia e della preparazione delle \xa0\npostazioni. \xa0\naddetto alle vendite \xa0\nwicomp computacion\xa0pilar, argentina \x

In [8]:

# Query the search service directly from SQL with SNOWFLAKE.CORTEX.SEARCH_PREVIEW:
# the JSON request asks for three columns and at most 5 hits, and only the
# 'results' element of the parsed response is selected.
# NOTE(review): the service name is not qualified with database/schema here, so
# this presumably relies on the session's current schema — confirm.
session.sql("""
SELECT PARSE_JSON(
  SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
      'CANDIDATE_SEARCH_SERVICE',
'{
        "query": "Data Scientist with experience in Python and Java ",
        "columns":[
            "last_job",
            "second_last_job",
            "skills"
        ],
        "limit":5
      }'
  )
)['results'] as results;
""").collect()

[Row(RESULTS='[\n  {\n    "@scores": {\n      "cosine_similarity": 0.58208793,\n      "text_match": 0.076404855\n    },\n    "last_job": "full stack developer & data scientist",\n    "second_last_job": "data scientist",\n    "skills": "python, flask, fastapi, postgresql, mongodb, api rest, javascript, react, html5, css3, jquery, figma, adobe xd, aws, azure databricks, git, docker, tableau, pandas, scipy, matplotlib, llm, openai, rag, scikit-learn, machine learning"\n  },\n  {\n    "@scores": {\n      "cosine_similarity": 0.5920123,\n      "text_match": 0.023447132\n    },\n    "last_job": "data analyst",\n    "second_last_job": "executive engineer",\n    "skills": "python, javascript, java, c/c++, sql, c#, html, css, pandas, numpy, power bi, tableau, matplotlib, sql server, mongodb, sqlite, pytorch, hugging face transformers, spacy, bert, scikit-learn, q-learning, genetic algorithms, svm, gmm, logistic regression, weights & biases, runpod, databricks, firebase, react, node.js, express,