In [1]:
import os
import sys

import logging

logging.getLogger("snowflake").setLevel(logging.WARNING)
logging.getLogger("snowflake.connector").setLevel(logging.WARNING)
logging.getLogger("snowflake.snowpark").setLevel(logging.WARNING)


%pwd
os.chdir("../")

# Add the absolute path to src/ so Python can find automatch
src_path = os.path.abspath("src")
if src_path not in sys.path:
    sys.path.append(src_path)
    
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [2]:
%pwd


'c:\\Users\\fiscarelli\\Desktop\\Progetti\\Manpower IT\\Auto-Match\\Candidates-to-Jobs-Auto-Match-Cortex-AI'

In [3]:
from autoMatch.utils.snowflake_utils import get_snowpark_session
session = get_snowpark_session()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://login.microsoftonline.com/e2ba81b8-03fe-407c-96a1-f4bc0f512e7d/saml2?SAMLRequest=nZLBctowEIZfxaOebUsOKVgDZEgcCp20oeB0Mr0Ja01UZMmVZEzy9BUmnkkPyaE3j%2Fzt6tP%2BO746VjI4gLFCqwkiEUYBqEJzoXYT9JDPwxEKrGOKM6kVTNAzWHQ1HVtWyZrOGvek1vCnAesC30hZ2v2YoMYoqpkVlipWgaWuoJvZtzuaRJjWRjtdaInelHxcwawF47xhX8Kt8HpPztU0jtu2jdqLSJtdnGCMY5zGnjohn3r%2B6N%2F0Dk9iPDjxnvD46tXtWqjzCD7S2p4hSxd5vgpX95scBbNe9UYr21RgNmAOooCH9d1ZwHqD6006wMkoav3cQmiMriFiL42ByCrdlpLtodBV3TjfPfJfcQk8lnon%2FACW2QTVe8F3XDKcZ%2BXj4y5L2e0hZfPsvl7fHiq%2B4IvlvhLb49fZ7y8v5Y8CBT%2F7hJNTwktrG1iqU67OH%2BHkMiQ4TEhOMCUpxaNoQIa%2FUJB5P6GY6yp7%2Bc4jqkRhtNWl00oKBZ0lJFs2IttRiC9KCAd4WITpZ0bCcrAtcHlJEhjy%2BJR2gs4bRDsRM%2F3fuYzjt11el%2FK7z2mZrbQUxXMw16Zi7v0YSUS6E8HDskMpVEzIGecGrPVxSqnbGwPM%2Bd13pgEUT8%2B3%2

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class SearchEngineConfig:
    root_dir: str
    database: str
    schema: str
    input_table: str
    search_columns : dict
    attributes_columns: dict
    columns: dict
    search_service: str


In [5]:
from autoMatch.constants import *
from autoMatch.utils.common import read_yaml, create_directories

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_search_engine_config(self) -> SearchEngineConfig:
        config = self.config.search_engine
        schema = self.schema.search_engine

        create_directories([config.root_dir])

        search_engine_config = SearchEngineConfig(
            root_dir=config.root_dir,
            database=config.database,
            schema=config.schema,
            input_table=config.input_table,
            search_service=config.search_service,
            search_columns=schema.search_columns,
            attributes_columns = schema.attributes_columns,
            columns = schema.columns
        )

        return search_engine_config

In [None]:
from autoMatch import logger
from dotenv import load_dotenv
from snowflake.core import Root

        
class SearchEngine:
    def __init__(self, config: SearchEngineConfig):
        self.config = config

   
    def create_semantic_search_engine(self, session):
        """
        Creates Cortex Search Service on input table

        Function returns nothing
        """

        database = self.config.database
        schema = self.config.schema
        input_table = self.config.input_table
        search_service = self.config.search_service
        search_columns = self.config.search_columns
        attributes_columns = self.config.attributes_columns
        columns = self.config.columns

        load_dotenv()
        warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")

        description_expr = " || '\\n' || ".join([
            f"'{' '.join(col.strip().split('_')).capitalize()}: ' || {col.strip()}"
            for col in search_columns
        ]) 
               
        query = f"""
            CREATE OR REPLACE CORTEX SEARCH SERVICE {search_service}
            ON description
            ATTRIBUTES {", ".join(attributes_columns)}
            WAREHOUSE = {warehouse}
            TARGET_LAG = '1 hour'
            AS (
                SELECT {", ".join(columns)},
                ({description_expr}) as description
                FROM {database}.{schema}.{input_table}
                );
            """

        session.sql(query).collect()
        logger.info(f"Cortex search index {search_service} successfully defined on table {input_table}")

        
    def get_column_specification(self, session):
        """
        Get Search Service columns:
            - search columns
            - attribute columns
            - all columns involved
        Function returns nothin
        """

        database = self.config.database
        schema = self.config.schema
        search_service = self.config.search_service

        search_service_result = session.sql(f"DESC CORTEX SEARCH SERVICE {database}.{schema}.{search_service}").collect()[0]
        attribute_columns = search_service_result.attribute_columns.split(",")
        search_columns = search_service_result.search_column
        columns = search_service_result.columns.split(",")

        logger.info(f"Column specifications: \nSearch columns: {search_columns} \nAttribute columns: {attribute_columns} \nAll columns: {columns}")

        return search_service, attribute_columns, search_columns, columns
    
    def query_cortex_search_service(self, session, query, filter={}, limit=5):
        """
        Queries the cortex search service in the session state and returns a list of results

        Returns query results
        """

        database = self.config.database
        schema = self.config.schema
        search_service = self.config.search_service
        columns = self.config.columns

        #_, _, _, columns = self.get_column_specification(session)

        cortex_search_service = (
            Root(session)
            .databases[database]
            .schemas[schema]
            .cortex_search_services[search_service]
        )
        context_documents = cortex_search_service.search(
            query,
            columns=columns,
            filter=filter,
            limit=limit)
        
        return context_documents.results
    
    def create_filter(self, max_age=None, province=None):
        """
        Create a filter object to include only candidates with:
        - age <= max_age (if provided)
        - AND province_ext == province (if provided)
        """
        filter_clauses = []

        # Age clause
        if max_age is not None:
            age_clause = { "@lte": { "age": max_age } }
            filter_clauses.append(age_clause)

        # Province clause
        if province:
            province_clause = { "@eq": { "province_ext": province } }
            filter_clauses.append(province_clause)

        # Return combined filter if any clauses exist
        if filter_clauses:
            return { "@and": filter_clauses }
        else:
            return {}  # No filters applied

In [None]:
try:
    config = ConfigurationManager()
    search_engine_config = config.get_search_engine_config()
    search_engine = SearchEngine(config=search_engine_config)
    #search_engine.create_semantic_search_engine(session)
    #search_engine.get_column_specification(session)
    results = search_engine.query_cortex_search_service(session, 
                                                        query='Data Scientist con esperienza in Python', 
                                                        filter=search_engine.create_filter(40, 'Torino'), 
                                                        limit=5)
    for result in results:
        print(result)
        print("")

except Exception as e:
    raise e

'''
TODO

1. search service as is, filter on age. then crate app
2. add distance_km column in data_transformation and include it when creating the search service, then compute it based on location inserted in the app. and use it as filter
3. add hard skills in the filter (must convert skills column from a string of comma-separated skills to a vector of skills)
'''


[2025-10-21 12:26:41,348: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-21 12:26:41,350: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-21 12:26:41,356: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-10-21 12:26:41,359: INFO: common: created directory at: artifacts]
[2025-10-21 12:26:41,361: INFO: common: created directory at: artifacts/search_engine]
{'skills': 'python, r, sql, postgresql, machine learning, git, linux, java, apache spark, mysql, stata, numpy, pandas, scikit-learn, matplotlib', '@scores': {'text_match': 0.13517979, 'cosine_similarity': 0.5679546}, 'province_ext': 'Roma', 'third_last_job': 'ricercatore statistico', 'second_last_job': 'analista funzionale', 'location': 'roma', 'candidateid': '5562519', 'last_job': 'data scientist', 'age': '33'}

{'skills': 'python, java, c, pandas, pydicom, machine learning, intelligenza artificiale, programmazione web, basi di dati', '@scores': {'text_match': 0.00490691, '

'\nTODO\n\n1. search service as is, filter on age. then crate app\n2. add distance_km column in data_transformation and include it when creating the search service, then compute it based on location inserted in the app. and use it as filter\n3. add hard skills in the filter (must convert skills column from a string of comma-separated skills to a vector of skills)\n'

In [19]:
session.sql("""select * 
            from IT_DISCOVERY.CONSUMER_INT_MODEL.MPG_IT_AUTOMATCH_CANDIDATE_FEATURES
            where distance_km < 20 and
            """).collect()

SnowparkSQLException: (1304): 01bfdb32-0207-6a1b-0000-7309447ea38e: 001003 (42000): SQL compilation error:
syntax error line 3 at position 38 unexpected '<EOF>'.

In [None]:
session.sql("DESC TABLE IT_DISCOVERY.CONSUMER_INT_MODEL.MPG_IT_AUTOMATCH_CANDIDATE_FEATURES;").collect()

[Row(name='DATE_ADDED', type='TIMESTAMP_NTZ(9)', kind='COLUMN', null?='Y', default=None, primary key='N', unique key='N', check=None, expression=None, comment=None, policy name=None, privacy domain=None),
 Row(name='CANDIDATEID', type='NUMBER(38,0)', kind='COLUMN', null?='Y', default=None, primary key='N', unique key='N', check=None, expression=None, comment=None, policy name=None, privacy domain=None),
 Row(name='LOCATION', type='VARCHAR(16777216)', kind='COLUMN', null?='Y', default=None, primary key='N', unique key='N', check=None, expression=None, comment=None, policy name=None, privacy domain=None),
 Row(name='LAST_JOB', type='VARCHAR(16777216)', kind='COLUMN', null?='Y', default=None, primary key='N', unique key='N', check=None, expression=None, comment=None, policy name=None, privacy domain=None),
 Row(name='SECOND_LAST_JOB', type='VARCHAR(16777216)', kind='COLUMN', null?='Y', default=None, primary key='N', unique key='N', check=None, expression=None, comment=None, policy name=No

In [None]:

session.sql("""
SELECT PARSE_JSON(
  SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
      'CANDIDATE_SEARCH_SERVICE',
'{
        "query": "Data Scientist with experience in Python and Java ",
        "columns":[
            "last_job",
            "second_last_job",
            "skills"
        ],
        "limit":5
      }'
  )
)['results'] as results;
""").collect()

[Row(RESULTS='[\n  {\n    "@scores": {\n      "cosine_similarity": 0.59865344,\n      "text_match": 0.023422932\n    },\n    "last_job": "data analyst",\n    "second_last_job": "executive engineer",\n    "skills": "python, javascript, java, c/c++, sql, c#, html, css, pandas, numpy, power bi, tableau, matplotlib, sql server, mongodb, sqlite, pytorch, hugging face transformers, spacy, bert, scikit-learn, q-learning, genetic algorithms, svm, gmm, logistic regression, weights & biases, runpod, databricks, firebase, react, node.js, express, django, langchain, hadoop, spark, git, jupyter, vs code, docker, agile/scrum"\n  },\n  {\n    "@scores": {\n      "cosine_similarity": 0.55122876,\n      "text_match": 0.070841834\n    },\n    "last_job": "smart passenger information system",\n    "second_last_job": "data scientist",\n    "skills": "python, java, sql, scikit-learn, tensorflow, keras, pytorch, oracle sql, mysql, mongodb, pyspark, microsoft power bi, tableau, matplotlib, seaborn, git, micr