#### Relevant imports

In [1]:
import csv
import io
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
from groq import Groq
from sqlalchemy import create_engine, text, Result
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


**Embedder**  
The embedding model comes from the `sentence-transformers` library.  
It is used to perform the semantic search.

In [2]:
# Instantiate the sentence-embedding model that the similarity search relies on.
# Alternative checkpoint: 'all-mpnet-base-v2'
model = SentenceTransformer(
    model_name_or_path='paraphrase-multilingual-mpnet-base-v2'
)

**Similarity Search**  
This function performs a semantic similarity search for a string.  
It finds the reference strings that are closest in meaning to the search string.  
The reference strings are re-ordered by similarity, most similar first.  
If a similarity threshold is provided, only the strings scoring at or above it are returned.

In [66]:
def semantic_similarity_rank(
    search_string: str,
    sentences: list[str],
    threshold: float = 0.0
) -> tuple[list[str], list[int]]:
    """
    Rank sentences by semantic similarity to the search string.

    Both the query and the sentences are embedded with the module-level
    ``model`` and compared by cosine similarity. Sentences are returned
    best match first, together with their positions in the input list.

    Args:
        search_string (str): The input query.
        sentences (list[str]): List of sentences to compare.
        threshold (float): Minimum cosine similarity a sentence must reach
            to be kept. A value of exactly 0.0 disables filtering, so every
            sentence is returned (including those with negative scores).

    Returns:
        tuple: (reordered_sentences, original_indexes), where
        ``original_indexes[k]`` is the position in ``sentences`` of
        ``reordered_sentences[k]``. Both lists are empty when ``sentences``
        is empty or nothing reaches the threshold.
    """

    # Nothing to rank: return early rather than handing an empty batch to
    # the encoder and the cosine-similarity computation.
    if len(sentences) == 0:
        return [], []

    # Encode search string and sentence list
    search_embedding = model.encode(search_string, convert_to_tensor=True)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

    # Compute cosine similarity scores; there is a single query, so only the
    # first row of the result is needed.
    cosine_scores = util.cos_sim(search_embedding, sentence_embeddings)[0]

    # Convert the tensor to plain Python floats once instead of per element.
    scores = cosine_scores.tolist()

    # Pair sentences with scores and original indices, applying the threshold
    ranked = [
        (idx, sentence, score)
        for idx, (sentence, score) in enumerate(zip(sentences, scores))
        if threshold == 0.0 or score >= threshold
    ]

    # Sort by score descending
    ranked.sort(key=lambda item: item[2], reverse=True)

    # Extract reordered sentences and original indices
    reordered_sentences = [sentence for _, sentence, _ in ranked]
    original_indexes = [idx for idx, _, _ in ranked]

    # Output the reordered sentences and their indices in the original list
    return reordered_sentences, original_indexes
    
    # # Additional output
    # scores = [sc[2] for sc in indexed_scores]
    # return reordered_sentences, original_indexes, scores


**Read Data File**  
The data file containing the call record summaries is read into a data frame, and the summary column is extracted as a list of strings.  

In [67]:
# Load the call records into a data frame, then pull the summary text out as
# a plain list of strings (one entry per row, in row order).
Data = pd.read_csv('call_records.csv')
Summary = Data.loc[:, 'Call Summary'].astype(str).to_list()
# Summary

**Semantic Search**  
For each criterion of interest, a semantic search is run over the call summaries to capture the indices of the matching records.  
A different threshold value is used for each criterion.

In [None]:
# Rank the call summaries against each criterion. Every call returns the
# matching summaries plus their positions in `Summary`, best match first.
# `Shortlist` is overwritten by the second call; only the index lists are
# printed here.
CB_Criteria = "follow up request"
Shortlist, CB_Order = semantic_similarity_rank(
    search_string=CB_Criteria,
    sentences=Summary,
    threshold=0.4,
)
print(CB_Order)

CL_Criteria = "clarification sought"
Shortlist, CL_Order = semantic_similarity_rank(
    search_string=CL_Criteria,
    sentences=Summary,
    threshold=0.3,
)
print(CL_Order)



**Update in CSV**  
Records that match a criterion are marked 'Yes' in a column named after that criterion.  
The updated file can then be used further along in the process as a structured data file.


In [None]:
# Flag the matched rows with 'Yes' in a column named after each criterion.
# CB_Order / CL_Order hold row *positions* (they are produced by enumerate()
# over the summary list), so they are applied positionally with .iloc rather
# than as index labels with .loc; the two only coincide for a default index.
for criterion, matched_positions in (
    (CB_Criteria, CB_Order),
    (CL_Criteria, CL_Order),
):
    if criterion not in Data.columns:
        # Object dtype so the column can hold 'Yes' next to missing values;
        # rows that do not match stay missing and are written as blanks.
        Data[criterion] = pd.Series(float('nan'), index=Data.index, dtype=object)
    if matched_positions:
        Data.iloc[matched_positions, Data.columns.get_loc(criterion)] = 'Yes'

# Persist the annotated records without the index column.
Data.to_csv('call_record_updated.csv', index=False)
Data