#### Relevant imports

In [None]:
import csv
import io
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
from groq import Groq
from sqlalchemy import create_engine, text, Result

>Create a Groq Client

In [2]:
# Load variables from a local .env file (if any) into the process environment
load_dotenv()
# Groq API client used by the final LLM cell
# NOTE(review): presumably picks up its API key from the environment loaded above — confirm
client = Groq()

**Embedder**  
The embedding model is taken from the Sentence Transformers library  
It is used to perform the semantic search

In [3]:
# Load embedder model
# It is read as the global `model` by semantic_similarity_rank below
# Alternative model, kept for reference:
# model = SentenceTransformer('all-mpnet-base-v2')
model = SentenceTransformer('all-MiniLM-L6-v2')

**Similarity Search**  
This function performs a semantic similarity search for a string  
It finds the strings that are closest in meaning to the search string  
The reference strings are re-ordered based on their similarity  
If a similarity threshold is provided, only the strings that reach it are returned

In [4]:
def semantic_similarity_rank(
    search_string: str,
    sentences: list[str],
    threshold: float = 0.0
) -> tuple[list[str], list[int]]:
    """
    Ranks sentences based on semantic similarity to the search_string.

    The query and the candidate sentences are embedded with the module-level
    ``model`` and compared by cosine similarity.

    Args:
        search_string (str): The input query.
        sentences (list[str]): List of sentences to compare.
        threshold (float): Minimum cosine similarity a sentence needs in order
            to be kept. The default 0.0 disables filtering, so sentences with
            a negative score are kept as well.

    Returns:
        tuple: (reordered_sentences, original_indexes), both ordered from the
        most to the least similar sentence. original_indexes holds the position
        of each kept sentence in the input list. Both lists are empty when
        sentences is empty or when no sentence reaches the threshold.
    """

    # Nothing to rank: return early instead of encoding and comparing an empty batch
    if len(sentences) == 0:
        return [], []

    # Encode search string and sentence list
    search_embedding = model.encode(search_string, convert_to_tensor=True)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

    # Compute cosine similarity scores; the first row belongs to the single query
    cosine_scores = util.cos_sim(search_embedding, sentence_embeddings)[0]

    # Pair sentences with their original index and score (converted to float once)
    indexed_scores = [
        (index, sentence, float(score))
        for index, (sentence, score) in enumerate(zip(sentences, cosine_scores))
    ]

    # A threshold of exactly 0.0 means "no threshold"
    if threshold != 0.0:
        indexed_scores = [item for item in indexed_scores if item[2] >= threshold]

    # Sort by score descending (ties keep their original relative order)
    indexed_scores.sort(key=lambda item: item[2], reverse=True)

    # Extract reordered sentences and original indices
    reordered_sentences = [sentence for _, sentence, _ in indexed_scores]
    original_indexes = [index for index, _, _ in indexed_scores]

    # Output the reordered sentences and their indices in the original list
    return reordered_sentences, original_indexes
    
    # # Additional output
    # scores = [sc[2] for sc in indexed_scores]
    # return reordered_sentences, original_indexes, scores


>Let's try using it for sentence similarity search.  
>Some sentences are provided with varied meaning  
>See how it picks the relevance with search string

In [None]:
# Example usage of semantic similarity: sentences on unrelated topics
sentences = [
    "Software update improved cloud data processing.",
    "Researchers study atmospheric carbon capture.",
    "Legislators debated new trade agreement's economic impact.",
    "Personalized medicine targets cancer with genetics.",
    "Oil price swings impact consumer spending.",
    "Community programs address urban food deserts.",
    "Adaptive learning boosts STEM student engagement.",
    "Basel exhibition challenged art perceptions.",
    "Heavy monsoon rains disrupted regional transport.",
    "Court ruling set AI content IP precedent.",
    "Optimizing supply chain cuts production costs.",
    "Martian rover data hints at subsurface ice.",
    "Phishing attack compromised corporate credentials.",
    "Smart cities prioritize green transport.",
    "Drought-resistant crops ensure food security.",
    "Underdog team's win made sports headlines.",
    "Pompeii finds reveal Roman daily life.",
    "Sustainable tourism gains eco-conscious travelers.",
    "Cognitive biases influence financial decisions.",
    "Report found discrepancies in project spending.",
]

search = 'AI is interesting'

# Rank every sentence by how close its meaning is to the search string
sentence_re_ordered, index_re_oredered = semantic_similarity_rank(search, sentences)

# Show the original positions in ranked order, followed by the ranked sentences
print(index_re_oredered)
for snt in sentence_re_ordered:
    print(snt)


In [None]:
# Second example: device-related sentences against a loosely related query
sentences = [
    'This phone has good battery',
    'Laptop gets slow after a while',
    'My PC is not very efficient',
    'My smart phone has clarity of sound',
]

search = 'Too much screen time is not good'

# Rank every sentence by how close its meaning is to the search string
sentence_re_ordered, index_re_oredered = semantic_similarity_rank(search, sentences)

# Show the original positions in ranked order, followed by the ranked sentences
print(index_re_oredered)
for snt in sentence_re_ordered:
    print(snt)

**Utility**  
Utility functions that are required for handling CSV results  
CSV format is chosen considering the number of rows that need to be handled by the LLM  
With JSON, the context may become oversized and the limits might be hit

In [5]:
def result_to_csv_string(result: Result, delimiter: str = ',') -> tuple[str, int]:
    """
    Serialise a query result into CSV-formatted text.

    Args:
        result (Result): The object returned by conn.execute().
        delimiter (str): Field separator used in the CSV text (comma by default).

    Returns:
        tuple: (CSV text whose first line holds the column names, number of data rows)
    """
    # Column names first, then every row the result still holds
    column_names = result.keys()
    records = result.fetchall()

    with io.StringIO() as buffer:
        csv_writer = csv.writer(buffer, delimiter=delimiter)

        # Header line followed by one line per record
        csv_writer.writerow(column_names)
        for record in records:
            csv_writer.writerow(record)

        return buffer.getvalue(), len(records)


def extract_column_from_csv(csv_text: str, column_name: str, delimiter: str = ',') -> list[str]:
    """
    Extracts a column from CSV text as a list of strings based on the column header.

    One value is returned per data row, in row order. A row that is too short
    to reach the column contributes an empty string.

    Args:
        csv_text (str): The full CSV content as string.
        column_name (str): The name of the column to extract.
        delimiter (str): The CSV delimiter (default is ',').

    Returns:
        list[str]: List of values from the specified column. Empty when the
        CSV text is empty or has no column with that name.
    """
    # restval='' makes short rows yield '' instead of None for missing fields,
    # so every returned item really is a string
    reader = csv.DictReader(io.StringIO(csv_text), delimiter=delimiter, restval='')

    # fieldnames is None for empty input; an unknown column yields no values
    if column_name not in (reader.fieldnames or ()):
        return []

    return [row[column_name] for row in reader]


def filter_csv_rows_by_index(csv_text: str, row_indexes: list[int], delimiter: str = ',') -> str:
    """
    Build a new CSV text that keeps the header and only the requested data rows.

    Rows are emitted in the order given by row_indexes. Indexes that are
    negative or beyond the last data row are skipped.

    Args:
        csv_text (str): The full CSV content as string.
        row_indexes (list[int]): 0-based positions of the data rows to keep
            (the header row is not counted).
        delimiter (str): The CSV delimiter (default is ',').

    Returns:
        str: CSV text with the header followed by the selected rows, or an
        empty string when csv_text contains no rows at all.
    """
    parsed = list(csv.reader(io.StringIO(csv_text), delimiter=delimiter))
    if len(parsed) == 0:
        return ""

    # First parsed row is the header, the rest are data rows
    header, *body = parsed

    picked = []
    for position in row_indexes:
        # Skip positions that do not address an existing data row
        if position < 0 or position >= len(body):
            continue
        picked.append(body[position])

    buffer = io.StringIO()
    csv.writer(buffer, delimiter=delimiter).writerows([header, *picked])
    return buffer.getvalue()

**Instructions**  
The query instruction is adapted to consider the user's product choice along with the question / prompt  

In [14]:
# System prompt for the answering LLM.
# Built from adjacent string literals: a backslash continuation inside a single
# literal would embed the source indentation of each continued line in the prompt.
R_Instr = (
    "Using the context given, provide response to the user question or statement. "
    "Context is provided as CSV formatted string. "
    "Answer to the question with details"
)

>SQLite Engine with connection created  
>The product catalogue database is used

In [6]:
# There is an engine instance created, which can handle multiple connections
# NOTE(review): the database path looks relative, so it presumably resolves against
# the kernel's working directory — confirm
sql_engine = create_engine("sqlite:///Sample_3 - Copy.db")
# Single connection shared by the query cells below
# NOTE(review): no matching close() is visible in this section
conn = sql_engine.connect ()

**Product category**  
From the user query, identify the product category by making a semantic search  
The distinct values of the product type are matched against the user prompt  
A product type is selected only if it is relevant enough, i.e. it reaches the given similarity threshold

In [None]:
Prompt = "I need a ear phone which has durable battery"
# Prompt = "Give me some latest monitor that has good brightness"
# Prompt = "Can you suggest me a Phone that has sensitive touch screen and durable battery?"
# Prompt = "Which is the good TV?"

# Minimum cosine similarity a product type needs to count as relevant to the prompt
CATEGORY_THRESHOLD = 0.3

# Identify Product type by semantic search
# (plain string literal: the statement has no placeholders, so no f-string is needed)
result = conn.execute(text("""
                            SELECT DISTINCT Product_Type from product_catalogue
                              """))

# Get the Product categories from database and make it into a list
rows = result.fetchall()
Categories = [str(row[0]) for row in rows]

# Rank the categories against the prompt; only those reaching the threshold are kept
Shortlist, Order = semantic_similarity_rank(Prompt, Categories, CATEGORY_THRESHOLD)
Shortlist


**Query**  
Filter based on the product type that is identified.  
The result is then fed for further semantic search

In [None]:
# Query from DB for the Product type that is identified
# Fail with a clear message when no category reached the similarity threshold
if not Shortlist:
    raise ValueError("No product category matched the prompt; nothing to query")

# The product type is passed as a bound parameter instead of being formatted
# into the SQL text, so a value containing quotes cannot break or alter the statement
result = conn.execute(
    text("SELECT * from product_catalogue WHERE Product_Type = :product_type"),
    {"product_type": Shortlist[0]},
)

# The output is then converted into CSV text
CSV_Result, Nb_Rows = result_to_csv_string(result)
print(Nb_Rows)

# Extract the User Feedback Column
Feedback = extract_column_from_csv(CSV_Result, 'User_Feedback')



**Semantic Search**  
Since the feedback column is textual data, semantic search is applied to identify the relevant rows  
The top k rows are then selected based on the re-ordered ranking  
These rows are used as context for the LLM when answering the user query  

In [None]:
# How many to be filtered
top_k = 10

# Make semantic similarity in feedback column and get the order by relevance.
# The ranked texts get their own name so that `Shortlist` (the matched product
# types, read by the query cell) is not overwritten and that cell stays re-runnable.
Ranked_Feedback, Order = semantic_similarity_rank(Prompt, Feedback)
print(Order)

# Keep the original row indices of the top_k most relevant feedback texts
Filter = Order[:top_k]

# Filter the CSV content by required row numbers
Context = filter_csv_rows_by_index(CSV_Result, Filter)
# print (Context)


**LLM answer**  
The context, filtered by category and by semantic relevance, is then provided to the LLM  
The user query is answered using this context 

In [None]:
# Conversation sent to the model: the system instruction first, then the
# filtered CSV context together with the user's prompt
messages = [
    dict(role="system", content=R_Instr),
    dict(role="user", content="".join(["Context : \n", Context, "Query : \n", Prompt])),
]

# Ask the Groq chat-completions endpoint for an answer
completion = client.chat.completions.create(model="llama3-70b-8192", messages=messages)

# Show the text of the first returned choice
print(completion.choices[0].message.content)