# Load The Dataset

In [1]:
import pandas as pd
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from sentence_transformers import SentenceTransformer
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Qdrant
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
import requests
from langchain.schema import BaseRetriever

  from tqdm.autonotebook import tqdm, trange


In [2]:
df = pd.read_csv("/home/ubuntu/ModernArabertandTicketClassification/Scope 2/Embedding File/customer_support_tickets.csv")

In [3]:
df.head()

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [4]:
def transform_row(row):
    return {
        "ticket_id": row["Ticket ID"], 
        "summary": row["Ticket Subject"], 
        "description": row["Ticket Description"],  
        "priority": row["Ticket Priority"],  
        "status": row["Ticket Status"],  
        "reporter": row["Customer Email"],  
        "label": row["Ticket Type"],  
        "created_at": row["Date of Purchase"],
    }

# Apply transformation to each row
transformed_data = df.apply(transform_row, axis=1)

# Convert the transformed data into a new DataFrame
df_Jira = pd.DataFrame(transformed_data.tolist())

In [5]:
df_Jira.head()

Unnamed: 0,ticket_id,summary,description,priority,status,reporter,label,created_at
0,1,Product setup,I'm having an issue with the {product_purchase...,Critical,Pending Customer Response,carrollallison@example.com,Technical issue,2021-03-22
1,2,Peripheral compatibility,I'm having an issue with the {product_purchase...,Critical,Pending Customer Response,clarkeashley@example.com,Technical issue,2021-05-22
2,3,Network problem,I'm facing a problem with my {product_purchase...,Low,Closed,gonzalestracy@example.com,Technical issue,2020-07-14
3,4,Account access,I'm having an issue with the {product_purchase...,Low,Closed,bradleyolson@example.org,Billing inquiry,2020-11-13
4,5,Data loss,I'm having an issue with the {product_purchase...,Low,Closed,bradleymark@example.com,Billing inquiry,2020-02-04


In [6]:
train_df, test_df_ = train_test_split(df_Jira, test_size=0.2, random_state=42)

test_labels = test_df_['label'].copy()

test_df = test_df_.drop(columns=['label'])

In [7]:
test_df.head()

Unnamed: 0,ticket_id,summary,description,priority,status,reporter,created_at
4830,4831,Product setup,I'm having an issue with the {product_purchase...,High,Pending Customer Response,debra98@example.net,2020-03-24
7075,7076,Battery life,I'm having trouble connecting my {product_purc...,Low,Closed,carsonjames@example.net,2021-01-22
4715,4716,Refund request,I'm having an issue with the {product_purchase...,Low,Open,taylorjames@example.com,2020-06-07
2022,2023,Peripheral compatibility,I'm having an issue with the {product_purchase...,High,Closed,martinezkenneth@example.org,2021-02-20
676,677,Peripheral compatibility,I'm having an issue with the {product_purchase...,Medium,Open,rivasdavid@example.org,2021-08-01


In [9]:
url = "http://localhost:6333/collections/ticket_embeddings_paraphrase-MiniLM-L3-v2"
# response = requests.delete(url)
response = requests.get(url)


print(response.status_code)  # Should print 200 if successful
print(response.json())       # Print the server's response

200
{'result': {'status': 'green', 'optimizer_status': 'ok', 'indexed_vectors_count': 0, 'points_count': 0, 'segments_count': 8, 'config': {'params': {'vectors': {'size': 384, 'distance': 'Cosine'}, 'shard_number': 1, 'replication_factor': 1, 'write_consistency_factor': 1, 'on_disk_payload': True}, 'hnsw_config': {'m': 16, 'ef_construct': 100, 'full_scan_threshold': 10000, 'max_indexing_threads': 0, 'on_disk': False}, 'optimizer_config': {'deleted_threshold': 0.2, 'vacuum_min_vector_number': 1000, 'default_segment_number': 0, 'max_segment_size': None, 'memmap_threshold': None, 'indexing_threshold': 20000, 'flush_interval_sec': 5, 'max_optimization_threads': None}, 'wal_config': {'wal_capacity_mb': 32, 'wal_segments_ahead': 0}, 'quantization_config': None, 'strict_mode_config': {'enabled': False}}, 'payload_schema': {}}, 'status': 'ok', 'time': 0.003894824}


In [37]:
# Connect to Qdrant
client = QdrantClient("http://localhost:6333")  

# Create a collection for embeddings
client.recreate_collection(
    collection_name="ticket_embeddings_paraphrase-MiniLM-L3-v2",
    vectors_config=VectorParams(
        size=384,  # MiniLM L3 size is 384
        distance="Cosine"
    )
)


  client.recreate_collection(


True

In [38]:
def transform_train(row):
    return {
        "ticket_id": row["ticket_id"], 
        "summary": row["summary"], 
        "description": row["description"],  
        "priority": row["priority"],  
        "status": row["status"],  
        "reporter": row["reporter"],  
        "label": row["label"],  
        "created_at": row["created_at"],
    }

# Apply transformation to each row
transformed_data = train_df.apply(transform_train, axis=1)
print(transformed_data[0])

{'ticket_id': 1, 'summary': 'Product setup', 'description': "I'm having an issue with the {product_purchased}. Please assist.\n\nYour billing zip code is: 71701.\n\nWe appreciate that you have requested a website address.\n\nPlease double check your email address. I've tried troubleshooting steps mentioned in the user manual, but the issue persists.", 'priority': 'Critical', 'status': 'Pending Customer Response', 'reporter': 'carrollallison@example.com', 'label': 'Technical issue', 'created_at': '2021-03-22'}


## Creating Embeddings and Using Vector database to do Similarity Search

In [39]:
# Using Vector Database to do Similarity Search (In this cell and the next one)
# Define the embedding models
models = {
    "RoBERTa": SentenceTransformer("sentence-transformers/all-roberta-large-v1"),
    "all-MiniLM-L6-v2": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    "all-mpnet-base-v2": SentenceTransformer("sentence-transformers/all-mpnet-base-v2"),
    "modernbert-embed-base": SentenceTransformer("nomic-ai/modernbert-embed-base"), # Error 404 (Private) Must have access to downlaod
    "bge-small-en": SentenceTransformer("BAAI/bge-small-en"),
    "paraphrase-MiniLM-L6-v2": SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2"),
    "paraphrase-MiniLM-L3-v2": SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2"),
    "all-distilroberta-v1": SentenceTransformer("sentence-transformers/all-distilroberta-v1"),
}

# Select the model to use
model = models["paraphrase-MiniLM-L3-v2"]

# Generate embeddings for train data
texts_to_embed = [
    f"Ticket ID: {row['ticket_id']}, Summary: {row['summary']}, "
    f"Description: {row['description']}, Priority: {row['priority']}, "
    f"Status: {row['status']}, Reporter: {row['reporter']}, "
    f"Label: {row['label']}, Created At: {row['created_at']}"
    for row in transformed_data
]

# Encode all ticket information
train_embeddings = model.encode(texts_to_embed, show_progress_bar=True)

# Upload embeddings and metadata to Qdrant
client.upload_collection(
    collection_name="ticket_embeddings_paraphrase-MiniLM-L3-v2",
    vectors=train_embeddings,
    payload=[
        {
            "ticket_id": row["ticket_id"],
            "summary": row["summary"],
            "description": row["description"],
            "priority": row["priority"],
            "status": row["status"],
            "reporter": row["reporter"],
            "label": row["label"],
            "created_at": row["created_at"]
        }
        for row in transformed_data
    ]
)

print("Embeddings successfully uploaded to Qdrant!")

Batches: 100%|██████████| 212/212 [00:40<00:00,  5.24it/s]


Embeddings successfully uploaded to Qdrant!


In [None]:
# Select a single ticket from test_df
test_ticket = test_df.iloc[10]  # Select the ticket (adjust index as needed)

# Combine fields for embedding
test_text = (
    f"Ticket ID: {test_ticket['ticket_id']}, Summary: {test_ticket['summary']}, "
    f"Description: {test_ticket['description']}, Priority: {test_ticket['priority']}, "
    f"Status: {test_ticket['status']}, Reporter: {test_ticket['reporter']}, "
    f"Created At: {test_ticket['created_at']}"
)

# Generate embedding for the test ticket
test_embedding = model.encode(test_text)

# Perform similarity search in Qdrant
search_results = client.search(
    collection_name="ticket_embeddings_paraphrase-MiniLM-L3-v2",
    query_vector=test_embedding,
    limit=5  # Retrieve top 5 similar tickets
)

# Extract scores and labels
scores = np.array([result.score for result in search_results])
labels = [result.payload["label"] for result in search_results]

# Apply Softmax to normalize scores
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Stability trick: subtract max
    return exp_x / exp_x.sum()

normalized_scores = softmax(scores)

# Print results
for label, raw_score, norm_score in zip(labels, scores, normalized_scores):
    print(f"Label: {label}, Raw Score: {raw_score:.4f}, Confidence Score: {norm_score:.4f}")
    

Label: Technical issue, Raw Score: 0.8649, Confidence Score: 0.2015
Label: Refund request, Raw Score: 0.8623, Confidence Score: 0.2010
Label: Cancellation request, Raw Score: 0.8540, Confidence Score: 0.1994
Label: Cancellation request, Raw Score: 0.8527, Confidence Score: 0.1991
Label: Billing inquiry, Raw Score: 0.8520, Confidence Score: 0.1990


  search_results = client.search(


## Using LangChain Retrievers (Does the Similarity Search)

In [49]:
# -----------------------------------------------------------------------------
# STEP A: CONNECT TO QDRANT & CREATE COLLECTION
# -----------------------------------------------------------------------------
client = QdrantClient("http://localhost:6333")

# Create/recreate a collection (adjust vector size to match your embedding model)
collection_name = "ticket_embeddings_all-MiniLM-L6-v2"
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=384,     # The dimensionality of the embedding model
        distance="Cosine"
    )
)

# -----------------------------------------------------------------------------
# STEP B: PREPARE YOUR DATA AS A LIST OF DOCUMENTS
# -----------------------------------------------------------------------------

docs = []
for row in transformed_data:
    page_content = (
        f"Ticket ID: {row['ticket_id']}, Summary: {row['summary']}, "
        f"Description: {row['description']}, Priority: {row['priority']}, "
        f"Status: {row['status']}, Reporter: {row['reporter']}, "
        f"Label: {row['label']}, Created At: {row['created_at']}"
    )
    metadata = {
        "ticket_id": row["ticket_id"],
        "summary": row["summary"],
        "description": row["description"],
        "priority": row["priority"],
        "status": row["status"],
        "reporter": row["reporter"],
        "label": row["label"],
        "created_at": row["created_at"]
    }
    docs.append(Document(page_content=page_content, metadata=metadata))

# -----------------------------------------------------------------------------
# STEP C: CREATE THE LANGCHAIN EMBEDDING FUNCTION & VECTORSTORE
# -----------------------------------------------------------------------------
# Use SentenceTransformer via LangChain’s Embedding wrapper
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_function = SentenceTransformerEmbeddings(model_name=model_name)

# If you want LangChain to handle indexing automatically, do:
vectorstore = Qdrant.from_documents(
    documents=docs,
    embedding=embedding_function,
    url="http://localhost:6333",           
    collection_name=collection_name,
    api_key=None                           
)

# Uploading manually 

# vectorstore = Qdrant(
#     client=client,
#     collection_name=collection_name,
#     embedding=embedding_function
# )

# -----------------------------------------------------------------------------
# STEP D: CREATE THE RETRIEVER AND QUERY
# -----------------------------------------------------------------------------
retriever: BaseRetriever = vectorstore.as_retriever(
    search_kwargs={"k": 5}  # Return top 5 similar tickets
)

# Example test ticket:
# (If you have a DataFrame named test_df)
test_ticket = test_df.iloc[10]
test_text = (
    f"Ticket ID: {test_ticket['ticket_id']}, "
    f"Summary: {test_ticket['summary']}, "
    f"Description: {test_ticket['description']}, "
    f"Priority: {test_ticket['priority']}, "
    f"Status: {test_ticket['status']}, "
    f"Reporter: {test_ticket['reporter']}, "
    f"Created At: {test_ticket['created_at']}"
)

retrieved_docs = retriever.get_relevant_documents(test_text)

# -----------------------------------------------------------------------------
# STEP E: VIEW RESULTS
# -----------------------------------------------------------------------------
for doc in retrieved_docs:
    print("Retrieved label:", doc.metadata.get("label"))
    print("Score is not directly exposed here, but doc metadata is shown below:")
    print(doc.metadata)
    print("Document excerpt:", doc.page_content[:100], "...")
    print("--------------------------------------------------")


  client.recreate_collection(


Retrieved label: Technical issue
Score is not directly exposed here, but doc metadata is shown below:
{'ticket_id': 7329, 'summary': 'Account access', 'description': "I'm having an issue with the {product_purchased}. Please assist.\n\n\nIt will be taken care of by your team to do so as soon as possible for you. Be patient!!!\n\n\nThis is a very basic product. I've tried clearing the cache and data for the {product_purchased} app, but the issue persists.", 'priority': 'Low', 'status': 'Closed', 'reporter': 'william31@example.net', 'label': 'Technical issue', 'created_at': '2021-10-21', '_id': 'b8e200f6-21a2-46ce-96bf-7e60c320e14a', '_collection_name': 'ticket_embeddings_all-MiniLM-L6-v2'}
Document excerpt: Ticket ID: 7329, Summary: Account access, Description: I'm having an issue with the {product_purchas ...
--------------------------------------------------
Retrieved label: Billing inquiry
Score is not directly exposed here, but doc metadata is shown below:
{'ticket_id': 2161, 'summar

In [24]:
# -----------------------------------------------------------------------------
# STEP A: CONNECT TO QDRANT (ASSUMING YOUR COLLECTION ALREADY EXISTS)
# -----------------------------------------------------------------------------
client = QdrantClient("http://localhost:6333")  # or your Qdrant endpoint

collection_name = "ticket_embeddings_paraphrase-MiniLM-L3-v2"

# Create/recreate a collection (adjust vector size to match your embedding model)
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=384,     # The dimensionality of the embedding model
        distance="Cosine"
    )
)

# -----------------------------------------------------------------------------
# STEP B: IF YOU NEED TO LOAD NEW DATA, CREATE A DOC LIST
# -----------------------------------------------------------------------------
docs = []
for row in transformed_data:
    page_content = (
        f"Ticket ID: {row['ticket_id']}, Summary: {row['summary']}, "
        f"Description: {row['description']}, Priority: {row['priority']}, "
        f"Status: {row['status']}, Reporter: {row['reporter']}, "
        f"Label: {row['label']}, Created At: {row['created_at']}"
    )
    metadata = {
        "ticket_id": row["ticket_id"],
        "summary": row["summary"],
        "description": row["description"],
        "priority": row["priority"],
        "status": row["status"],
        "reporter": row["reporter"],
        "label": row["label"],
        # "created_at": row["created_at"]
    }
    docs.append(Document(page_content=page_content, metadata=metadata))

# -----------------------------------------------------------------------------
# STEP C: CREATE THE LANGCHAIN EMBEDDING FUNCTION & QDRANT VECTORSTORE
# -----------------------------------------------------------------------------
# -- Instead of storing actual SentenceTransformer objects, store model string names:
models = {
    # "RoBERTa": "sentence-transformers/all-roberta-large-v1",
    # "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    # "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    # "modernbert-embed-base": "nomic-ai/modernbert-embed-base",
    # "bge-small-en": "BAAI/bge-small-en",
    # "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
    # "paraphrase-MiniLM-L3-v2": "sentence-transformers/paraphrase-MiniLM-L3-v2",
    "all-distilroberta-v1": "sentence-transformers/all-distilroberta-v1",
}

# Pick the model's string identifier from the dictionary
model_name = models["paraphrase-MiniLM-L3-v2"]

# Pass the string to SentenceTransformerEmbeddings
embedding_function = SentenceTransformerEmbeddings(model_name=model_name)

# If you're inserting/updating documents (this indexes them into Qdrant)
vectorstore = Qdrant.from_documents(
    documents=docs,
    embedding=embedding_function,
    url="http://localhost:6333",
    collection_name=collection_name,
    api_key=None  # or your API key if needed
)

# -----------------------------------------------------------------------------
# STEP D: SEARCH WITH SCORES & COMPUTE CONFIDENCE (SOFTMAX)
# -----------------------------------------------------------------------------
test_ticket = test_df.iloc[10]
test_text = (
    f"Ticket ID: {test_ticket['ticket_id']}, "
    f"Summary: {test_ticket['summary']}, "
    f"Description: {test_ticket['description']}, "
    f"Priority: {test_ticket['priority']}, "
    f"Status: {test_ticket['status']}, "
    f"Reporter: {test_ticket['reporter']}, "
    f"Created At: {test_ticket['created_at']}"
)

# Search for the top 5 matches
results = vectorstore.similarity_search_with_score(query=test_text, k=5)

# Extract scores
scores = np.array([score for _, score in results])

# Define a softmax function
def softmax(x: np.ndarray):
    # Subtract max for numerical stability
    exp_x = np.exp(x - np.max(x))
    return exp_x / exp_x.sum()

# Compute confidence scores
confidence_scores = softmax(scores)

# -----------------------------------------------------------------------------
# STEP E: PRINT RESULTS
# -----------------------------------------------------------------------------
for (doc, raw_score), conf_score in zip(results, confidence_scores):
    print("Label:", doc.metadata["label"])
    print(f"Raw Score (similarity): {raw_score:.4f}")
    print(f"Confidence (softmax): {conf_score:.4f}")
    print("Excerpt:", doc.page_content[:80], "...")
    print("--------------------------------------------------")


  client.recreate_collection(


Label: Technical issue
Raw Score (similarity): 0.8649
Confidence (softmax): 0.2015
Excerpt: Ticket ID: 414, Summary: Product compatibility, Description: I'm having an issue ...
--------------------------------------------------
Label: Refund request
Raw Score (similarity): 0.8623
Confidence (softmax): 0.2010
Excerpt: Ticket ID: 1409, Summary: Account access, Description: I'm having an issue with  ...
--------------------------------------------------
Label: Cancellation request
Raw Score (similarity): 0.8540
Confidence (softmax): 0.1994
Excerpt: Ticket ID: 7488, Summary: Data loss, Description: I'm having an issue with the { ...
--------------------------------------------------
Label: Cancellation request
Raw Score (similarity): 0.8527
Confidence (softmax): 0.1991
Excerpt: Ticket ID: 2859, Summary: Account access, Description: I'm having an issue with  ...
--------------------------------------------------
Label: Billing inquiry
Raw Score (similarity): 0.8520
Confidence (softmax): 0.1

## Models Benchmarks

In [8]:
###############################################################################
# REFACTORED MODEL MAPPINGS
###############################################################################
# We store Hugging Face model strings here. 
# If the "bge-small-en" key needs to load "BAAI/bge-small-en", we explicitly map it.
models = {
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    "paraphrase-MiniLM-L3-v2": "sentence-transformers/paraphrase-MiniLM-L3-v2",
    "bge-small-en": "BAAI/bge-small-en",
    "paraphrase-MiniLM-L6-v2": "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "all-distilroberta-v1": "sentence-transformers/all-distilroberta-v1",
}

###############################################################################
# HELPER FUNCTIONS
###############################################################################
def build_langchain_docs(transformed_data):
    """Convert a list of ticket dicts into a list of LangChain Documents."""
    docs = []
    for row in transformed_data:
        page_content = (
            f" Summary: {row['summary']}, "
            f"Description: {row['description']}, Priority: {row['priority']}, "
        )
        metadata = {

            "label": row["label"],
        }
        docs.append(Document(page_content=page_content, metadata=metadata))
    return docs

def compute_softmax(scores):
    """Compute softmax over an array of scores."""
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / exp_scores.sum()

def majority_vote_label(retrieved_docs):
    """Optional approach if you'd prefer a majority vote from top-K neighbors."""
    label_list = [doc.metadata["label"] for doc in retrieved_docs]
    return max(set(label_list), key=label_list.count)

###############################################################################
# MAIN BENCHMARK LOGIC
###############################################################################

# 1) Connect to Qdrant (assuming a local instance).
client = QdrantClient(url="http://localhost:6333")

# 2) Convert your training data to documents
train_docs = build_langchain_docs(transformed_data)

# 3) For each model, do the following:
#    - Create/recreate a Qdrant collection
#    - Create embeddings via LangChain
#    - Insert the train docs into Qdrant
#    - For each test ticket, retrieve top-K neighbors and pick a predicted label
#    - Compare predicted vs. actual labels
#    - Calculate metrics
results_summary = {}

for model_key, hf_model_id in models.items():
    print(f"======================== MODEL: {model_key} ========================")
    
    # A) Create the collection name for this model
    collection_name = f"ticket_embeddings_{model_key}"
    
    # B) Recreate the collection (this will wipe if it already exists)
    #    We first load the dimension by creating a temporary SentenceTransformer instance:
    temp_model = SentenceTransformer(hf_model_id)
    dim = temp_model.get_sentence_embedding_dimension()
    
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=dim, distance="Cosine")
    )
    
    # C) Build the LangChain embedding function using the correct HF ID
    embedding_function = SentenceTransformerEmbeddings(model_name=hf_model_id)
    
    # D) Create Qdrant Vectorstore for this model
    vectorstore = Qdrant.from_documents(
        documents=train_docs,
        embedding=embedding_function,
        url="http://localhost:6333",
        collection_name=collection_name
    )
    
    # E) Evaluate on your test set
    y_true = []
    y_pred = []
    k = 5
    
    for idx, row in test_df.iterrows():
        test_text = (
            f"Summary: {row['summary']}, "
            f"Description: {row['description']}, Priority: {row['priority']}, "
        )
        true_label = test_labels[idx]
        
        # Retrieve top-K neighbors
        retrieved = vectorstore.similarity_search_with_score(test_text, k=k)
        
        if not retrieved:
            predicted_label = None
        else:
            top_doc, top_score = retrieved[0]
            predicted_label = top_doc.metadata["label"]
        
        y_true.append(true_label)
        y_pred.append(predicted_label)
    
    # F) Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print("Full classification report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    
    results_summary[model_key] = {
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall
    }

print("\n===================== AGGREGATE BENCHMARK RESULTS =====================")
for m, metrics in results_summary.items():
    print(f"Model: {m}")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")
    print("----------------------------------------------------------------------")




  client.recreate_collection(
  embedding_function = SentenceTransformerEmbeddings(model_name=hf_model_id)


Accuracy:  0.9664
F1 Score:  0.9664
Precision: 0.9665
Recall:    0.9664
Full classification report:
                      precision    recall  f1-score   support

     Billing inquiry       0.97      0.96      0.97       357
Cancellation request       0.97      0.97      0.97       327
     Product inquiry       0.94      0.97      0.96       316
      Refund request       0.97      0.97      0.97       345
     Technical issue       0.97      0.96      0.97       349

            accuracy                           0.97      1694
           macro avg       0.97      0.97      0.97      1694
        weighted avg       0.97      0.97      0.97      1694



  client.recreate_collection(


Accuracy:  0.8654
F1 Score:  0.8653
Precision: 0.8656
Recall:    0.8654
Full classification report:
                      precision    recall  f1-score   support

     Billing inquiry       0.89      0.86      0.88       357
Cancellation request       0.86      0.83      0.85       327
     Product inquiry       0.87      0.91      0.89       316
      Refund request       0.86      0.85      0.85       345
     Technical issue       0.85      0.88      0.87       349

            accuracy                           0.87      1694
           macro avg       0.87      0.87      0.87      1694
        weighted avg       0.87      0.87      0.87      1694



  client.recreate_collection(


Accuracy:  0.6942
F1 Score:  0.7080
Precision: 0.7759
Recall:    0.6942
Full classification report:
                      precision    recall  f1-score   support

     Billing inquiry       0.72      0.66      0.69       357
Cancellation request       0.45      0.88      0.60       327
     Product inquiry       0.94      0.68      0.79       316
      Refund request       0.81      0.66      0.73       345
     Technical issue       0.95      0.60      0.74       349

            accuracy                           0.69      1694
           macro avg       0.78      0.70      0.71      1694
        weighted avg       0.78      0.69      0.71      1694



  client.recreate_collection(


Accuracy:  0.9410
F1 Score:  0.9410
Precision: 0.9413
Recall:    0.9410
Full classification report:
                      precision    recall  f1-score   support

     Billing inquiry       0.95      0.95      0.95       357
Cancellation request       0.91      0.94      0.92       327
     Product inquiry       0.95      0.94      0.94       316
      Refund request       0.95      0.93      0.94       345
     Technical issue       0.95      0.94      0.95       349

            accuracy                           0.94      1694
           macro avg       0.94      0.94      0.94      1694
        weighted avg       0.94      0.94      0.94      1694



  client.recreate_collection(


Accuracy:  0.8146
F1 Score:  0.8184
Precision: 0.8404
Recall:    0.8146
Full classification report:
                      precision    recall  f1-score   support

     Billing inquiry       0.86      0.82      0.84       357
Cancellation request       0.73      0.85      0.79       327
     Product inquiry       0.97      0.78      0.86       316
      Refund request       0.67      0.88      0.76       345
     Technical issue       0.97      0.74      0.84       349

            accuracy                           0.81      1694
           macro avg       0.84      0.81      0.82      1694
        weighted avg       0.84      0.81      0.82      1694



  client.recreate_collection(


Accuracy:  0.9693
F1 Score:  0.9693
Precision: 0.9694
Recall:    0.9693
Full classification report:
                      precision    recall  f1-score   support

     Billing inquiry       0.97      0.97      0.97       357
Cancellation request       0.98      0.97      0.97       327
     Product inquiry       0.96      0.98      0.97       316
      Refund request       0.96      0.97      0.97       345
     Technical issue       0.98      0.96      0.97       349

            accuracy                           0.97      1694
           macro avg       0.97      0.97      0.97      1694
        weighted avg       0.97      0.97      0.97      1694


Model: all-MiniLM-L6-v2
  accuracy: 0.9664
  f1_score: 0.9664
  precision: 0.9665
  recall: 0.9664
----------------------------------------------------------------------
Model: all-mpnet-base-v2
  accuracy: 0.8654
  f1_score: 0.8653
  precision: 0.8656
  recall: 0.8654
--------------------------------------------------------------------

In [10]:
test_df.__len__()

1694

In [41]:
model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L3-v2")
test_embedding = model.encode(test_text)
print(f"Test embedding dimension: {len(test_embedding)}")  # Should print 384


Test embedding dimension: 384
