In [1]:
import pandas as pd
from typing import List, Literal, Dict
from pydantic import BaseModel, Field, field_validator, model_validator, ValidationError
from openai import OpenAI
import time
import os
from dotenv import find_dotenv, load_dotenv
import logging
import tomli
import re
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances  # note: distance = 1 - similarity
import pandas as pd
from sklearn.cluster import BisectingKMeans

  from .autonotebook import tqdm as notebook_tqdm


# Embedder

In [2]:
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5",trust_remote_code=True)

<All keys matched successfully>


# Test Dataset

In [3]:
Sub_Category_Description = [['The customer received a generic response that did not address their specific query about flight bonuses.', 'The customer was directed to a link for guidelines instead of receiving a personalized explanation.'],
['The customer did not receive a detailed explanation for the increased ticket cost.', 'The resolution provided was incomplete or generic, leaving the customer unsatisfied.'],
['The customer did not receive specific details about the refund process for their canceled flight.', 'The customer was given a generic timeline without personalized information.'],
['The customer is unable to access their flight booking details due to website issues.', "The customer's problem was not resolved, and the advice provided was ineffective."],
['The customer struggled to reset their password due to unclear instructions in the provided guide.', 'The guide provided for resetting the password was not clear or easy to follow.'],
['The customer was not informed about the reasons for the delay in their refund processing.', 'The customer experienced a delay in the refund processing without any explanation from the support agent.'],
['The customer found the compensation policy document too complicated and lacking clear answers.', "The self-help resources provided were inadequate and did not address the customer's specific needs."],
['The customer was not provided with complete instructions for making large group bookings.', "The resolution provided did not fully address the customer's needs, leaving them with an incomplete solution."],
["The agent provided a standard response without addressing the specific details of the customer's extra seat request.", "The customer's specific request for an extra seat was not understood or addressed by the agent."],
['The customer struggled to navigate the website to change their flight booking.', 'The website instructions for changing a flight booking were unclear and difficult to follow.']]
sentences = [desc for sublist in Sub_Category_Description for desc in sublist]



# Convert Embeddings for use in ML model

In [5]:
embeddings = model.encode(sentences)
print(embeddings.shape)  # (4, 768) or similar

(20, 768)


# Cosine Similarity Test

In [None]:

# Cosine Similarity

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Similarity matrix
sim_matrix = cosine_similarity(embeddings)


# Convert to DataFrame for better readability
sim_df = pd.DataFrame(sim_matrix, index=sentences, columns=sentences)
sim_df = sim_df.round(2)
sim_df.to_csv("sim_df.csv")

# K-Means

In [None]:
# Number of clusters you want (tune as needed)
num_clusters = 4

# Normalize embeddings so KMeans clustering is closer to cosine similarity
normalized_embeddings = normalize(embeddings)




# Initialize and fit KMeans on normalized vectors
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(normalized_embeddings)

# Get cluster labels
labels = kmeans.labels_

# Create a DataFrame with sentences and their cluster labels
clustered_df = pd.DataFrame({
    'Sentence': sentences,
    'Cluster': labels
})

print(clustered_df)

# Optionally save to CSV
clustered_df.to_csv("sentence_clusters.csv", index=False)


k_values = range(2, 10)

inertia_list = []      # Sum of squared distances to closest cluster center (Elbow)
silhouette_list = []   # Silhouette scores for each k

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(normalized_embeddings)
    
    inertia_list.append(kmeans.inertia_)
    
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(normalized_embeddings, labels)
    silhouette_list.append(silhouette_avg)

# Plot Elbow method
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(k_values, inertia_list, 'bo-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (Sum of squared distances)')
plt.title('Elbow Method')

# Plot Silhouette scores
plt.subplot(1, 2, 2)
plt.plot(k_values, silhouette_list, 'ro-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores')

plt.tight_layout()
plt.savefig('cluster_analysis.png')

# Agglomerative clustering


In [None]:


# Calculate cosine distance matrix from embeddings
cosine_dist = cosine_distances(embeddings)  # embeddings not normalized here

# Number of clusters you want
num_clusters = 3

# Agglomerative clustering using precomputed cosine distances
agglo = AgglomerativeClustering(n_clusters=num_clusters,metric = 'precomputed', linkage='average')
labels = agglo.fit_predict(cosine_dist)

# Put sentences and cluster labels in a DataFrame

print(clustered_df)

# Optionally save to CSV
clustered_df.to_csv("sentence_clusters.csv", index=False)


# Bisecting K-Means

In [9]:
normalized_embeddings = normalize(embeddings)
bisecting_kmeans = BisectingKMeans(n_clusters=4, random_state=0)

# Fit the model
bisecting_kmeans.fit(normalized_embeddings)

# Get cluster labels
labels = bisecting_kmeans.labels_

clustered_df = pd.DataFrame({'Sentence': sentences, 'Cluster': labels})

print(clustered_df)
clustered_df.to_csv("bisectingkmeans_sentence_clusters.csv", index=False)

                                             Sentence  Cluster
0   The customer received a generic response that ...        0
1   The customer was directed to a link for guidel...        2
2   The customer did not receive a detailed explan...        0
3   The resolution provided was incomplete or gene...        0
4   The customer did not receive specific details ...        0
5   The customer was given a generic timeline with...        0
6   The customer is unable to access their flight ...        2
7   The customer's problem was not resolved, and t...        0
8   The customer struggled to reset their password...        1
9   The guide provided for resetting the password ...        1
10  The customer was not informed about the reason...        0
11  The customer experienced a delay in the refund...        0
12  The customer found the compensation policy doc...        3
13  The self-help resources provided were inadequa...        3
14  The customer was not provided with complete in...  