In [7]:
import os
# First we change the directory to the root directory of the project.
# Assumes the kernel starts in <project root>/notebooks/clustering. The suffix is
# built with the platform's path separator so the same cell works on Windows,
# Linux and macOS (a hard-coded "\\" separator only ever matches Windows paths).
# Re-running the cell is harmless: once the suffix is gone nothing is replaced.
notebook_subdir = os.sep + os.path.join("notebooks", "clustering")
os.chdir(os.getcwd().replace(notebook_subdir, ""))
import pandas as pd
import pickle

In [16]:
from bertopic import BERTopic
from umap import UMAP
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans, AgglomerativeClustering
import pyLDAvis
from hdbscan import HDBSCAN
import pyLDAvis.lda_model as lda
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from keyphrase_vectorizers import KeyphraseCountVectorizer
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Load the pickled question/answer examples (path is relative to the working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only use
# it on a trusted, project-generated file.
with open("data/ShotDB/examplesQA.pkl", "rb") as examples_file:
    examples = pickle.load(examples_file)
# Preview a few records instead of dumping the whole list into the notebook output.
examples[:3]

[{'input': 'I retook the project report, and I noticed that you assigned a grade for it. However, on the student portal, Project 1.1 is still listed as a failed course. Could you please verify if everything is in order?',
  'output': 'I’m waiting for all examiners to submit the results of the repair assignments. Then I will process everything simultaneously.'},
 {'input': 'I am reaching out to inquire about the process concerning my academic credits, as I received a notification from the school indicating that my credit is lower than expected for the first semester. I have already contacted the school via email to explain my situation, and their response mentioned that I should consider any changes in credits on the student portal as an indication that the email has not been sent. Could you please provide me with an update on the current status of the process?',
  'output': 'I’m sorry but I cannot help you with this. I refer to the student advisors, in CC.'},
 {'input': 'I think there 

In [11]:
# Keep only the question text (the "input" field) of every example.
docs = [qa_pair["input"] for qa_pair in examples]

In [12]:
# Download the NLTK stop-word corpus (a no-op when it is already up to date),
# then take the English stop-word list.
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dika1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Embedding model used by the topic pipeline. No device is forced here:
# SentenceTransformer selects a GPU when one is available and otherwise falls
# back to CPU, so the notebook also runs on machines without CUDA.
sentence_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

In [35]:
# Dimensionality reduction applied to the embeddings before clustering.
# UMAP is stochastic: without a fixed seed every fresh run can produce different
# clusters and topics, so random_state is pinned to make the notebook reproducible.
umap_model = UMAP(
    n_neighbors=15,
    n_components=20,
    min_dist=0.1,
    spread=1,
    metric='cosine',
    random_state=42,
)

In [36]:
# Clusterer passed to BERTopic as `hdbscan_model`; it runs on the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=2,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
# Alternative clusterers kept for reference (currently disabled):
# cluster_model = KMeans(n_clusters=20, random_state=42)
# cluster_model = AgglomerativeClustering(n_clusters=30, linkage='ward')

In [18]:
# Vectorizer handed to BERTopic as `vectorizer_model`, configured with the NLTK stop words.
vectorizer_model = KeyphraseCountVectorizer(
    stop_words=stop_words,
)

In [19]:
# c-TF-IDF transformer created with the library's default settings.
ctfidf_model = ClassTfidfTransformer()

In [47]:
import openai
import tiktoken
from bertopic.representation import OpenAI
# Load local environment variables and report whether loading succeeded.
from dotenv import load_dotenv
dotenv_loaded = load_dotenv()
print("Environment variables are loaded = ", dotenv_loaded)

Environment variables are loaded =  True


In [48]:
# One name for the chat model so the tokenizer and the representation model
# are always configured for the same model.
openai_chat_model = "gpt-3.5-turbo-0125"

# Token encoding that tiktoken associates with the chat model.
tokenizer = tiktoken.encoding_for_model(openai_chat_model)

# OpenAI client authenticated with the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Create your representation model
representation_options = dict(
    model=openai_chat_model,
    delay_in_seconds=2,
    chat=True,
    nr_docs=4,
    doc_length=100,
    tokenizer=tokenizer,
)
representation_model = OpenAI(client, **representation_options)


In [55]:
# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=sentence_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # Pass the c-TF-IDF transformer explicitly so the object configured in this
    # notebook is the one the pipeline uses (it was created but never wired in).
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=5,
    verbose=True,
)

In [56]:
# Train model: fit the BERTopic pipeline on the question texts in `docs`.
# The two values returned by fit_transform are unpacked as `topics` and `probs`.
topics, probs = topic_model.fit_transform(docs)

2024-05-23 18:23:58,813 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2024-05-23 18:23:59,894 - BERTopic - Embedding - Completed ✓
2024-05-23 18:23:59,895 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-23 18:24:01,696 - BERTopic - Dimensionality - Completed ✓
2024-05-23 18:24:01,696 - BERTopic - Cluster - Start clustering the reduced embeddings

`alltrue` is deprecated as of NumPy 1.25.0, and will be removed in NumPy 2.0. Please use `all` instead.

2024-05-23 18:24:01,699 - BERTopic - Cluster - Completed ✓
2024-05-23 18:24:01,700 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 6/6 [00:16<00:00,  2.78s/it]
2024-05-23 18:24:19,094 - BERTopic - Representation - Completed ✓


In [51]:
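# Optional (currently disabled): fine-tune the topic representations after training BERTopic.
# Uncomment the two lines below to recompute them with a fresh vectorizer.
# vectorizer_model = KeyphraseCountVectorizer(stop_words=stop_words)
# topic_model.update_topics(docs, vectorizer_model=vectorizer_model)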

In [57]:
# Display the per-topic summary table (topic id, count, name, representation, representative docs).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1,-1_Medical Issue Impact on Project Meeting Att...,[Medical Issue Impact on Project Meeting Atten...,[So to clarify even with my undergoing medical...
1,0,20,0_Grading and Skill Classes Queries,[Grading and Skill Classes Queries],[I know that it shows I have missed skills cla...
2,1,10,1_Health-related challenges affecting particip...,[Health-related challenges affecting participa...,[I was on the phone with emergency and ER GPs ...
3,2,10,2_Meeting Attendance Difficulties Due to Unfor...,[Meeting Attendance Difficulties Due to Unfore...,[I am reaching out to discuss my upcoming retu...
4,3,9,3_Attendance and Participation Issues in Phase...,[Attendance and Participation Issues in Phase ...,[From the beginning of the project the actual ...
5,4,7,4_Project Opening Attendance and Participation...,[Project Opening Attendance and Participation ...,[I wanted to ask a question regarding the Proj...


In [58]:
# Topic heatmap; displayed inline as the cell's last expression.
heatmap_fig = topic_model.visualize_heatmap()
# heatmap_fig.write_html("matrix.html")
heatmap_fig

In [59]:
# Topic map; displayed inline as the cell's last expression.
topics_fig = topic_model.visualize_topics()
# topics_fig.write_html("map.html")
topics_fig

In [60]:
# Document-level plot of the clustered questions; `fig` is shown inline.
fig = topic_model.visualize_documents(
    docs,
    title="Question Clustering",
)
fig

In [61]:
# Save the figure currently bound to `fig` as an HTML file (path relative to the working directory).
fig.write_html("clustering.html")

In [46]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the document-term matrix for the question texts.
tf_vectorizer = KeyphraseCountVectorizer(stop_words=stop_words)
dtm_tf = tf_vectorizer.fit_transform(docs)

# Fit LDA with a fixed seed; fit() returns the estimator, so it can be chained.
lda_tf = LatentDirichletAllocation(
    n_components=15,
    random_state=0,
    max_iter=1000,
).fit(dtm_tf)

# Prepare the pyLDAvis view of the fitted model and display it.
fig = lda.prepare(lda_tf, dtm_tf, tf_vectorizer)
fig