In [3]:
import os
from dotenv import load_dotenv
from literalai import LiteralClient
from datetime import datetime
import json
from helpers import *

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Key for the OpenAI client created further down; this is None when the variable is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY")

# LiteralClient is built with no explicit arguments
# (presumably it reads its own credentials from the environment — TODO confirm).
literal_client = LiteralClient()

# Keep only the `.data` payload of the threads response.
# NOTE(review): get_threads() is called with defaults; if the API paginates,
# this may be just the first page — confirm.
threads = literal_client.api.get_threads().data

In [50]:
# Scan every step of every thread and pick out the pipeline stages by step name.
step_question = []   # all "on_message" steps (the user questions), across all threads
# Reset on every run so a re-executed cell never keeps a stale value, and so the
# names exist (as None) even when no matching step is found.
step_context = None  # last "stuff_documents_chain" step seen (retrieved context)
step_answer = None   # last "ChatOpenAI" step seen (model answer)

for thread in threads:
    # `or []` skips threads whose `steps` is None or empty instead of raising.
    for step in thread.steps or []:
        if step.name == "on_message":
            # Question
            step_question.append(step)
        elif step.name == "stuff_documents_chain":
            # Context
            step_context = step
        elif step.name == "ChatOpenAI":
            # Answer
            step_answer = step

# NOTE(review): only the most recent context/answer step is retained, while all
# question steps are collected — confirm this asymmetry is intended.

In [4]:
# Get question list (the 'question' entry of whatever get_questions returns)
question_list = get_questions(threads)['question']

# Preview only the first few entries: some questions are long pasted tracebacks,
# and dumping the whole list bloats the notebook output.
question_list[:5]

['how does Gaussian Mixture Model initialize the means and variance for the clusters?',
 'how do you create an array filled with random numbers using numpy?',
 'What is the syntax for using the .dot method',
 'Can you help me understand an error',
 '  ❌ Test case failed\n    Trying:\n        def test_q3(a1, a2):\n            assert np.allclose(dot_product, 32), f\'Expected dot product {32}, but got {dot_product}\'\n    Expecting nothing\n    ok\n    Trying:\n        test_q3(a1, a2)\n    Expecting nothing\n    **********************************************************************\n    Line 3, in q3 0\n    Failed example:\n        test_q3(a1, a2)\n    Exception raised:\n        Traceback (most recent call last):\n          File "/opt/anaconda3/lib/python3.12/doctest.py", line 1361, in __run\n            exec(compile(example.source, filename, "single",\n          File "", line 1, in \n            test_q3(a1, a2)\n          File "", line 2, in test_q3\n            assert np.allclose(dot_pr

## BERTopic

In [99]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import OpenAI

import openai

  from .autonotebook import tqdm as notebook_tqdm


In [115]:
# UMAP is stochastic; a fixed seed makes the reduced embeddings (and therefore
# the resulting topics) repeatable across runs. Note that umap-learn trades some
# parallelism for this determinism, which is negligible at this data size.
RANDOM_SEED = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(question_list, show_progress_bar=True)

# Step 2 - Reduce dimensionality
# NOTE(review): n_neighbors=2 is a very local neighbourhood — confirm it is
# intentional for this small corpus.
umap_model = UMAP(
    n_neighbors=2,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_SEED,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=2,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics
# (A custom LemmaTokenizer and min_df=2 were tried earlier and are left disabled.)
vectorizer_model = CountVectorizer(
    stop_words="english",   # remove stop words
    ngram_range=(1, 3),     # use uni-, bi- and tri-grams
)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

Batches: 100%|██████████| 2/2 [00:01<00:00,  1.23it/s]


In [106]:
# Step 6 - (Optional) Fine-tune topic representations
# Alternatives left switched off in this notebook: KeyBERTInspired(),
# PartOfSpeech("en_core_web_sm") and MaximalMarginalRelevance(diversity=0.3).

# Prompt template for LLM-generated topic labels; the [DOCUMENTS] and [KEYWORDS]
# placeholders are filled in per topic by BERTopic's OpenAI representation.
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

# OpenAI chat model (gpt-4o-mini) used to label each topic
client = openai.OpenAI(api_key=openai_api_key)
openai_model = OpenAI(
    client,
    model="gpt-4o-mini",
    prompt=prompt,
    chat=True,
    exponential_backoff=True,
)

# The key becomes the name of the extra column in the topic info table.
representation_model = {"OpenAI": openai_model}

In [117]:
# Assemble the BERTopic pipeline from the components configured in the earlier cells
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    nr_topics=5,                    # reduce the discovered topics down to a fixed 5
    calculate_probabilities=True,   # calculate the topic probabilities per document
    verbose=True,
    # top_n_words is left at its default
)

# Train model, reusing the precomputed sentence embeddings
topics, probs = topic_model.fit_transform(question_list, embeddings=embeddings)

# Show topics
topic_model.get_topic_info()

2024-10-04 23:08:21,279 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-04 23:08:24,372 - BERTopic - Dimensionality - Completed ✓
2024-10-04 23:08:24,374 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-04 23:08:24,399 - BERTopic - Cluster - Completed ✓
2024-10-04 23:08:24,402 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 13/13 [00:06<00:00,  2.12it/s]
2024-10-04 23:08:30,635 - BERTopic - Representation - Completed ✓
2024-10-04 23:08:30,636 - BERTopic - Topic reduction - Reducing number of topics
100%|██████████| 5/5 [00:01<00:00,  2.69it/s]
2024-10-04 23:08:32,546 - BERTopic - Topic reduction - Reduced number of topics from 13 to 5


Unnamed: 0,Topic,Count,Name,Representation,OpenAI,Representative_Docs
0,-1,2,-1_need help question_need help_need_question,"[need help question, need help, need, question...",[Help with Assignment Questions],"[I need help with Question 2 in Assignment 2, ..."
1,0,32,0_assignment_help_help question_question,"[assignment, help, help question, question, as...",[Assignment Help Questions Cluster],[Can you help me with question 2 from assignme...
2,1,15,1_model_arima_true_index,"[model, arima, true, index, use, seasonal, mod...",[ARIMA Model Configuration Issues],"[model=SARIMAX(spring_data_diff,order=(1,1,1),..."
3,2,7,2_line_test_q3_file_a2,"[line, test_q3, file, a2, a1, a1 a2, 32, test_...",[Numpy Dot Product Testing],"[What is the syntax for using the .dot method,..."
4,3,3,3_72 571_72_571_aic,"[72 571, 72, 571, aic, aic 72, aic 72 571, 002...",[AIC and Sigma2 Analysis],"[AIC= 72.571, AIC= -72.571, coef std err ..."


## Sentiment Analysis

In [5]:
# Per-question sentiment using a helper whose implementation is not shown in this
# notebook (presumably from `helpers` — confirm). The recorded output is a table
# with Question, Polarity and Subjectivity columns.
analyze_sentiment(question_list)

Unnamed: 0,Question,Polarity,Subjectivity
0,how does Gaussian Mixture Model initialize the...,0.0,0.0
1,how do you create an array filled with random ...,-0.05,0.7
2,What is the syntax for using the .dot method,0.0,0.0
3,Can you help me understand an error,0.0,0.0
4,❌ Test case failed\n Trying:\n def...,0.066234,0.357359
5,what is the syntax for np.matmul,0.0,0.0
6,"def k_means(X,centroids,max_iterations=1):\n ...",0.0,0.0
7,What does mean(axis=0) do?,0.0,0.0
8,What is this course about? I'm not sure I'm in...,0.017857,0.712302
9,Are there assignments out?,0.0,0.0


## Keyword Extraction

In [6]:
# Keyword frequency count over all questions, via a helper not defined in this
# notebook (presumably from `helpers` — confirm). The recorded output prints every
# keyword with its count.
# NOTE(review): the output contains tokens such as "wa" and "doe", which look like
# over-stemmed "was"/"does" — check the helper's normalisation step.
extract_keywords(question_list)

Most common keywords and their frequencies:
assignment: 24
help: 16
question: 15
model: 10
arima: 7
line: 6
file: 6
data: 6
index: 6
use: 6
got: 5
x: 5
supported: 5
a1: 4
a2: 4
class: 4
need: 4
1a: 4
using: 3
best: 3
give: 3
differencing: 3
unsupported: 3
wa: 3
provided: 3
result: 3
forecast: 3
generated: 3
forecasting: 3
one: 3
due: 3
doe: 2
mean: 2
syntax: 2
error: 2
case: 2
failed: 2
trying: 2
def: 2
assert: 2
dot: 2
product: 2
expecting: 2
nothing: 2
isclose: 2
input: 2
type: 2
centroid: 2
range: 2
print: 2
course: 2
implement: 2
python: 2
code: 2
find: 2
p: 2
spring: 2
value: 2
pmdarima: 2
seasonal: 2
see: 2
import: 2
assignmen1: 2
1b: 2
gaussian: 1
mixture: 1
initialize: 1
variance: 1
cluster: 1
create: 1
array: 1
filled: 1
random: 1
number: 1
numpy: 1
method: 1
understand: 1
test: 1
ok: 1
q3: 1
example: 1
exception: 1
raised: 1
traceback: 1
recent: 1
call: 1
last: 1
exec: 1
compile: 1
filename: 1
single: 1
allclose: 1
b: 1
xfin: 1
isfinite: 1
typeerror: 1
ufunc: 1
could: 1
safel

## Question Complexity

In [8]:
# Readability metrics per question, via a helper not defined in this notebook
# (presumably from `helpers` — confirm). The recorded output has Flesch Reading
# Ease, Gunning Fog Index and Avg Syllables columns.
# NOTE(review): pasted code and tracebacks produce out-of-range scores (e.g. a
# Flesch score of -104.64), so treat those rows with caution.
analyze_complexity(question_list)

Unnamed: 0,Question,Flesch Reading Ease,Gunning Fog Index,Avg Syllables
0,how does Gaussian Mixture Model initialize the...,66.74,8.28,1.538462
1,how do you create an array filled with random ...,76.22,4.8,1.416667
2,What is the syntax for using the .dot method,96.18,3.6,1.222222
3,Can you help me understand an error,81.29,2.8,1.428571
4,❌ Test case failed\n Trying:\n def...,45.86,11.51,1.614173
5,what is the syntax for np.matmul,90.77,2.4,1.333333
6,"def k_means(X,centroids,max_iterations=1):\n ...",-104.64,16.05,3.592593
7,What does mean(axis=0) do?,92.8,1.6,1.25
8,What is this course about? I'm not sure I'm in...,115.64,2.6,1.0
9,Are there assignments out?,75.88,11.6,1.5
