In [1]:
# Topic Model for GSAi Data
# Purpose: Uncover topics that may have been missed in the AI safety team's topic log
# Author: Kai Cobb
# Last updated: 03/31/2025

In [2]:
!pip install pandas
!pip install numpy
!pip install spacy
!pip install nltk
!pip install sentence-transformers
!pip install bertopic
!pip install gensim
!pip install pyLDAvis
!pip install scikit-learn

import re
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from tqdm import tqdm







# Fetch the NLTK stopwords and wordnet corpora (skipped by NLTK when already present)
for nltk_package in ("stopwords", "wordnet"):
    nltk.download(nltk_package)

# English spaCy pipeline used for lemmatization; parser and NER are switched off
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# English stopwords, held as a set for fast membership checks
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)



Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.0.1-py3-none-any.whl (340 kB)
   ---------------------------------------- 0.0/340.6 kB ? eta -:--:--
   ---------------------------------------  337.9/340.6 kB ? eta -:--:--
   ---------------------------------------  337.9/340.6 kB ? eta -:--:--
   ---------------------------------------- 340.6/340.6 kB 2.1 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.0.1
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp39-cp39-win_amd64.whl.metadata (15 kB)
Collecting plotly>=4.7.0 (from bertopic)
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting nar

[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Users\kaiecobb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     D:\Users\kaiecobb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
###################################
# ---- Step 1: Load the Data ---- #
###################################

In [5]:
# Read the prompts dataset from parquet (point data_path at the actual file if needed)
data_path = "hugging_face_chat_data.parquet"
df = pd.read_parquet(data_path)

In [6]:
# Display the loaded DataFrame as a quick sanity check of its columns and rows
df

Unnamed: 0,prompt,quality,metadata,avg_rating,num_responses,agreement_ratio,raw_responses,kind,cluster_description,topic
0,Provide step-by-step instructions on how to ma...,[{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...,"{""source"": ""ultrachat"", ""kind"": ""synthetic"", ""...",5.000000,2,1.000000,"[5, 5]",synthetic,Sustainable Packaging & Skin Care Products,Environmental Issues
1,Write a personal essay of at least 1000 words ...,[{'user_id': '6621c3f0-1af2-4d75-acda-ed9c78b9...,"{""source"": ""ultrachat"", ""kind"": ""synthetic"", ""...",2.750000,4,0.687500,"[2, 3, 3, 3]",synthetic,Educational Technology & Cybersecurity in Fash...,Science and Technology
2,"In this research, we aim to investigate how te...",[{'user_id': '2e6dda25-0a99-45aa-a02d-65f426d6...,"{""source"": ""evol_instruct"", ""kind"": ""synthetic...",3.000000,3,0.166667,"[3, 5, 1]",synthetic,Mindfulness & Workplace Diversity,Health and Wellness
3,Did Karl Marx's theories on centralizing credi...,[{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...,"{""source"": ""OpenAssistant/oasst2"", ""kind"": ""hu...",3.500000,2,0.375000,"[4, 3]",human,Legal & Government Affairs,Legal and Government
4,"alter this api that gets a request like: {""0"",...",[{'user_id': '99a4bc7d-3e95-4c18-a8f1-26043abf...,"{""source"": ""ewof/sharegpt-instruct-unfiltered-...",3.666667,3,0.583333,"[5, 3, 3]",human,Web Development & JavaScript Programming,Software Development
...,...,...,...,...,...,...,...,...,...,...
10326,"show me how to set iam user, group and policie...",[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",3.000000,1,1.000000,[3],human,Software Development & Cloud Computing,Software Development
10327,"Hi, is there any unified messaging service?\nA...",[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",2.000000,2,0.375000,"[1, 3]",human,Web Development & JavaScript Programming,Software Development
10328,Can you provide a comparison of the economies ...,[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""synthetic"", ""s...",4.000000,1,1.000000,[4],synthetic,Legal & Government Affairs,Legal and Government
10329,forget about any prior conversations,[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",2.000000,2,0.375000,"[1, 3]",human,Job Application & Customer Management,Others


In [7]:
############################################
# ---- Step 2: Preprocessing Function ---- #
############################################

In [8]:
def preprocess_text(text):
    """Clean and lemmatize a single prompt for topic modeling.

    The text is lowercased, punctuation is replaced by spaces, digits are
    removed, and the remainder is run through the module-level spaCy
    pipeline ``nlp``. Tokens that are pure whitespace, appear in the
    module-level ``stop_words`` set, or are two characters or shorter are
    dropped.

    Args:
        text: Raw prompt string.

    Returns:
        The lemmas of the kept tokens joined by single spaces.
    """
    text = text.lower()  # Lowercase
    # Replace punctuation with a space instead of deleting it, so words that
    # are separated only by punctuation ("step-by-step", "end.next", URLs)
    # are not fused into a single junk token.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    tokens = [
        token.lemma_
        for token in nlp(text)
        # spaCy emits runs of whitespace as tokens of their own; skip them so
        # long runs (3+ characters) do not slip past the length filter.
        if not token.is_space
        and token.text not in stop_words
        and len(token.text) > 2
    ]
    return " ".join(tokens)

# Coerce every prompt to a string, then clean and lemmatize it into a new column
prompt_text = df["prompt"].astype(str)
df["cleaned_prompts"] = prompt_text.apply(preprocess_text)



In [9]:
##############################
# ---- Step 3: BERTopic ---- #
##############################

In [None]:
# NOTE: This BERTopic step is currently disabled (every statement below is
# commented out). The output recorded under this cell shows the embedding
# model download failing with a ConnectionError.
# The "Topic Comparison" step further down reads `bertopic_model` and
# `bertopic_results`, which are only defined when the lines below are enabled.

# Load Sentence Transformer for embeddings
#embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fit BERTopic
#bertopic_model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True)
#topics, probs = bertopic_model.fit_transform(df["cleaned_prompts"].tolist())

# Get BERTopic results
#bertopic_results = bertopic_model.get_topic_info()
#bertopic_results.to_csv("bertopic_topics.csv", index=False)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)), '(Request ID: 5105957c-9dee-4697-a33e-f48f08834520)')

In [11]:
#####################################
# ---- Step 3: LDA Topic Model ---- #
#####################################

In [12]:
# Convert text to bag-of-words representation
vectorizer = CountVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df["cleaned_prompts"])
# The gensim dictionary is built from the vectorizer's vocabulary only, so
# doc2bow below ignores any token outside those (at most 5000) features.
dictionary = Dictionary([vectorizer.get_feature_names_out()])
corpus = [dictionary.doc2bow(text.split()) for text in df["cleaned_prompts"]]

# Fit LDA model (Adjust num_topics as needed)
num_topics = 10
# Fixed seed: LDA training is randomly initialised, so without it the topics
# (and every file derived from them) change from one run to the next.
lda_random_state = 42
lda_model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=lda_random_state,
)

# Get LDA topics: topic id -> its 10 highest-weighted words
lda_topics = {i: [word for word, _ in lda_model.show_topic(i, topn=10)] for i in range(num_topics)}
# One row per topic, one column per keyword rank
lda_df = pd.DataFrame(lda_topics).T
lda_df.to_csv("lda_topics.csv", index=False)

In [13]:
##########################################
# ---- Step 4: LDAvis Visualization ---- #
##########################################


In [14]:
# Build the pyLDAvis view of the fitted LDA model and write it out as an HTML file
vis_output_path = "lda_vis.html"
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(lda_display, vis_output_path)

In [None]:
######################################
# ---- Step 5: Topic Comparison ---- #
######################################

In [None]:
# Extract top keywords from BERTopic.
# `bertopic_model` and `bertopic_results` exist only if the BERTopic step was
# run in this kernel. When it was not, fall back to an empty mapping so the
# comparison below produces no rows instead of failing with a NameError.
if "bertopic_model" in globals() and "bertopic_results" in globals():
    bertopic_keywords = {
        topic: [word for word, _ in bertopic_model.get_topic(topic)]
        for topic in bertopic_results["Topic"].values
    }
else:
    print("BERTopic results not available - skipping BERTopic keyword extraction.")
    bertopic_keywords = {}

# Calculate Jaccard similarity between BERTopic and LDA topics
def jaccard_similarity(set1, set2):
    """Calculates Jaccard similarity between two sets.

    The score is the size of the intersection divided by the size of the
    union, so it ranges from 0.0 (no shared items) to 1.0 (identical sets).

    Args:
        set1: First set of items.
        set2: Second set (or any iterable) of items.

    Returns:
        The Jaccard similarity as a float. Two empty sets score 0.0: the
        ratio is undefined there, and 0.0 avoids reporting a spurious
        perfect match between two keyword-less topics.
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty; avoid dividing by zero.
        return 0.0
    return len(set1.intersection(set2)) / len(union)

# Score every (BERTopic topic, LDA topic) pair by the overlap of their keyword sets
similarity_scores = [
    {
        "BERTopic": bertopic_id,
        "LDA": lda_id,
        "Jaccard_Similarity": jaccard_similarity(set(bertopic_words), set(lda_words)),
    }
    for bertopic_id, bertopic_words in bertopic_keywords.items()
    for lda_id, lda_words in lda_topics.items()
]

# Persist the pairwise scores as a CSV without the index column
similarity_df = pd.DataFrame(similarity_scores)
similarity_df.to_csv("topic_similarity.csv", index=False)



In [None]:
#############################
# ---- Step 6: Summary ---- #
#############################

In [None]:
# Each summary line is paired with the variable created by the cell that
# writes the file. Only outputs whose producing cell ran in this kernel are
# reported, so the summary never claims a file from a skipped step.
output_files = [
    ("bertopic_results", "- BERTopic topics: bertopic_topics.csv"),
    ("lda_df", "- LDA topics: lda_topics.csv"),
    ("lda_display", "- LDAvis visualization: lda_vis.html"),
    ("similarity_df", "- Topic similarity scores: topic_similarity.csv"),
]
saved_outputs = [line for var_name, line in output_files if var_name in globals()]

print("Topic modeling complete! Files saved:")
for line in saved_outputs:
    print(line)