In [None]:
# Topic Model for GSAi Prompt and Response Data using data from 3/14-3/31
# Purpose: Build a topic model that will help us uncover topics in both user prompts and assistant responses that may have been missed in the AI safety team's topic log
# MOD: This is a Convo version meaning the user and assistant prompts are consolidated by the ID
# MOD: This also applies bigram/trigram function to topic model
# Using only LDAvis, 10 topics 
# Author: Kai Cobb
# Last updated: 04/07/2025 

In [None]:

!pip install pandas numpy spacy nltk sentence-transformers bertopic gensim pyLDAvis scikit-learn

import re
import pandas as pd
import numpy as np
import spacy
import nltk
import json
import gensim
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from collections import Counter
from wordcloud import WordCloud
from nltk.util import ngrams
import datetime

# Fetch the NLTK resources used below (nltk.download skips data that is already up to date).
nltk.download('stopwords')
# NOTE(review): 'wordnet' is not referenced anywhere else in the visible notebook —
# lemmatization is done with spaCy — confirm whether this download is still needed.
nltk.download('wordnet')

# Load the small English spaCy pipeline for lemmatization; the parser and NER
# components are disabled because only token lemmas are read from it.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# English stopword list as a set, for fast membership tests in preprocess_text
stop_words = set(stopwords.words("english"))
import warnings
# Silence DeprecationWarning messages for the rest of the session
warnings.filterwarnings("ignore", category=DeprecationWarning)

#IMPORT TQDM FOR RUNNING PROGRESS#
from tqdm import tqdm

In [None]:

##################################
# ---- Step 1:Load the Data ---- #
##################################

# Load prompts dataset (replace DATA_PATH with the actual file path if needed)
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the repository so the notebook runs on other machines.
DATA_PATH = r"D:\Users\kaiecobb\Documents\GitHub\NLP4Survey\Customer-Data-Survey-Analysis\Notebooks\GSAi Topic Model\MSG-P-AI-datadog-20250403.xlsx"

# Import the dataset
df = pd.read_excel(DATA_PATH)

# Overview of dataset
# df.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df.info()             # Column types and missing values
print(df.describe())  # Summary statistics

# Preview the first few rows only (the cell's last expression is what gets
# displayed); avoids dumping the full table of user messages into the output.
df.head()

In [None]:

# Standardise the message column name. Re-binding instead of inplace=True keeps
# the cell safe to re-run; rename is a no-op when 'content' is absent.
df = df.rename(columns={'content': 'prompt'})

# Concatenate user and assistant content in conversation order:
# sort by conversation id, then by timestamp within each conversation.
df = df.sort_values(by=["id", "timestamp"])

# Group by id and join the user + assistant messages into one document per
# conversation. Rows with a missing message are skipped first: astype(str)
# would otherwise turn them into the literal word "nan", which then survives
# cleaning and pollutes the vocabulary.
convo_df = (
    df.dropna(subset=["prompt"])
    .groupby("id")["prompt"]
    .apply(lambda messages: " ".join(messages.astype(str)))
    .reset_index()
)

convo_df

In [None]:
############################################
# ---- Step 2: EDA and Pre-processing ---- #
############################################

In [None]:
# Clean and Preprocess Text (LDA & BERTopic Compatible)
def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    Steps: lowercase, replace every non-letter character with a space,
    collapse whitespace, run the spaCy pipeline once, then keep the lemma of
    each token whose surface text is not a stopword and is longer than two
    characters.

    Args:
        text: Raw text. Missing values (None/NaN) are accepted; other
            non-string values are converted with str().

    Returns:
        The cleaned text, or "" for missing input.
    """
    if pd.isnull(text):  # Handle missing values
        return ""

    text = str(text).lower()
    # Replace punctuation and digits with a space rather than deleting them,
    # so words separated only by punctuation ("end.start", "a/b") are not
    # glued into a single junk token.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse whitespace AFTER stripping characters, otherwise the removal
    # step re-introduces runs of spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # Run the spaCy pipeline once (previously it was invoked twice per text and
    # the first result was discarded).
    tokens = [
        token.lemma_
        for token in nlp(text)
        if token.text not in stop_words and len(token.text) > 2
    ]
    return " ".join(tokens)

# Apply the preprocess function to every conversation
convo_df["cleaned_prompts"] = convo_df["prompt"].astype(str).apply(preprocess_text)

# Remove rows that are empty after cleaning, then exact duplicate documents.
# reset_index(drop=True) gives a fresh frame with a contiguous 0..n-1 index:
#   * row labels then equal row positions, which is what the positional
#     Document_ID assigned to the topic matrix later relies on;
#   * later column assignments on convo_df no longer act on a filtered slice
#     (avoids SettingWithCopyWarning).
convo_df = (
    convo_df[convo_df["cleaned_prompts"].str.strip() != ""]
    .drop_duplicates(subset=["cleaned_prompts"])
    .reset_index(drop=True)
)

# Check cleaned prompts
convo_df["cleaned_prompts"].head()

In [None]:

# Word Frequency Analysis

# Flatten every cleaned conversation into one list of tokens and count them
all_words = " ".join(convo_df["cleaned_prompts"]).split()
word_freq = Counter(all_words)

# 20 most common words
top_words = word_freq.most_common(20)
print(top_words)

# Word cloud built from the same token stream
cloud_text = " ".join(all_words)
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(cloud_text)

fig, ax = plt.subplots(figsize=(10,5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()


In [None]:

# Bi-gram & Tri-gram Analysis
from gensim.models.phrases import Phrases, Phraser

# Function to generate n-grams
def get_ngrams(texts, n=2, top_n=20):
    """Return the `top_n` most frequent n-grams over whitespace-split texts.

    Each text is split on whitespace and its consecutive n-token tuples are
    tallied; the result is a list of (ngram_tuple, count) pairs, most frequent
    first.
    """
    tally = Counter()
    for document in texts:
        tally.update(ngrams(document.split(), n))
    return tally.most_common(top_n)


# Print the most frequent bi-grams and tri-grams of the cleaned conversations
for heading, size in (("Top Bigrams:", 2), ("Top Trigrams:", 3)):
    print(heading, get_ngrams(convo_df["cleaned_prompts"], size))

In [None]:
# Sanity checks on the cleaned data before modelling

# Missing values per column
missing_per_column = convo_df.isnull().sum()
print(missing_per_column)

# Duplicate cleaned prompts that remain after cleaning
remaining_duplicates = convo_df.duplicated(subset=["cleaned_prompts"]).sum()
print("Duplicates:", remaining_duplicates)

# Length of each document in whitespace-separated tokens, and its distribution
convo_df["text_length"] = convo_df["cleaned_prompts"].map(lambda cleaned: len(cleaned.split()))
convo_df["text_length"].describe()

In [None]:
# Formatting Processed Text for LDAvis


def generate_bigrams_trigrams(texts, min_count=5, threshold=10):
    """Tokenise texts on whitespace and merge frequent phrases into single tokens.

    A bigram Phrases model is trained on the tokenised texts, then a second
    Phrases model is trained on the bigram-transformed texts so that
    three-word phrases can be merged as well.

    Args:
        texts: Iterable of whitespace-separable strings.
        min_count: Passed as `min_count` to both Phrases models.
        threshold: Passed as `threshold` to both Phrases models.

    Returns:
        One token list per input text, after applying the bigram model and
        then the trigram model.
    """
    tokenized_texts = [text.split() for text in texts]

    # Build the bigram and trigram models. min_count is now forwarded to the
    # trigram model too; previously it silently fell back to the Phrases
    # default, so a caller-supplied min_count only affected bigrams.
    bigram = Phrases(tokenized_texts, min_count=min_count, threshold=threshold)
    trigram = Phrases(bigram[tokenized_texts], min_count=min_count, threshold=threshold)

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    # Apply the models: bigrams first, then trigrams on top of them
    return [trigram_mod[bigram_mod[doc]] for doc in tokenized_texts]

# Tokenize the cleaned conversations and merge frequent bigrams/trigrams
tokenized_texts = generate_bigrams_trigrams(convo_df["cleaned_prompts"])

# Create dictionary (token <-> integer id mapping)
dictionary = Dictionary(tokenized_texts)

# Filter extremes using the no_below / no_above thresholds
dictionary.filter_extremes(no_below=5, no_above=0.7)

# Convert to corpus: one bag-of-words per document, in the same order as convo_df
corpus = list(map(dictionary.doc2bow, tokenized_texts))

In [None]:
# Function to generate and save an EDA report


def generate_eda_report(convo_df, text_column="cleaned_prompts", output_path="eda_report_GSAi_Topic_Model_10_Topics_Convo_v2.json"):
    """Write a JSON report of basic text statistics for `text_column`.

    The report contains a dataset summary (row count, missing values,
    duplicates, average/min/max token length), the 20 most common words, and
    the top bigrams and trigrams as returned by `get_ngrams`.

    Args:
        convo_df: DataFrame holding the text column.
        text_column: Name of the column with whitespace-separable text.
        output_path: Destination of the JSON report (overwritten if present).
    """
    report = {}

    texts = convo_df[text_column]
    # Missing values are counted in the summary but excluded from the text
    # statistics; previously a single NaN made the length computation crash.
    present = texts.dropna().astype(str)

    # Token counts are computed once and reused for mean/min/max.
    lengths = present.apply(lambda x: len(x.split()))
    # With no text at all, mean/min/max are NaN and int(NaN) raises; report 0.
    has_text = not lengths.empty

    # Dataset Overview
    report["Dataset Summary"] = {
        "Total Rows": int(len(convo_df)),
        "Missing Values": int(texts.isnull().sum()),
        "Duplicate Entries": int(convo_df.duplicated(subset=[text_column]).sum()),
        "Average Text Length": float(lengths.mean()) if has_text else 0.0,
        "Min Text Length": int(lengths.min()) if has_text else 0,
        "Max Text Length": int(lengths.max()) if has_text else 0,
    }

    # Word Frequency Analysis
    all_words = " ".join(present).split()
    word_freq = Counter(all_words)
    report["Top Words"] = [(word, int(freq)) for word, freq in word_freq.most_common(20)]

    # N-gram Analysis (n-gram tuples are stringified so they are valid JSON values)
    report["Top Bigrams"] = [(str(ngram), int(freq)) for ngram, freq in get_ngrams(present, 2)]
    report["Top Trigrams"] = [(str(ngram), int(freq)) for ngram, freq in get_ngrams(present, 3)]

    # Save report as JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4)

    print(f"EDA report saved to {output_path}")

# Run EDA and save results (uses the function's default text column and output path)
generate_eda_report(convo_df)

In [None]:

#################################
# ---- Step 3: Topic Model ---- #
#################################

In [None]:


# Define a Logging Function for tracking LDAvis and BERTopic results
import datetime

def log_results(model_name, parameters, topics, coherence_score, output_path="model_results__GSAi_Topic_Model_10_Topics_Convo_v2.json"):
    """Append one model run to a JSON log file.

    The file holds a JSON list of entries; it is created if it does not exist.

    Args:
        model_name: Label stored under "model".
        parameters: JSON-serialisable description of the model parameters.
        topics: JSON-serialisable description of the topics.
        coherence_score: Score stored under "coherence_score".
        output_path: Path of the JSON log file.

    Raises:
        TypeError: If any value is not JSON-serialisable. The existing log
            file is left untouched in that case.
    """
    log_entry = {
        "timestamp": str(datetime.datetime.now()),
        "model": model_name,
        "parameters": parameters,
        "topics": topics,
        "coherence_score": coherence_score,
    }

    # Load existing results if file exists
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            logs = json.load(f)
    except FileNotFoundError:
        logs = []

    logs.append(log_entry)

    # Serialise BEFORE opening the file for writing: opening with "w" truncates
    # it, so a serialisation error raised while the file is open would wipe all
    # previously logged runs.
    serialized = json.dumps(logs, indent=4)

    # Save updated results
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"Results logged to {output_path}")

In [None]:

###################
# LDA Topic Model #
###################

In [None]:

from itertools import product

# Function to train and evaluate LDA with different parameters
def tune_lda(dictionary, corpus, texts, num_topics_list, alpha_list, beta_list):
    """Grid-search LDA hyper-parameters and keep the most coherent model.

    One LdaModel is trained for every combination of num_topics, alpha and
    beta (passed as `eta`), and each is scored with c_v coherence.

    Args:
        dictionary: gensim Dictionary used as `id2word`.
        corpus: Bag-of-words corpus to train on.
        texts: Tokenised texts used by the coherence model.
        num_topics_list: Candidate topic counts.
        alpha_list: Candidate alpha values.
        beta_list: Candidate beta (eta) values.

    Returns:
        (best_model, results): the model with the highest coherence (None only
        when there are no parameter combinations) and a list of
        (num_topics, alpha, beta, coherence_score) tuples in evaluation order.
    """
    best_model = None
    # Start below any possible score. Starting at 0 meant that a run in which
    # every score was <= 0 or NaN returned None and broke the later save step.
    best_coherence = float("-inf")
    results = []

    # Create a list of all parameter combinations
    param_combinations = list(product(num_topics_list, alpha_list, beta_list))

    # Wrap the param combinations in tqdm
    for num_topics, alpha, beta in tqdm(param_combinations, desc="Tuning LDA Models", ncols=100):
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=42,
            update_every=1,
            chunksize=100,
            passes=10,
            alpha=alpha,
            eta=beta
        )

        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model_lda.get_coherence()

        tqdm.write(f"Topics={num_topics}, Alpha={alpha}, Beta={beta}, Coherence={coherence_score:.4f}")

        results.append((num_topics, alpha, beta, coherence_score))

        # NaN never compares greater than anything; rank it as the worst score
        # (x == x is False only for NaN) so it cannot block later candidates.
        rank_score = coherence_score if coherence_score == coherence_score else float("-inf")

        # Track best model (first one wins on ties)
        if best_model is None or rank_score > best_coherence:
            best_model = lda_model
            best_coherence = rank_score

    return best_model, results

# Hyper-parameter grid searched by tune_lda
num_topics_list = [10]  # 10 as the intended number of topics
alpha_list = [          # candidate alpha values
    'symmetric',
    'asymmetric',
    0.01,
    0.1,
    0.5,
]
beta_list = [           # candidate beta values
    'symmetric',
    0.01,
    0.1,
    0.5,
]

# Train one model per combination; keep the best model and the full score table
best_lda, lda_results = tune_lda(
    dictionary=dictionary,
    corpus=corpus,
    texts=tokenized_texts,
    num_topics_list=num_topics_list,
    alpha_list=alpha_list,
    beta_list=beta_list,
)

In [None]:
# Record and analyze results:
# tabulate every evaluated combination, best coherence score first
result_columns = ["Num Topics", "Alpha", "Beta", "Coherence Score"]
lda_results_df = (
    pd.DataFrame(lda_results, columns=result_columns)
    .sort_values(by="Coherence Score", ascending=False)
)

# Display top 5 results
print(lda_results_df.head())

In [None]:

# Output locations for the persisted model artefacts
lda_model_path = "best_lda_model__GSAi_Topic_Model_10_Topics_Convo_v2.model"
lda_dictionary_path = "lda_dictionary.dict"
lda_corpus_path = "lda_corpus.mm"

# Save the best LDA model
best_lda.save(lda_model_path)

# Save the dictionary
dictionary.save(lda_dictionary_path)

# Save the bag-of-words corpus in Matrix Market format
gensim.corpora.MmCorpus.serialize(lda_corpus_path, corpus)

In [None]:

########################################
### LDAvis Topic Model Visualization ###
########################################

In [None]:

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the LDA visualization from the best model returned by the tuning step.
# sort_topics=False keeps the topics in the model's own order instead of
# re-sorting them by topic proportion.
lda_display = gensimvis.prepare(best_lda, corpus, dictionary, sort_topics=False)

# Save a standalone HTML copy first
pyLDAvis.save_html(lda_display, "lda_topics_visualization__GSAi_Topic_Model_10_Topics_Convo_v2.html")
print("LDAvis HTML saved. Open 'lda_topics_visualization__GSAi_Topic_Model_10_Topics_Convo_v2.html' to explore topics.")

# Show the LDA visualization inline. A notebook only renders the value of the
# cell's LAST expression, so this call has to come last; in the middle of the
# cell its return value was discarded and nothing was shown.
pyLDAvis.display(lda_display)

In [None]:

# Get topic distributions for each document
doc_topics = best_lda.get_document_topics(corpus, minimum_probability=0)

# Build the document x topic matrix by TOPIC ID rather than by list position.
# gensim does not return topics whose probability falls below a tiny internal
# floor even when minimum_probability=0, so a document's list can be shorter
# than num_topics; positional unpacking would then shift the remaining
# probabilities into the wrong topic columns. Omitted topics are stored as 0.
topic_probs = np.zeros((len(corpus), best_lda.num_topics))
for row, doc in enumerate(doc_topics):
    for topic_id, probability in doc:
        topic_probs[row, topic_id] = probability

# Columns are named "Topic_0", "Topic_1", ..., "Topic_N"
topic_matrix = pd.DataFrame(
    topic_probs,
    columns=[f'Topic_{i}' for i in range(best_lda.num_topics)],
)

# Add Document ID: the row position of the document in `corpus`
topic_matrix['Document_ID'] = range(len(topic_matrix))

# Check the output
print(topic_matrix.head())

In [None]:
# Assign most dominant topic.
# Select the probability columns by name instead of "everything but the last
# column": after the first run Dominant_Topic is the last column, so a re-run
# would include Document_ID in the idxmax and fail when parsing "Document_ID".
topic_cols = [col for col in topic_matrix.columns if str(col).startswith("Topic_")]
topic_matrix['Dominant_Topic'] = topic_matrix[topic_cols].idxmax(axis=1)

# Convert "Topic_0" → 0, "Topic_1" → 1, etc.
topic_matrix['Dominant_Topic'] = topic_matrix['Dominant_Topic'].apply(lambda x: int(x.split('_')[1]))

print(topic_matrix[['Document_ID', 'Dominant_Topic']].head())

In [None]:
# Collect the 10 highest-weighted words of every topic, keyed by topic id
topic_words = {}
for topic_id in range(best_lda.num_topics):
    weighted_terms = best_lda.show_topic(topic_id, topn=10)
    topic_words[topic_id] = [term for term, _weight in weighted_terms]

# One row per topic, one column per word rank (Word_0 ... Word_9)
word_rank_columns = [f'Word_{rank}' for rank in range(10)]
topic_word_df = pd.DataFrame.from_dict(topic_words, orient='index', columns=word_rank_columns)

print(topic_word_df.head())

In [None]:
# Export the document/topic probabilities (row index omitted) and the per-topic
# top words (row index kept, since it is the topic id)
document_topic_csv = "document_topic_matrix__GSAi_Topic_Model_10_Topics_Convo_v2.csv"
topic_word_csv = "topic_word_distribution__GSAi_Topic_Model_10_Topics_Convo_v2.csv"

topic_matrix.to_csv(document_topic_csv, index=False)
topic_word_df.to_csv(topic_word_csv, index=True)

In [None]:

# Find the highest-probability topic of each document
topics_per_doc = []
for doc in corpus:
    top_pair = max(best_lda[doc], key=lambda pair: pair[1])
    topics_per_doc.append(top_pair[0])

# Tally how many documents fall under each topic
from collections import Counter
topic_counts = Counter(topics_per_doc)

# Print topic distribution in ascending topic-id order
for topic_id in sorted(topic_counts):
    count = topic_counts[topic_id]
    print(f"Topic {topic_id}: {count} documents")

In [None]:

# Merge the conversation table with the per-document topic probabilities.
#
# topic_matrix.Document_ID is the row POSITION of each document in `corpus`,
# which was built from convo_df's rows in order. convo_df's own index can have
# gaps (rows were removed as empty or duplicate), so using it as Document_ID
# attached topics to the wrong conversations. Number the rows positionally.
main_df = (
    convo_df.reset_index(drop=True)
    .rename_axis("Document_ID")
    .reset_index()
)

# Guard against stale notebook state: both tables must describe the same documents.
assert len(main_df) == len(topic_matrix), (
    "convo_df and topic_matrix have different numbers of documents; "
    "re-run the notebook from the preprocessing step"
)

# Merge main_df with topic_matrix (exactly one topic row per conversation)
merged_df = main_df.merge(topic_matrix, on="Document_ID", how="left", validate="one_to_one")

# Check the first few rows
print(merged_df.head())

In [None]:


# Display the merged conversation/topic table as the cell's output
merged_df

