In [None]:
import warnings

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from dartmouth_langchain.llms import DartmouthChatModel
from hdbscan import HDBSCAN
from langchain.prompts import PromptTemplate
import pandas as pd
from transformers import pipeline
from umap import UMAP

In [None]:
# Spell out every UMAP hyperparameter (seed included) for reproducibility.
umap_settings = {
    "n_neighbors": 15,
    "n_components": 5,  # dimensionality of the reduced embedding space
    "min_dist": 0.0,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

In [None]:
# HDBSCAN clustering settings; the model is handed to BERTopic later on.
hdbscan_settings = {
    "min_cluster_size": 5,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,  # cache the extra data needed to score unseen points
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

In [None]:
# Load the sample text CSV. Its first data row is a comment describing the
# column contents, so only the rows from the second one onward are kept.
docs = pd.read_csv("data/raw/2024_COFE_SS_sample_text.csv").iloc[1:]
docs

In [None]:
# Gather the non-missing entries of the "outcometxt" column as a plain list.
has_outcome_text = docs["outcometxt"].notna()
responses = docs.loc[has_outcome_text, "outcometxt"].to_list()
responses

In [None]:
# Fine-tune the topic representations with KeyBERT-inspired keyword extraction.
# (`KeyBERTInspired` is imported in the notebook's top import cell.)
representation_model = KeyBERTInspired()

# Build the topic model from the explicitly configured UMAP and HDBSCAN models
# so the whole pipeline uses the parameters set earlier in the notebook.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    top_n_words=20,
    verbose=True,
)

# Fit the model on the responses and keep the per-response results.
topics, probs = topic_model.fit_transform(responses)

In [None]:
# Plot an overview of the topics found by the fitted model; the figure is the
# cell's output.
topic_model.visualize_topics()

In [None]:
# Plot the individual responses, passing the same texts the model was fitted on.
topic_model.visualize_documents(responses)

In [None]:
# Per-document results table for the fitted responses.
# NOTE(review): this rebinds `docs`, which until this cell held the raw CSV
# frame. Earlier cells that read `docs["outcometxt"]` will presumably fail if
# re-run after this one, since the new frame is built from `responses` only.
docs = topic_model.get_document_info(responses)

In [None]:
# Approximate each response's distribution over the fitted topics. Only the
# first element of the returned pair is needed. Indexing it (instead of
# unpacking the second element into `_`) avoids rebinding `_`, which IPython
# uses for the previous cell's output.
topic_distr = topic_model.approximate_distribution(responses)[0]

In [None]:
# Show the approximated topic distribution for a single entry: index 1 picks
# the second row of `topic_distr` (presumably the second response; TODO
# confirm the row order matches `responses`).
topic_model.visualize_distribution(topic_distr[1])

In [None]:
# `DartmouthChatModel` and `PromptTemplate` are imported in the notebook's top
# import cell.
llm = DartmouthChatModel(model_name="codellama-13b-instruct-hf")

# Alternative prompt using <|start_header_id|>-style chat tokens, kept for
# reference in case the model above is swapped for one that expects them:
#
# prompt_template = PromptTemplate.from_template(
#     """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
#
# You are a topic representation model. Your task is to find a representative topic for a collection of similar texts.<|eot_id|><|start_header_id|>user<|end_header_id|>
#
# Here is the collection of texts that should all be labeled with the same topic representation: {texts}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
# )

# Active prompt: [INST] / <<SYS>> instruction markup with a system message and
# a single `{texts}` placeholder for the documents to be labelled.
prompt_template = PromptTemplate.from_template(
    """<s>[INST] <<SYS>>
You are a topic representation model. Your task is to find a representative topic for a collection of similar texts.
<</SYS>>

Here is the collection of texts that should all be labeled with the same topic representation: {texts} [/INST] """
)

# Smoke-test the model and the prompt on a tiny example; the reply is shown as
# the cell's output.
llm.invoke(prompt_template.format(texts="A happy dog. \n A cranky cat."))

In [None]:
# Start every document with an empty label; the per-topic LLM call below
# fills it in.
docs["LLM_representation"] = None


def find_representation(group):
    """Ask the LLM for a topic label describing one group of documents.

    Intended to be passed to ``DataFrame.groupby(...).apply``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group. Must contain a ``Document`` column
        of strings; they are joined with blank lines to form the prompt text.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``group`` plus an
        ``LLM_representation`` column. Every row holds the model's response,
        or ``None`` when the request failed.

    Notes
    -----
    Labelling is best-effort: any exception raised while querying the model or
    storing its response is reported through ``warnings.warn`` instead of
    being raised, so one failing group does not abort the others.
    """
    texts = "\n\n".join(group["Document"].to_list())
    try:
        response = llm.invoke(prompt_template.format(texts=texts))
        # `assign` builds a new frame, leaving the group object supplied by
        # `groupby.apply` unmodified.
        return group.assign(LLM_representation=response)
    except Exception as exc:
        # `groupby.apply` exposes the group key as `.name`; fall back to a
        # placeholder when the function is called on a plain frame.
        group_key = getattr(group, "name", "<unknown>")
        warnings.warn(
            f"LLM representation failed for group {group_key!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return group.assign(LLM_representation=None)


# Label each topic: run `find_representation` once per group of documents that
# share a "Topic" value.
docs_by_topic = docs.groupby("Topic")
results = docs_by_topic.apply(find_representation)

In [None]:
# Drop the index produced by the grouped apply, then list the distinct LLM
# labels found within each topic.
flat_results = results.reset_index(drop=True)
labels_per_topic = flat_results.groupby("Topic")["LLM_representation"].unique()
labels_per_topic.to_list()

In [None]:
# Sentiment classifier built from a DistilBERT checkpoint fine-tuned on SST-2.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Spot check: score the first ten documents assigned to topic 0.
topic_zero_documents = docs.loc[docs["Topic"] == 0, "Document"].to_list()
sentiment_pipeline(topic_zero_documents[:10])