In [None]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
import pandas as pd
from transformers import pipeline
from umap import UMAP

In [2]:
# Every UMAP setting is spelled out (including a fixed seed) for reproducibility
umap_model = UMAP(
    metric="cosine",
    min_dist=0.0,
    n_components=5,  # Dimensionality of the embedding after reduction
    n_neighbors=15,
    random_state=42,
)

In [3]:
# Clustering configuration that is handed to BERTopic further down
hdbscan_model = HDBSCAN(
    cluster_selection_method="eom",
    metric="euclidean",
    min_cluster_size=5,  # Minimum number of documents needed to form a topic
    prediction_data=True,
)

In [None]:
# Load the survey export and keep only rows that carry a free-text outcome response
docs = (
    pd.read_csv("data/raw/2024_COFE_SS_sample_text.csv")
    .iloc[1:]  # Row 0 is a comment describing the column contents, not a response
    .dropna(subset="outcometxt")
)
docs.head(2)

In [None]:
# (rows, columns) of the survey frame after the cleaning steps in the load cell
docs.shape

In [None]:
from langchain_dartmouth.llms import ChatDartmouth
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Both helper models use a fixed seed and zero temperature.
sentence_splitter = ChatDartmouth(
    temperature=0,
    seed=42,
    model_name="llama-3-1-8b-instruct",
)

# Same model configuration, with the reply piped through a JSON output parser
list_transformer = (
    ChatDartmouth(temperature=0, seed=42, model_name="llama-3-1-8b-instruct")
    | JsonOutputParser()
)

# The survey question the respondents answered; interpolated into the splitter prompt
SURVEY_PROMPT = "Please use the space below to describe the most important outcomes of your time as an undergraduate."

# Single human message with two placeholders: {survey_prompt} and {response}
_SPLITTER_INSTRUCTION = "The following is a response to the survey prompt '{survey_prompt}'. We want to analyze the topics mentioned in the response. To facilitate this analysis, split the response into stand-alone sentences. Make sure that the sentences can be analyzed in isolation. Make any references to previous sentences explicit by replacing pronouns with their proper noun. Here is the response: \n\n{response}"

splitter_prompt = ChatPromptTemplate.from_messages(
    [("human", _SPLITTER_INSTRUCTION)]
)

# Prompt that asks the model to turn the split sentences into a JSON list of
# objects, each with a single "sentence" key. That is the shape `split_sentences`
# reads back. The literal braces of the example are doubled so the prompt
# template does not treat them as placeholders; only {sentences} is a variable.
list_transformer_prompt = ChatPromptTemplate(
    [
        (
            "system",
            "You are a text processor that converts text into valid JSON format.",
        ),
        (
            "human",
            # Blank line after the sentences keeps the instruction from being
            # glued onto the last sentence.
            "The following is a list of sentences. The sentences: \n\n{sentences}\n\n"
            + 'Reformat them into a JSON list using the following schema: ```[{{"sentence": <sentence text>}}, {{"sentence": <sentence text>}}]```',
        ),
    ]
)

# One raw survey answer used to eyeball both LLM steps end to end
SAMPLE_RESPONSE = "My most valuable experiences at Dartmouth had nothing to do with the classes that I was taking; if anything, those classes inhibited my from doing what I actually cared about, which was my extracurricular activities, political advocacy, and social connection, The environment that Dartmouth placed me in gave me the resources and situations necessary to excel at my non-classroom activities, which were the ones that I actually cared about since they will be what I do after I graduate and are what make me happy, "

print(SAMPLE_RESPONSE)
print(20 * "-")

# Step 1: free-text answer -> stand-alone sentences (plain model reply)
response = sentence_splitter.invoke(
    splitter_prompt.format(
        response=SAMPLE_RESPONSE,
        survey_prompt=SURVEY_PROMPT,
    )
)
response.pretty_print()
print(20 * "-")

# Step 2: model reply -> parsed JSON, shown as the cell output
response = list_transformer.invoke(
    list_transformer_prompt.format(sentences=response.content)
)
response

In [None]:
# Free-text responses before they are split into sentences
docs.loc[:, "outcometxt"]

In [None]:
def split_sentences(response):
    """Split one survey response into stand-alone sentences using the LLM chain.

    The response is first rewritten into stand-alone sentences by
    ``sentence_splitter``; that reply is then converted to JSON by
    ``list_transformer``.

    Args:
        response: The raw free-text survey response.

    Returns:
        A list of sentence strings. If nothing usable can be extracted from the
        parsed model output, a warning is issued and ``[response]`` is returned
        so the document is kept (unsplit) instead of aborting the whole run.

    Errors raised by the model calls themselves are not caught here.
    """
    import warnings

    sentences = sentence_splitter.invoke(
        splitter_prompt.format(survey_prompt=SURVEY_PROMPT, response=response)
    )
    result = list_transformer.invoke(
        list_transformer_prompt.format(sentences=sentences.content)
    )

    # The model does not always honour the requested schema: tolerate a single
    # object instead of a list, plain strings instead of objects, and skip
    # entries without usable text.
    items = result if isinstance(result, list) else [result]
    extracted = []
    for item in items:
        text = item.get("sentence") if isinstance(item, dict) else item
        if isinstance(text, str) and text.strip():
            extracted.append(text)

    if not extracted:
        warnings.warn(
            f"Could not extract sentences from model output {result!r}; "
            "keeping the response unsplit.",
            RuntimeWarning,
            stacklevel=2,
        )
        return [response]
    return extracted


# Replace each response with its list of stand-alone sentences (one LLM round-trip per row)
docs = docs.assign(outcometxt=docs["outcometxt"].apply(split_sentences))

In [None]:
# One row per sentence instead of one row per response
docs = docs.explode("outcometxt")

In [None]:
# Flat list of sentences that is fed to the topic model
responses = docs["outcometxt"].dropna().to_list()
# Show a short preview only: echoing every survey sentence bloats the notebook
# and puts all free-text answers into the saved output.
responses[:5]

In [None]:
from bertopic.representation import KeyBERTInspired

# KeyBERT-inspired model used to fine-tune the topic representations
representation_model = KeyBERTInspired()

# Assemble BERTopic from the explicitly configured sub-models defined above
topic_model = BERTopic(
    top_n_words=20,
    representation_model=representation_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

topics, probs = topic_model.fit_transform(responses)

In [None]:
# BERTopic's built-in topic overview plot for the model fitted on all sentences
topic_model.visualize_topics()

In [None]:
# BERTopic's built-in document-level plot; takes the same sentences the model was fitted on
topic_model.visualize_documents(responses)

In [None]:
# NOTE(review): this rebinds `docs` from the survey DataFrame to BERTopic's
# per-document info table. Later cells rely on its `Document` and `Topic` columns.
# The survey frame is no longer reachable afterwards; re-run the load/split cells
# to get it back.
docs = topic_model.get_document_info(responses)

In [None]:
# Sentences assigned to topic 0, which is analysed further with its own topic model
subset = docs.loc[docs["Topic"] == 0]
# Seeded so the preview is the same on every run; capped so that a topic with
# fewer than two sentences does not raise.
subset.sample(n=min(2, len(subset)), random_state=42)

In [None]:
# Give the sub-model its own unfitted UMAP/HDBSCAN instances with the same
# hyperparameters. BERTopic fits the objects it is handed, so passing
# `umap_model`/`hdbscan_model` directly would refit the very instances that
# `topic_model` still holds.
subset_umap_model = UMAP(**umap_model.get_params(deep=False))
subset_hdbscan_model = HDBSCAN(**hdbscan_model.get_params(deep=False))

subset_topic_model = BERTopic(
    umap_model=subset_umap_model,
    hdbscan_model=subset_hdbscan_model,
    representation_model=representation_model,
    top_n_words=20,
    verbose=True,
)

# A plain list is passed (rather than the filtered Series with its gappy index),
# and the probabilities get their own name so `probs` from the full model is
# not overwritten.
subset_topics, subset_probs = subset_topic_model.fit_transform(
    subset.Document.to_list(),
)

In [None]:
# Per-document info table for the sub-model fitted on the topic-0 sentences
subset_docs = subset_topic_model.get_document_info(subset["Document"])

In [None]:
# Document-level plot for the sub-model, fed with the plain list of sentences
subset_topic_model.visualize_documents(subset_docs["Document"].to_list())

In [None]:
# Show full sentence text instead of pandas' truncated column preview
pd.set_option("display.max_colwidth", None)
# Sentences that the sub-model assigned to sub-topics 1 and 2
subset_docs.loc[subset_docs["Topic"].isin([1, 2]), ["Document", "Topic"]]

In [None]:
# Only the first element of the returned pair is needed. Indexing it avoids
# binding `_` at the top level, which would shadow IPython's last-output variable.
topic_distr = topic_model.approximate_distribution(responses)[0]

In [None]:
# Zero out weak topic weights (< 0.2) in place so only clear associations remain.
# The same 0.2 cut-off is used again when the matrix is binarised further down.
topic_distr[topic_distr < 0.2] = 0

In [None]:
import seaborn as sns


# Heatmap of the thresholded distribution; rows are documents, columns are topics.
# Labels make the figure readable on its own; the trailing `;` hides the repr.
ax = sns.heatmap(topic_distr)
ax.set(
    xlabel="Topic",
    ylabel="Document",
    title="Approximate topic distribution per document",
);

In [None]:
import pandas as pd

# Wrap the distribution matrix in a DataFrame so the DataFrame API is available below
topic_distr = pd.DataFrame(data=topic_distr)
topic_distr

In [None]:
# Boolean document/topic membership at the 0.2 cut-off, then pairwise Kendall correlation
topic_association = topic_distr.ge(0.2)
topic_association.corr(method="kendall")

In [None]:
from langchain_dartmouth.llms import ChatDartmouth
from langchain.prompts import ChatPromptTemplate

# Fixed seed and zero temperature, matching the other models in this notebook,
# so the generated topic labels are reproducible between runs.
llm = ChatDartmouth(model_name="llama-3-1-8b-instruct", seed=42, temperature=0)

# System message sets the labelling task; the human message carries the texts of
# one topic via the {texts} placeholder.
prompt = ChatPromptTemplate(
    [
        (
            "system",
            "You are a topic representation model. Your task is to find a representative topic for a collection of similar texts.",
        ),
        (
            "human",
            "Here is the collection of texts that should all be labeled with the same topic representation: {texts} ",
        ),
    ]
)

# Smoke test with two toy texts
llm.invoke(prompt.format(texts="A happy dog. \n\n A cranky cat."))

In [None]:
# Placeholder column; stays None for any topic whose LLM call fails
docs = docs.assign(LLM_representation=None)


def find_representation(group):
    """Label one topic group with an LLM-generated representation.

    All documents of the group are joined with blank lines, sent to ``llm``
    through ``prompt``, and the reply text is written to the
    ``LLM_representation`` column of every row.

    Args:
        group: DataFrame holding the rows of a single topic; needs a
            ``Document`` column.

    Returns:
        A copy of ``group`` with ``LLM_representation`` filled in. If the LLM
        call fails, a warning is issued and the copy is returned with the
        column left as it was, so one bad topic does not stop the others.
    """
    import warnings

    # Work on a copy: the frame handed in by groupby.apply is not modified.
    labelled = group.copy()
    texts = "\n\n".join(labelled.Document.to_list())
    try:
        response = llm.invoke(prompt.format(texts=texts))
    except Exception as error:
        # Best-effort labelling, but report the failure instead of hiding it.
        warnings.warn(
            f"LLM representation failed for group {getattr(group, 'name', '?')!r}: {error!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return labelled
    labelled["LLM_representation"] = response.content
    return labelled


# One find_representation call per topic group; labelled rows are collected in `results`
results = docs.groupby(by="Topic").apply(find_representation)

In [None]:
# NOTE(review): the labels produced by the groupby/apply are collected in `results`.
# `docs` presumably still holds the None placeholders here; confirm whether this
# cell was meant to inspect `results` instead.
docs["LLM_representation"]

In [None]:
# Distinct LLM labels per topic, as a list with one entry per topic
(
    results.reset_index(drop=True)
    .groupby("Topic")["LLM_representation"]
    .unique()
    .to_list()
)

In [None]:
# Sentiment classifier from the transformers pipeline API
sentiment_pipeline = pipeline(
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    task="sentiment-analysis",
)
# Score the first ten sentences assigned to topic 0
sentiment_pipeline(docs.loc[docs["Topic"] == 0, "Document"].head(10).to_list())