In [67]:
import pandas as pd

from langchain_dartmouth.llms import ChatDartmouth
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser


from itertools import batched

In [None]:
# Load the raw survey export. The first row (label 0) of the text column is
# read here as the wording of the survey question and is shown as cell output.
docs = pd.read_csv("data/raw/2024_COFE_SS_sample_text.csv")
question = docs["outcometxt"].loc[0]
question

In [None]:
# The first row (label 0) is a comment on the column contents, not an answer.
# It is dropped by label rather than by position (`iloc[1:]`) so that this cell
# is idempotent: re-running it no longer discards one more real response each
# time. `errors="ignore"` makes the second and later runs a no-op.
docs = docs.drop(index=0, errors="ignore")
# Rows without any text in the response column cannot be analysed.
docs = docs.dropna(subset="outcometxt")
responses = docs["outcometxt"].to_list()
responses[:5]

In [70]:
# Chat model used for the topic/sentiment extraction pass.
# Temperature 0 together with a fixed seed is presumably meant to make the
# outputs repeatable between runs.
# NOTE(review): max_tokens is very large; confirm it is within what the
# deployed model accepts.
llm = ChatDartmouth(
    max_tokens=60_000,
    seed=42,
    temperature=0,
    model_name="llama-3-1-8b-instruct",
)

# Two made-up survey responses used as the few-shot example input.
# They are separated by exactly the delimiter that the prompt announces and
# that the batching loop uses to join real responses ("\n------\n", six
# dashes). The example previously used five dashes, which contradicted the
# format described to the model.
example_response = (
    "Dartmouth has been a transformative experience that's shaped my academic, "
    "personal, and professional growth. I've gained invaluable knowledge and "
    "skills through rigorous courses in econ and international relations, which "
    "have helped me develop a nuanced perspective on global issues. However, I've "
    "also struggled with the intense academic pressure and lack of diversity in "
    "certain departments.\n------\n"
    "My time at Dartmouth has been marked by incredible opportunities for growth "
    "and learning, with notable high points including studying abroad in Asia and "
    "leading a successful student organization. However, I've also struggled with "
    "the intense competition for research funding and the sometimes isolating "
    "Upper Valley setting, which can make it difficult to connect with peers from "
    "other parts of the country."
)

# Expected model output for the two example responses above; it is inserted
# into the prompt after "Output:" as the few-shot demonstration.
# Braces are doubled so that PromptTemplate's f-string-style formatting renders
# them as literal "{" / "}" instead of treating them as input variables.
# NOTE(review): the "<original response N>" placeholders are unquoted, so the
# example itself is not strictly valid JSON even though the prompt asks for
# valid JSON. Consider quoting them, but confirm the effect on model output.
example_output = """[
    {{
        "response" : <original response 1>,
        "topics": [
            {{"name": "academic and personal growth", "sentiment": "positive"}},
            {{"name": "academic rigor", "sentiment": "positive"}},
            {{"name": "academic pressure", "sentiment": "negative"}},
            {{"name": "lack of diversity", "sentiment": "negative"}}
        ]
    }},
    {{
        "response" : <original response 2>,
        "topics": [
            {{"name": "Dartmouth student organization experience", "sentiment": "positive"}},
            {{"name": "study abroad experience", "sentiment": "positive"}},
            {{"name": "research funding", "sentiment": "negative"}},
            {{"name": "Upper Valley setting", "sentiment": "negative"}}
        ]
    }}
]"""

# Prompt for the first pass: task description, one worked example (input and
# expected JSON output), then the batch of real responses. `{question}` and
# `{responses}` are the only template variables; they are filled at invoke time.
prompt = PromptTemplate.from_template(
    "".join(
        [
            # Task description
            "The following are survey responses from students in their senior year at Dartmouth College.\n",
            "The prompt was: '{question}'\n",
            "Identify the topics mentioned in each response, ",
            "as well as the sentiments expressed towards those topics. ",
            "Format your response in valid JSON. Respond only with the JSON itself. ",
            # Few-shot example
            "Here is an example: \n\n",
            "Response: \n'",
            example_response,
            "'\n\n",
            "Output: \n",
            example_output,
            "\n\n",
            # Actual input
            "Here are the responses to process, separated by '\\n------\\n':\n\n",
            "{responses}",
        ]
    )
)

In [76]:
# Pipeline: fill the prompt -> query the model -> parse the reply as JSON.
parser = JsonOutputParser()
chain = prompt | llm | parser

# Only a small sample is processed for now, several responses per model call.
MAX_RESPONSES = 20
BATCH_SIZE = 10

outputs = []
for batch in batched(responses[:MAX_RESPONSES], n=BATCH_SIZE):
    # The delimiter must match the one announced in the prompt text.
    batch_text = "\n------\n".join(batch)
    response = chain.invoke({"responses": batch_text, "question": question})
    # The parser returns whatever JSON value the model produced. If the model
    # answers with a single object instead of an array, extending the list
    # with that dict would add its *keys* rather than the record itself.
    if isinstance(response, dict):
        response = [response]
    outputs.extend(response)

# TODO:

Consolidate the topics in a second pass:
- Collect all topics identified in the first pass
- Prompt the model to merge overlapping topics into a standardized set of labels
- Go back over the first-pass results and replace each topic with its standardized label