In [None]:
import sys
from datetime import datetime

import pandas as pd

sys.path.append("../src/")
from utils import LocalPLM, LocalModelArguments

In [None]:
# Settings for the local model wrapper. The meaning of each field is defined
# by LocalModelArguments in the project's `utils` module (not shown in this
# notebook) — confirm there before changing any value.
args = LocalModelArguments(
    model_name_or_path = "microsoft/Phi-4-mini-instruct",
    cuda_devices = "0",
    use_4bit_quantization = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = "float16",
    use_nested_quant = True,
    use_reentrant = True
)

# `model` is read as a global by understand_conversation further down.
model = LocalPLM(args)

In [None]:
# Directory passed to get_chat, which looks inside it for files named
# "Direct Messages - <user> [*.csv".
DATA_PATH = "../discord-chat"

In [None]:
import glob
def get_chat(user : str, path : str):
    """Locate the exported direct-message CSV for ``user`` inside ``path``.

    NOTE: a later cell in this notebook defines another ``get_chat`` that also
    parses the CSV into a DataFrame; once that cell runs it shadows this one.
    This version only resolves the file path.

    Args:
        user: Display name that appears in the export file name.
        path: Directory containing the exported CSV files.

    Returns:
        Path of the matching CSV file (the first in sorted order when several
        files match).

    Raises:
        FileNotFoundError: If no file matches the user's name.
    """
    # Escape the user name so characters such as "*" or "[" in a display name
    # are matched literally rather than being treated as glob wildcards.
    pattern = path + "/Direct Messages - " + glob.escape(user) + " [*.csv"

    # glob returns matches in arbitrary order; sort so the pick is deterministic.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No conversation(s) found at path {pattern}")
    return matches[0]

In [None]:
from datetime import timedelta

def is_new_conversation(delay : timedelta, max_delay_mins : int = 50):
    """Report whether the gap before a message starts a new conversation.

    Args:
        delay: Time elapsed since the previous message.
        max_delay_mins: Longest gap, in minutes, that still counts as the
            same conversation.

    Returns:
        True when ``delay`` is strictly greater than ``max_delay_mins`` minutes.
    """
    return delay > timedelta(minutes = max_delay_mins)
    
def read_conversation(path : str) -> pd.DataFrame:
    """Load an exported chat CSV and flag where each conversation begins.

    The frame read from ``path`` must contain a ``Date`` column. Three columns
    are written:

    * ``Date``  - the original strings parsed into timezone-aware datetimes.
    * ``Delay`` - time since the previous row (zero for the first row).
    * ``Start`` - True where ``is_new_conversation(Delay)`` holds; the first
      row is always marked True.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the columns above added or replaced.

    Raises:
        ValueError: If a ``Date`` value does not match the expected format.
    """
    def parse_time_str(time : str) -> datetime:
        # Expects fractional seconds followed by a literal "0" and a UTC
        # offset, e.g. "2024-01-02T03:04:05.1230000+00:00" (%f consumes at
        # most six digits, the explicit "0" absorbs the seventh).
        return datetime.strptime(time,'%Y-%m-%dT%H:%M:%S.%f0%z')

    c = pd.read_csv(path)
    c["Date"] = c["Date"].map(parse_time_str)
    c["Delay"] = c["Date"] - c["Date"].shift(1)
    # The first row has no predecessor, so its delay is NaT; treat it as zero.
    c["Delay"] = c["Delay"].fillna( timedelta(seconds=0) )
    c["Start"] = c["Delay"].map(is_new_conversation)

    # The first message always opens a conversation. Guard against an empty
    # file: assigning through .loc with a missing label would append a new,
    # mostly-NaN row instead of doing nothing.
    if not c.empty:
        c.loc[c.index[0], "Start"] = True
    return c

def get_chat(user : str, path : str) -> pd.DataFrame:
    """Find and load the exported direct-message CSV for ``user``.

    Searches ``path`` for a file named ``Direct Messages - <user> [*.csv`` and
    loads it with ``read_conversation``.

    Args:
        user: Display name that appears in the export file name.
        path: Directory containing the exported CSV files.

    Returns:
        The DataFrame produced by ``read_conversation`` for the matching file
        (the first in sorted order when several files match).

    Raises:
        FileNotFoundError: If no file matches the user's name.
    """
    # Escape the user name so characters such as "*", "?" or "[" in a display
    # name are matched literally instead of acting as glob wildcards.
    path = path + "/Direct Messages - " + glob.escape(user) + " [*.csv"

    # glob returns matches in arbitrary order; sort so the pick is deterministic.
    conversation = sorted(glob.glob(path))

    if conversation:
        return read_conversation(conversation[0])
    else:
        raise FileNotFoundError(f"No conversation(s) found at path {path}")

In [None]:
def get_conversation_indices(messages : pd.DataFrame) -> list[list[int]]:
    """Group row positions into conversations using the ``Start`` column.

    Each conversation runs from a row whose ``Start`` is True up to (but not
    including) the next such row; the last conversation runs to the end of
    the frame. Rows before the first start are not included in any group.

    Args:
        messages: Frame with a boolean ``Start`` column.

    Returns:
        One list of integer row positions per conversation, in order. The
        values are positions (suitable for ``DataFrame.iloc``), not index
        labels.
    """
    # Work with positions rather than index labels so the result stays valid
    # for iloc even when the frame does not have a default RangeIndex.
    start_positions = (messages["Start"] == True).to_numpy().nonzero()[0]

    # Append the frame length as a final boundary so the conversation after
    # the last start is kept instead of being dropped.
    boundaries = [int(p) for p in start_positions] + [len(messages)]

    return [list(range(lo, hi)) for lo, hi in zip(boundaries, boundaries[1:])]

def split_by_conversations(messages : pd.DataFrame, min_conv_length : int = 5, max_conv_length : int = 30) -> list[pd.DataFrame]:
    """Cut a message frame into per-conversation chunks.

    Conversations are taken from ``get_conversation_indices``. Any conversation
    with fewer than ``min_conv_length`` messages is discarded; the rest are
    cut into consecutive pieces of at most ``max_conv_length`` messages.

    The minimum-length filter runs before the cutting step, so the final
    piece of a long conversation may be shorter than ``min_conv_length``.

    Args:
        messages: Frame of messages carrying a ``Start`` column.
        min_conv_length: Smallest conversation (in messages) that is kept.
        max_conv_length: Largest number of messages per returned chunk.

    Returns:
        Chunks in original order, each selected from ``messages`` with ``iloc``.
    """
    chunks = []
    for positions in get_conversation_indices(messages):
        if len(positions) >= min_conv_length:
            # Walk the conversation in windows of max_conv_length rows
            for offset in range(0, len(positions), max_conv_length):
                window = positions[offset:offset + max_conv_length]
                chunks.append(messages.iloc[window])
    return chunks

In [None]:
# Load the direct-message history for the user "Grumpy Koala" from DATA_PATH.
c = get_chat("Grumpy Koala", DATA_PATH)

In [None]:
# NOTE: this rebinds `c` from a single DataFrame to a list of DataFrames, so
# the cell is not idempotent — re-run the get_chat cell before running it again.
c = split_by_conversations(c)

In [None]:
# Preview the first rows of the first conversation chunk.
c[0].head()

In [None]:
def to_string(messages : pd.DataFrame) -> str:
    """Render a conversation as plain text for use in a prompt.

    The output starts with a header naming the distinct authors and the
    timestamp of the last message, followed by one entry per message made of
    the author, the time of day and the message content.

    Args:
        messages: Non-empty frame with ``Author``, ``Date`` (datetime-like)
            and ``Content`` columns.

    Returns:
        The formatted transcript.

    Raises:
        IndexError: If ``messages`` has no rows.
    """
    end_time = messages.iloc[-1].Date

    parts = [
        "Conversation history between " + ", ".join(messages.Author.unique()),
        "\n" + end_time.strftime("%y/%m/%d, %H:%M:%S") + "\n\n",
    ]
    for _, message in messages.iterrows():
        # An empty CSV cell is read as NaN; emit an empty body rather than
        # the literal text "nan".
        content = "" if pd.isna(message.Content) else message.Content
        # Single quotes inside the f-string keep this valid before Python 3.12.
        parts.append(f"{message.Author} {message.Date.strftime('%H:%M:%S')}")
        parts.append(f"\n{content}\n\n")

    return "".join(parts)

In [None]:
def gen_prompt(messages : pd.DataFrame, prompt : str, context : str | None = None) -> list[dict[str, str]]:
    """Build a two-message chat prompt for a conversation.

    Args:
        messages: Conversation to include; rendered with ``to_string``.
        prompt: Instruction text placed in the system message.
        context: Optional extra context. When given (and non-empty) it is
            appended to the system message together with "Answer concisely.".

    Returns:
        A list of two ``{"role", "content"}`` dicts: the system message with
        the instruction, then the user message with the rendered transcript.
    """
    transcript = to_string(messages)

    # Keep the parameters untouched; build the system text under its own name.
    system_text = prompt
    if context:
        system_text += f"\nContext: {context}.\nAnswer concisely."

    return [
        {"role":"system","content":system_text},
        {"role":"user","content":transcript},
    ]

In [None]:
def understand_conversation(messages : pd.DataFrame, context : str | None = None) -> dict:
    """Ask the global ``model`` about the relationship and topic of a conversation.

    Two queries are made in sequence. The caller-supplied ``context`` is used
    only for the relationship query; the topic query receives the generated
    relationship text as its context.

    Args:
        messages: Conversation to analyse.
        context: Optional context passed to the relationship query.

    Returns:
        Dict with keys ``"relationship"`` and ``"topic"`` holding the ``.text``
        of each ``model.generate`` result.
    """
    def ask(instruction : str, extra_context : str | None):
        # Build the chat prompt and return the text of the model's reply
        chat = gen_prompt(messages, instruction, context=extra_context)
        return model.generate(chat,temperature=1,max_new_tokens = 128).text

    relationship = ask(
        "Read the following conversation history and tell me what you think the relationship is between the users. Answer succinctly.",
        context,
    )
    topic = ask(
        "Read the following conversation history and tell me what was discussed. Answer succinctly.",
        relationship,
    )

    return {"relationship":relationship,"topic":topic}

In [None]:
from tqdm.notebook import tqdm

def understand_conversations(conversations : list[pd.DataFrame]) -> pd.DataFrame:
    """Run ``understand_conversation`` over a sequence of conversations.

    Conversations are processed in order. Each one after the first is given
    the previous conversation's topic as context, so meaning is built up
    sequentially.

    Args:
        conversations: Conversation frames, in chronological order.

    Returns:
        DataFrame with one row per conversation and the columns
        ``relationship`` and ``topic`` (empty when ``conversations`` is empty).
    """
    # One {"relationship", "topic"} dict per conversation
    contexts = []

    # There is no previous conversation to draw on for the first item.
    context_str = None

    for conversation in tqdm(conversations, "Understanding conversations"):
        context = understand_conversation(conversation, context=context_str)
        contexts.append(context)

        # Pass this conversation's topic on to the next one. Single quotes
        # inside the f-string keep this valid before Python 3.12.
        context_str = f"Previous discussion: {context['topic']}"

    # A list of dicts converts directly; no intermediate dict-of-lists needed.
    return pd.DataFrame(contexts)

In [None]:
# Run the model over every conversation chunk; the result has one row per
# chunk with "relationship" and "topic" columns.
contexts = understand_conversations(c)