In [None]:
import sys
sys.path.append("../src/")
from utils import LocalPLM, LocalModelArguments

In [None]:
# Settings used to load the language model locally.
# NOTE(review): the meaning of each field is defined by LocalModelArguments in
# ../src/utils; the names suggest 4-bit NF4 quantization with float16 compute and
# nested quantization on CUDA device 0 — confirm against that class.
args = LocalModelArguments(
    model_name_or_path = "microsoft/Phi-4-mini-instruct",
    cuda_devices = "0",
    use_4bit_quantization = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = "float16",
    use_nested_quant = True,
    use_reentrant = True
)

# Shared model instance; the analysis functions in this notebook call model.generate(...).
model = LocalPLM(args)

In [None]:
# Directory passed to get_chat, which looks there for "Direct Messages - <user> [...].csv" files.
DATA_PATH = "../discord-chat"

In [None]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np

    
def read_conversation(path : str) -> pd.DataFrame:
    """
    Load a conversation CSV and add the time elapsed between consecutive messages.

    The "Date" column is parsed with the format '%Y-%m-%dT%H:%M:%S.%f0%z', i.e. an
    ISO-like timestamp whose fractional seconds are followed by a literal "0" and
    then a UTC offset. A timestamp that does not match raises ValueError.

    Args:
        path (str): Location of the CSV file. It must contain a "Date" column.

    Returns:
        history (DataFrame): The CSV contents with "Date" parsed to datetimes and a
            new "Delay" column holding the gap to the previous row (zero for the first row).
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f0%z'

    history = pd.read_csv(path)
    history["Date"] = history["Date"].map(
        lambda raw: datetime.strptime(raw, timestamp_format)
    )

    # The first row has no predecessor, so its gap is missing until filled with zero.
    previous_date = history["Date"].shift(1)
    history["Delay"] = (history["Date"] - previous_date).fillna(timedelta(seconds=0))
    return history

def get_chat(user : str, path : str) -> pd.DataFrame:
    """
    Load the exported direct-message history with a given user.

    Searches `path` for files named "Direct Messages - <user> [<anything>.csv" and
    parses the first match, in sorted filename order, with read_conversation.

    Args:
        user (str): User name exactly as it appears in the export's file name.
        path (str): Directory containing the exported CSV files.

    Returns:
        conversation (DataFrame): The parsed conversation history.

    Raises:
        FileNotFoundError: If no file in `path` matches the expected name.
    """
    # Imported here so the function works regardless of which notebook cells have run.
    import glob

    # `user` and `path` are literal text, so any glob metacharacters in them are
    # escaped; "[[]" is the explicit pattern for the literal "[" in the file name.
    pattern = glob.escape(path) + "/Direct Messages - " + glob.escape(user) + " [[]*.csv"

    # Sorted so the chosen file is deterministic when several exports match.
    matches = sorted(glob.glob(pattern))

    if not matches:
        raise FileNotFoundError(f"No conversation(s) found at path {pattern}")

    return read_conversation(matches[0])

In [None]:
import glob



def split_by_conversations(messages : pd.DataFrame, gap_mins : int = 50, min_conv_length : int = 5, max_conv_length : int = 30) -> list[pd.DataFrame]:
    """
    Split a Discord conversation history into a list of shorter conversations separated by gap_mins minutes.

    The input frame is not modified. Each returned frame carries an extra boolean
    "Start" column marking the rows that begin a new conversation.

    Args:
        messages (DataFrame): Discord conversation history. Must have a "Delay" column holding the time elapsed since the previous message.
        gap_mins (int): How many minutes must have elapsed since the last message for the current message to be treated as the start of a new conversation.
        min_conv_length (int): If a conversation (or a chunk of one) has fewer messages than this, don't include it in the list.
        max_conv_length (int): If a conversation has more messages than this, slice it up into chunks of this size.

    Returns:
        conversations (list[DataFrame]): List of all conversations ordered from least to most recent.

    Raises:
        ValueError: If max_conv_length is less than 1.
    """
    if max_conv_length < 1:
        raise ValueError("max_conv_length must be at least 1")

    if messages.empty:
        return []

    # Work on a copy so the caller's frame does not gain a "Start" column.
    messages = messages.copy()

    # A message starts a new conversation when it arrives more than gap_mins after
    # the previous one; the very first message always starts one.
    is_start = (messages["Delay"] > timedelta(minutes=gap_mins)).to_numpy(dtype=bool, copy=True)
    is_start[0] = True
    messages["Start"] = is_start

    # Row positions (not index labels) are used throughout, so the frame may carry
    # any index. Each conversation runs from its start up to the next start, and the
    # final one runs to the end of the history.
    starts = np.flatnonzero(is_start).tolist()
    ends = starts[1:] + [len(messages)]

    conversations = []
    for begin, end in zip(starts, ends):
        if end - begin < min_conv_length:
            continue

        # Slice long conversations so no chunk exceeds max_conv_length.
        for chunk_begin in range(begin, end, max_conv_length):
            chunk_end = min(chunk_begin + max_conv_length, end)
            if chunk_end - chunk_begin < min_conv_length:
                continue
            conversations.append(messages.iloc[chunk_begin:chunk_end].copy())

    return conversations

In [None]:
# Load the direct-message history with "ThisGreenDingo" from DATA_PATH.
c = get_chat("ThisGreenDingo", DATA_PATH)

In [None]:
# Replace the full history with a list of conversations, using a gap_mins of 120 minutes.
c = split_by_conversations(c, 120)

In [None]:
def to_string(messages : pd.DataFrame, context : str | None = None) -> str:
    messages = messages

    end_time = messages.iloc[-1].Date
    
    string = "Conversation history between " + ", ".join(messages.Author.unique()) + "."
    
    string += "\nObtained " + end_time.strftime("%y/%m/%d, %H:%M:%S") + "."

    if context: string += "\nContext of the conversation:\n" + context
    
    for i, message in messages.iterrows():
        string += f"\n\n{message.Author} {message.Date.strftime("%H:%M:%S")}"
        string += f"\n{message.Content}"
    
    return string

In [None]:
from typing import Literal
def gen_prompt(messages : pd.DataFrame, prompt : str, context : str | None = None, context_role : Literal["system", "user"] = "system") -> list[dict[str, str]]:
    """
    Generate a Chat Template prompt to perform NLP tasks on a Discord conversation history.

    Args:
        messages (DataFrame): The conversation history.
        prompt (str): The system prompt to give the LLM.
        context (str, optional): Optional additional information related to the conversation. If provided, aids LLM performance.
        context_role (Literal["system", "user"]) : Whether to append the context to the system prompt or the conversation history. Adding context to the system prompt usually yields better results. Defaults to "system".

    Returns:
        chat (list[dict[str, str]]): Two chat messages: a "system" message holding the
            instructions and a "user" message holding the rendered conversation.
    """
    # In "user" mode the context travels inside the rendered conversation instead.
    history = to_string(messages, context=context if context_role == "user" else None)

    system_prompt = prompt
    if context_role == "system":
        # Only mention context when there is some; otherwise the model would be
        # told "Context: None.".
        if context:
            system_prompt += f"\nContext: {context}."
        system_prompt += "\nAnswer concisely."

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": history},
    ]

In [None]:
def understand_conversation(messages : pd.DataFrame, target_user : str = "alzter", context : str | None = None, context_role : Literal["system", "user"] = "system") -> dict:
    """
    Use NLP to understand the meaning of a Discord conversation history from the perspective of a single user.

    Returns three analyses of the conversation:
        - Interest: The level of interest from the target user in the conversation.
        - Relationship: The relationship between the target user and other users.
        - Topic: The topic of the conversation between the users.

    The analyses are chained: the interest answer is added to the context of the
    relationship question, and both answers form the context of the topic question.
    The interest question itself is asked without any context.

    Args:
        messages (DataFrame): The conversation history.
        target_user (str): Which user to focus on when analysing the conversation.
        context (str, optional): Optional additional information related to the conversation. If provided, aids LLM performance.
        context_role (Literal["system", "user"]) : Whether to append the context to the system prompt or the conversation history. Adding context to the system prompt usually yields better results. Defaults to "system".

    Returns:
        understanding (dict): The analysis of the conversation, with keys "interest", "relationship" and "topic".
    """
    # context_role is forwarded to every prompt so the caller's choice takes effect.
    interest_prompt = gen_prompt(
        messages,
        f"Read the following conversation and tell me how interested {target_user} sounds in it. Be succinct.",
        context_role=context_role,
    )
    interest = model.generate(interest_prompt, max_new_tokens=64).text

    # Combine the caller's context (if any) with the interest answer.
    relationship_context = context + ", " + interest if context else interest
    relationship_prompt = gen_prompt(
        messages,
        "Read the following conversation history and tell me what you think the relationship is between the users. Answer succinctly.",
        context=relationship_context,
        context_role=context_role,
    )
    relationship = model.generate(relationship_prompt, temperature=1, max_new_tokens=128).text

    topic_prompt = gen_prompt(
        messages,
        "Read the following conversation history and tell me what was discussed. Answer succinctly.",
        context=relationship + ", " + interest,
        context_role=context_role,
    )
    topic = model.generate(topic_prompt, temperature=1, max_new_tokens=128).text

    return {"interest": interest, "relationship": relationship, "topic": topic}

In [None]:
from tqdm.notebook import tqdm

def understand_conversations(conversations : list[pd.DataFrame]) -> pd.DataFrame:
    """
    Super understand_conversation:
    Analyses meaning for a series of Discord conversations sequentially.

    Args:
        conversations (list[DataFrame]): Conversations in the order they should be analysed.

    Returns:
        contexts (DataFrame): One row per conversation, with one column per key of the
            dict returned by understand_conversation. Empty if `conversations` is empty.
    """
    # We will generate the context for each conversation
    contexts = []

    # Each conversation is given the context of the previous
    # conversation to recursively build meaning. To start with
    # we have zero previous context, so set context_str to None.
    context_str = None

    # Parse the meaning of each conversation sequentially
    for conversation in tqdm(conversations, "Understanding conversations"):
        context = understand_conversation(conversation, context=context_str)
        contexts.append(context)

        # Give the next conversation the summarised topic of this
        # conversation for added context to improve meaning extraction.
        # Single quotes inside the f-string keep this valid before Python 3.12.
        context_str = f"Previous discussion: {context['topic']}"

    # A list of dicts converts directly into one row per conversation.
    return pd.DataFrame(contexts)

In [None]:
# Display the first conversation.
c[0]

In [None]:
# Context in system prompt: understand_conversation defaults to context_role="system".

understand_conversation(c[0])

In [None]:
# Context in user message: request that context be placed in the conversation
# history rather than the system prompt.

understand_conversation(c[0], context_role="user")

In [None]:
# Analyse conversations 3 to 9 in order; each analysis is given the previous conversation's topic as context.
contexts = understand_conversations(c[3:10])

In [None]:
# Lift pandas' display truncation so the full analysis text is visible:
# all rows, all columns, uncut cell contents, and a display width of 1000.
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.max_colwidth': None,
    'display.width': 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

contexts

In [None]:
# Flatten the last row of contexts into one string: every column value, separated by spaces.
last_analysis = contexts.iloc[-1]
context = " ".join(last_analysis[column] for column in last_analysis.keys())

In [None]:
# contexts was built from c[3:10], so row 1 of contexts describes conversation c[1 + 3].
message = c[1 + 3]
analysis = contexts.iloc[1]
context = " ".join(analysis[column] for column in analysis.keys())

# Ask the model to continue the conversation as alzter, using that conversation's
# analysis as context, and print the generated text.
next_message_prompt = gen_prompt(
    message,
    "Read the following conversation history and predict alzter's next message. Start your response with 'alzter'.",
    context=context,
)
print(model.generate(next_message_prompt, temperature=1).text)