In [None]:
import sys
sys.path.append("../src/")
from utils import LocalPLM, LocalModelArguments

In [None]:
# Configure and load the local language model used by the analysis cells below.
# NOTE(review): the option meanings are inferred from their names only — confirm
# against utils.LocalModelArguments. They appear to request 4-bit NF4 quantisation
# with float16 compute and nested quantisation on CUDA device "0".
args = LocalModelArguments(
    model_name_or_path = "microsoft/Phi-4-mini-instruct",
    cuda_devices = "0",
    use_4bit_quantization = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = "float16",
    use_nested_quant = True,
    use_reentrant = True
)

# `model` is read as a global by every function in this notebook that calls model.generate().
model = LocalPLM(args)

In [None]:
# Directory (relative to this notebook) that is searched for exported conversation CSV files.
DATA_PATH = "../discord-chat"

In [None]:
from datetime import timedelta, datetime
import pandas as pd
import numpy as np

def parse_discord_conversation(c : pd.DataFrame) -> pd.DataFrame:
    """
    Clean a raw Discord export and annotate each message with the time elapsed since the previous one.

    Rows with no "Content" are removed *before* the delays are computed, so "Delay" always
    measures the gap between two consecutive messages that are actually kept. (Computing it
    first would lose a long gap whenever the message following the gap had no text.)
    The input frame is left unmodified.

    Args:
        c (DataFrame): Raw export with a "Date" column of strings in the format
            '%Y-%m-%dT%H:%M:%S.%f0%z' and a "Content" column.

    Returns:
        conversation (DataFrame): New frame without content-less rows, indexed from 0, with "Date"
            parsed into timezone-aware datetimes and an added "Delay" column (zero for the first row).

    Raises:
        ValueError: If a "Date" value does not match the expected format.
    """
    def parse_time_str(time : str) -> datetime:
        # The export carries seven fractional digits: "%f" consumes at most six
        # and the literal "0" in the format absorbs the seventh.
        return datetime.strptime(time,'%Y-%m-%dT%H:%M:%S.%f0%z')

    # dropna() and reset_index() both return new objects, so the caller's frame is never touched.
    parsed = c.dropna(subset=["Content"]).reset_index(drop=True)

    parsed["Date"] = parsed["Date"].map(parse_time_str)
    parsed["Delay"] = parsed["Date"] - parsed["Date"].shift(1)
    # The first message has no predecessor; treat its delay as zero.
    parsed["Delay"] = parsed["Delay"].fillna( timedelta(seconds=0) )
    return parsed

def parse_discord_conversation_csv(path : str) -> pd.DataFrame:
    """
    Read a Discord conversation export from a CSV file and parse it.

    Args:
        path (str): Location of the CSV file.

    Returns:
        conversation (DataFrame): The result of parse_discord_conversation() on the file's contents.
    """
    return parse_discord_conversation(pd.read_csv(path))

def get_chat(user : str, path : str) -> pd.DataFrame:
    """
    Load and parse the exported direct-message history with a given user.

    Searches `path` for a file whose name starts with "Direct Messages - <user> [" and
    ends with ".csv". If several files match, the alphabetically first one is used so
    the choice is deterministic.

    Args:
        user (str): Name of the other user as it appears in the export's file name.
        path (str): Directory containing the exported CSV files.

    Returns:
        conversation (DataFrame): The parsed conversation.

    Raises:
        FileNotFoundError: If no matching file exists.
    """
    # Imported here so the function does not depend on a later notebook cell having run.
    import glob

    prefix = path + "/Direct Messages - " + user + " ["

    # Escape the fixed part so "[", "*" or "?" in the directory or user name are
    # matched literally; only the trailing "*" acts as a wildcard.
    matches = sorted(glob.glob(glob.escape(prefix) + "*.csv"))

    if not matches:
        raise FileNotFoundError(f"No conversation(s) found at path {prefix}*.csv")

    return parse_discord_conversation_csv(matches[0])

In [None]:
import glob

def split_by_conversations(messages : pd.DataFrame, gap_mins : int = 50, min_conv_length : int = 5) -> list[pd.DataFrame]:
    """
    Split a Discord conversation history into a list of shorter conversations separated by gap_mins minutes.

    The input frame is not modified. Each returned conversation is indexed from 0 and carries
    a boolean "Start" column that is True on its first message.

    Args:
        messages (DataFrame): Discord conversation history in chronological order, with a "Delay"
            column holding the time elapsed since the previous message.
        gap_mins (int): How many minutes must have elapsed since the last message for the current
            message to be treated as the start of a new conversation. Defaults to 50.
        min_conv_length (int): If a conversation has fewer messages than this, don't include it in the list.

    Returns:
        conversations (list[DataFrame]): List of all conversations ordered from least to most recent.
    """
    # Work on a re-indexed copy: the grouping below is positional, and the caller's
    # frame should not gain a "Start" column as a side effect.
    history = messages.reset_index(drop=True)
    if history.empty:
        return []

    # A message opens a new conversation when it arrives after a long enough silence.
    max_delay = timedelta(minutes = gap_mins)
    starts = history["Delay"] > max_delay
    # The very first message always opens a conversation.
    starts.iloc[0] = True
    history["Start"] = starts

    # The running count of starts gives every message the number of the conversation it
    # belongs to. Grouping on it keeps the final conversation too, which has no later
    # start to close it off.
    conversation_ids = starts.cumsum().to_numpy()

    conversations = []
    for _, conversation in history.groupby(conversation_ids, sort=True):
        if len(conversation) < min_conv_length: continue
        conversations.append(conversation.reset_index(drop=True))

    return conversations

In [None]:
# List every CSV file found directly inside DATA_PATH.
glob.glob(DATA_PATH + "/*.csv")

In [None]:
# Load the direct-message history with "Ben D" from DATA_PATH ...
c = get_chat("Ben D", DATA_PATH)
# ... then rebind `c` to the list of conversations split at gaps of more than 120 minutes.
c = split_by_conversations(c, 120)

In [None]:
def get_header(messages : pd.DataFrame, target_user : str | None = None) -> str:
    """
    Return statistics about a Discord conversation (i.e., users involved, time)
    """
    end_time = messages.iloc[-1].Date

    users = messages.Author.unique()
    if target_user:
        if not target_user in users:
            users = np.insert(users, 0, target_user)
            
    string = "Conversation between " + ", ".join(users) + "."
        
    string += "\nObtained " + end_time.strftime("%y/%m/%d, %H:%M:%S") + "."

    return string

def to_string(messages : pd.DataFrame, context : str | None = None, header : bool = True, target_user : str | None = None) -> str:
    """
    Convert a Discord conversation history from DataFrame into a raw string.
    """
    string = ""
    
    if header:
        string += get_header(messages, target_user=target_user)

    if context: string += "\nContext of the conversation:\n" + context
    
    for i, message in messages.iterrows():
        string += f"\n\n{message.Author} {message.Date.strftime("%H:%M:%S")}"
        string += f"\n{message.Content}"
    
    return string.strip()

In [None]:
from typing import Literal
def gen_prompt(messages : pd.DataFrame, prompt : str, context : str | None = None, context_role : Literal["system", "user"] = "system") -> list[dict[str, str]]:
    """
    Generate a Chat Template prompt to perform NLP tasks on a Discord conversation history.

    Args:
        messages (DataFrame): The conversation history.
        prompt (str): The system prompt to give the LLM.
        context (str, optional): Optional additional information related to the conversation. If provided, aids LLM performance.
        context_role (Literal["system", "user"]) : Whether to append the context to the system prompt or the conversation history. Adding context to the system prompt usually yields better results. Defaults to "system".

    Returns:
        chat (list[dict[str, str]]): Two messages with "role" and "content" keys: the system
            prompt, followed by a user message holding the conversation rendered by to_string().
    """
    # With context_role == "user" the context is embedded in the conversation text itself.
    history = to_string(messages, context= context if context_role == "user" else None)

    # With context_role == "system" the context is appended to the instruction instead.
    system_prompt = prompt
    if context_role == "system" and context:
        system_prompt += f"\nContext: {context}.\nAnswer concisely."

    return [
        {"role":"system","content":system_prompt},
        {"role":"user","content":history},
    ]

In [None]:
def understand_conversation(messages : pd.DataFrame, target_user : str, context : str | None = None, context_role : Literal["system", "user"] = "system") -> dict:
    """
    Analyse a Discord conversation history from a third-person perspective using the language model.

    Three questions are put to the model in turn, each answer feeding the next:
        1. Interest: how interested the target user sounds (asked with the caller's context).
        2. Relationship: the relationship between the users (asked with the caller's context
           followed by the interest answer).
        3. Topic: what was discussed (asked with the relationship and interest answers as context).

    Args:
        messages (DataFrame): The conversation history.
        target_user (str): Which user to focus on when analysing the conversation.
        context (str, optional): Optional additional information related to the conversation.
        context_role (Literal["system", "user"]) : Whether the context is appended to the system prompt
            or to the conversation history (see gen_prompt). Defaults to "system".

    Returns:
        understanding (dict): The model's answers under the keys "interest", "relationship" and "topic".
    """
    def ask(instruction : str, extra_context : str | None, **generation_options):
        # Build the chat prompt for one question and return the text of the model's answer.
        chat = gen_prompt(messages, instruction, context=extra_context, context_role=context_role)
        return model.generate(chat, **generation_options).text

    # Step 1: interest of the target user.
    interest = ask(
        f"Read the following conversation and tell me how interested {target_user} sounds in it. Be succinct.",
        context,
        max_new_tokens=64,
    )

    # Step 2: relationship, with the interest answer added to whatever context the caller gave.
    relationship_context = context + ", " + interest if context else interest
    relationship = ask(
        "Read the following conversation history and tell me what you think the relationship is between the users. Answer succinctly.",
        relationship_context,
        temperature=1,
        max_new_tokens=128,
    )

    # Step 3: topic, informed by both previous answers.
    topic = ask(
        "Read the following conversation history and tell me what was discussed. Answer succinctly.",
        relationship + ", " + interest,
        temperature=1,
        max_new_tokens=128,
    )

    return {"interest":interest,"relationship":relationship,"topic":topic}

In [None]:
def understand_conversation_pov(messages : pd.DataFrame, target_user : str, context : str | None = None, context_role : Literal["system", "user"] = "system") -> dict:
    """
    Analyse a Discord conversation history in first person, from the point of view of one participant.

    The model is told it is the target user and is asked three questions in turn, each answer feeding the next:
        1. Interest: how interested it was during the conversation (asked with the caller's context).
        2. Relationship: its relationship with the other users (asked with the caller's context
           followed by the interest answer).
        3. Topic: what the conversation was about (asked with the relationship and interest answers as context).

    Args:
        messages (DataFrame): The conversation history.
        target_user (str): The user whose perspective the model adopts.
        context (str, optional): Optional additional information related to the conversation.
        context_role (Literal["system", "user"]) : Whether the context is appended to the system prompt
            or to the conversation history (see gen_prompt). Defaults to "system".

    Returns:
        understanding (dict): The model's answers under the keys "interest", "relationship" and "topic".
    """
    # Names of everyone in the conversation except the target user, e.g. "A and B".
    everyone = messages.Author.unique()
    other_users = " and ".join([name for name in everyone if not name == target_user])

    def ask(instruction : str, extra_context : str | None, **generation_options):
        # Build the chat prompt for one question and return the text of the model's answer.
        chat = gen_prompt(messages, instruction, context=extra_context, context_role=context_role)
        return model.generate(chat, **generation_options).text

    # Step 1: the target user's own interest.
    interest = ask(
        f"Your name is {target_user}. Read one of your past text conversations with {other_users} and tell me how interested you were during it. Respond with first person perspective. Be succinct.",
        context,
        max_new_tokens=64,
    )

    # Step 2: relationship, with the interest answer added to whatever context the caller gave.
    relationship_context = context + ", " + interest if context else interest
    relationship = ask(
        f"Your name is {target_user}. Read one of your past text conversations with {other_users} and tell me what your relationship is with them. Respond with first person perspective. Be succinct.",
        relationship_context,
        temperature=1,
        max_new_tokens=128,
    )

    # Step 3: topic, informed by both previous answers.
    topic = ask(
        f"Your name is {target_user}. Read one of your past text conversations with {other_users} and tell me what you were talking about. Respond with first person perspective. Be succinct.",
        relationship + ", " + interest,
        temperature=1,
        max_new_tokens=128,
    )

    return {"interest":interest,"relationship":relationship,"topic":topic}

In [None]:
def understanding_to_string(understanding : dict, other_users : str) -> str:
    """
    Render a conversation analysis as first-person text.

    Args:
        understanding (dict): Analysis with the keys "topic", "relationship" and "interest".
        other_users (str): Name(s) of the other participants, inserted into the relationship heading.

    Returns:
        text (str): Three headed sections (topic, relationship, interest) separated by blank lines.
    """
    topic = understanding["topic"]
    relationship = understanding["relationship"]
    interest = understanding["interest"]

    sections = (
        f"Conversation topic:\n{topic}",
        f"My relationship with {other_users}:\n{relationship}",
        f"My interest in the conversation:\n{interest}",
    )
    return "\n\n".join(sections)

In [None]:
def predict_thought(conversation : pd.DataFrame, message_id : int, context : str | None = None, tokens : int = 128):
    """
    Ask the language model what the author of a message was thinking as they wrote it.

    Args:
        conversation (DataFrame): The conversation containing the message.
        message_id (int): Position of the target message within the conversation. Messages after
            it are discarded so the model only sees what preceded the message.
        context (str, optional): Background information to give the model. When omitted (or empty),
            a first-person analysis of the conversation so far is generated with
            understand_conversation_pov() and used instead.
        tokens (int): Maximum number of new tokens to generate for the thought. Defaults to 128.

    Returns:
        The text of the model's answer.
    """
    # Keep only the messages up to and including the target message.
    conversation = conversation.reset_index(drop=True)[:message_id + 1]
    target_message = conversation.iloc[message_id]
    target_text = target_message.Content
    target_user = target_message.Author
    other_users = " and ".join([i for i in conversation.Author.unique() if not i == target_user])

    # Respect the caller's context. Only fall back to analysing the conversation when none
    # was supplied, and render that analysis as text rather than passing the raw dict on
    # (which would end up in the prompt as a Python dict repr).
    if not context:
        understanding = understand_conversation_pov(conversation, target_user)
        context = understanding_to_string(understanding, other_users)

    thought_prompt = f"""
Your name is {target_user}. You are in a text conversation with {other_users}.
Read the conversation, then tell me what you are thinking as you say:
'{target_text}'. Answer in first-person tense. Be succinct.""".strip()

    thought_prompt = gen_prompt(conversation, thought_prompt, context=context)

    predicted_thought = model.generate(thought_prompt, temperature=1, max_new_tokens=tokens).text

    return predicted_thought

In [None]:
from tqdm.notebook import tqdm

def conversation_to_dataset(conversation : pd.DataFrame, target_user : str, batch_size : int = 10, thinking_tokens : int = 0) -> pd.DataFrame:
    """
    Convert a Discord conversation into a supervised chat dataset from the perspective of a given user.
    This can be used to predict messages from a given user (i.e., training a model to impersonate you).

    Conversations are split up into smaller batches to reduce the size of each input text.
    At the start of each new batch, a summarisation of the previous batch's conversation is given as context.
    This helps eliminate loss of semantic meaning when slicing conversations into chunks of arbitrary size.

    Optionally, you can allow an LLM to guess what the target user was thinking for each message.
    This feature is aimed to improve LLM response precision by getting in the head of the target user.

    Each generated sample is also printed as it is produced.

    Args:
        conversation (DataFrame): The conversation to convert.
        target_user (str): Which user we're trying to predict the messages of.
        batch_size (int, optional): Maximum number of new input messages per sample. Defaults to 10.
        thinking_tokens (int, optional): If > 0, predicts the thoughts of the target user for each message using a given number of tokens. Defaults to 0.

    Returns:
        dataset (DataFrame): One row per message written by the target user, with the preceding
            messages of the batch in "content" and the user's message (optionally prefixed by a
            <thinking> block) in "label".
    """
    data = {"content" : [], "label" : []}

    # Batches are addressed by position below, so make index labels and positions coincide.
    conversation = conversation.reset_index(drop=True)
    positions = list(conversation.index)

    # Slice positions into batches / chunks
    chunks = [positions[i:i + batch_size] for i in range(0, len(positions), batch_size)]

    # If we want to predict the target user's thoughts for each message, first gauge the
    # relationship between the users over the whole conversation. It is used as context
    # for every thought prediction to improve its accuracy.
    if thinking_tokens > 0:
        full_context = understand_conversation_pov( conversation, target_user=target_user )["relationship"]

    # No summary of a previous batch exists yet
    context_str = None

    for i, chunk in enumerate(tqdm(chunks, "Parsing conversation batches", position=0)):
        start_index = chunk[0]
        batch = conversation.iloc[chunk]

        # Positions of the target user's messages within this batch
        user_indices = list(batch[batch.Author == target_user].index)

        for index in tqdm(user_indices, "Parsing messages", position=1):
            # Inputs are the messages of the batch that precede the user's message. When the
            # user's message opens the batch, fall back to the single message before it,
            # unless it is the very first message of the conversation (nothing precedes it).
            s = start_index
            if index == start_index and start_index > 0: s -= 1
            inputs = conversation.iloc[s:index]

            # A header needs at least one message to take its timestamp from.
            inputs = to_string(inputs, header=not inputs.empty, context=context_str, target_user=target_user)

            # Get the user message itself as a string
            output = conversation.iloc[index:index+1]
            output = to_string(output, header=False, target_user=target_user)

            # Get the user's thought for the given message
            if thinking_tokens > 0:

                # Position of the user's message relative to the start of the batch,
                # because predict_thought() only receives the batch.
                local_index = index - start_index

                # Context for the thought: the users' relationship plus, when available,
                # the summary of the previous batch.
                thinking_context = full_context + "\n" + context_str if context_str else full_context

                # Predict the user's thought for the message
                thought = predict_thought( batch, local_index, thinking_context, tokens=thinking_tokens)

                # Enclose the thought in <thinking> tags
                output = f"<thinking>{thought}</thinking>\n\n" + output

            data['content'].append(inputs)
            data['label'].append(output)

            print("\n----\nIN:")
            print(inputs)
            print("\n----\nOUT:")
            print(output)
            print("----")

        # At the end of each batch, summarise what was discussed
        # to use as the context string for the next batch.
        # (Only do this if there are more chunks remaining)
        if i < len(chunks) - 1:
            context_str = understand_conversation_pov( batch, context=context_str, target_user=target_user )["topic"]

    return pd.DataFrame(data)

In [None]:
# Build a dataset from the first conversation in `c`, predicting the messages of "alzter"
# in batches of 10 and generating a thought of up to 128 tokens for each message.
conversation_to_dataset(c[0], "alzter", batch_size=10, thinking_tokens=128)

In [None]:
from tqdm.notebook import tqdm

def understand_conversations(conversations : list[pd.DataFrame], target_user : str) -> pd.DataFrame:
    """
    Analyse the meaning of a series of Discord conversations sequentially.

    Each conversation is analysed with understand_conversation(). From the second conversation
    onwards, the topic found for the previous conversation is supplied as context, so that
    meaning is built up across the series.

    Args:
        conversations (list[DataFrame]): Conversations in the order they should be analysed.
        target_user (str): Which user to focus on when analysing each conversation.

    Returns:
        understandings (DataFrame): One row per conversation with the columns "interest",
            "relationship" and "topic". The columns are present even when `conversations` is empty.
    """
    # One analysis dict per conversation
    contexts = []

    # There is no previous conversation to draw context from at the start.
    context_str = None

    for conversation in tqdm(conversations, "Understanding conversations"):
        context = understand_conversation(conversation, target_user=target_user, context=context_str)
        contexts.append(context)

        # Give the next conversation the summarised topic of this one as context.
        # (Single quotes inside the f-string keep this valid on Python versions before 3.12.)
        context_str = f"Previous discussion: {context['topic']}"

    # A list of dicts converts to a frame directly; naming the columns keeps the
    # schema stable for an empty input.
    return pd.DataFrame(contexts, columns=["interest", "relationship", "topic"])