# Preprocessing WhatsApp Chat Data

This notebook provides a step-by-step guide to preprocess WhatsApp chat data for further analysis or model training.

## 1. Removing LRM Characters

WhatsApp inserts invisible LRM (Left-to-Right Mark, U+200E) characters into some messages. We remove them so that every line of the export has a consistent format.

In [None]:
def remove_lrm(input_file, output_file):
    """Copy a UTF-8 text file, dropping every Left-to-Right Mark (U+200E).

    Args:
        input_file: Path of the file to read.
        output_file: Path of the cleaned copy to write (overwritten if present).
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    with open(output_file, 'w', encoding='utf-8') as target:
        target.write(raw_text.replace('\u200E', ''))

# Raw WhatsApp export in, LRM-free copy out.
input_filename, output_filename = '_chat.txt', 'LRM_removed.txt'

remove_lrm(input_filename, output_filename)
print(f"LRM characters removed from '{input_filename}' and saved to '{output_filename}'.")

## 2. Combining Multi-line Messages

Some messages span multiple lines in the chat file. This section combines those messages into a single line for each message.

In [None]:
def one_liner(input_file, output_file):
    """Collapse each multi-line chat message onto a single line.

    A line that starts with "[" begins a new message. Any other non-blank
    line is treated as a continuation of the message before it and is
    appended to that message, separated by a single space. Blank lines are
    dropped. The messages are written one per line, with no trailing newline
    after the last one.

    Args:
        input_file: Path of the UTF-8 chat file to read.
        output_file: Path of the UTF-8 file to write (overwritten if present).
    """
    # The encoding is explicit so that non-ASCII chat text (emoji, umlauts)
    # does not depend on the platform's locale default.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    output_lines = []
    current_parts = []  # stripped lines belonging to the message being built

    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if line.startswith("[") and current_parts:
            # A new message begins: flush the one collected so far.
            output_lines.append(" ".join(current_parts))
            current_parts = [stripped]
        else:
            current_parts.append(stripped)

    if current_parts:
        output_lines.append(" ".join(current_parts))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))


# Run the multi-line merge over the LRM-free export.
input_file, output_file = "LRM_removed.txt", "one_liner.txt"
one_liner(input_file, output_file)

## 3. Grouping Messages into Conversations

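Using timestamps, we can group messages into conversations: a new conversation starts whenever the gap between two consecutive messages exceeds a threshold. Adjust `max_time_gap_seconds` (set to 1800 seconds, i.e. 30 minutes, below) to change how messages are grouped. Conversations that contain only a single message are discarded.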

In [None]:
from datetime import datetime, timedelta

def parse_timestamp(timestamp_str, fmt="%y.%m.%d, %H:%M:%S"):
    """Parse the timestamp text found between "[" and "]" of a chat line.

    Args:
        timestamp_str: Timestamp text without the surrounding brackets.
        fmt: ``strptime`` format of the timestamp. The default reads the date
            as two-digit year, month, day.
            NOTE(review): WhatsApp date order depends on the export locale;
            confirm the default matches your export.

    Returns:
        The parsed naive ``datetime``.

    Raises:
        ValueError: If ``timestamp_str`` does not match ``fmt``.
    """
    return datetime.strptime(timestamp_str, fmt)


def group_messages_into_conversations(messages, max_time_gap_seconds):
    """Split chat lines into conversations wherever the time gap is too large.

    Args:
        messages: Chat lines. A line containing "[" starts an entry; if that
            line has no "]", following lines are joined onto it up to and
            including the first line that contains "]". Lines outside an
            entry are ignored, and an entry that never reaches a "]" is
            dropped.
        max_time_gap_seconds: A new conversation starts when an entry is more
            than this many seconds later than the entry before it.

    Returns:
        A list of conversations, each a non-empty list of entry strings in
        their original order. Empty input yields an empty list.

    Raises:
        ValueError: If two or more entries exist and the text between "[" and
            "]" of one of them is not a valid timestamp.
    """
    # Pass 1: assemble bracketed entries from the raw lines.
    entries = []
    i = 0
    while i < len(messages):
        if "[" in messages[i]:
            message_start = i
            while i < len(messages) and "]" not in messages[i]:
                i += 1
            if i < len(messages):
                entries.append("".join(messages[message_start:i + 1]))
        i += 1

    if not entries:
        return []

    def timestamp_of(entry):
        # Text between the leading "[" and the first "]".
        return parse_timestamp(entry.split("]")[0][1:])

    # Pass 2: one linear sweep, cutting wherever the gap exceeds the limit.
    conversations = []
    current_conversation = [entries[0]]
    # A lone entry has nothing to be compared with, so it is never parsed.
    prev_timestamp = timestamp_of(entries[0]) if len(entries) > 1 else None
    for entry in entries[1:]:
        current_timestamp = timestamp_of(entry)
        # total_seconds() includes whole days; the .seconds attribute would
        # report a gap of "1 day + 1 minute" as just 60 seconds. Negative
        # gaps (out-of-order entries) never start a new conversation.
        time_gap = (current_timestamp - prev_timestamp).total_seconds()
        if time_gap > max_time_gap_seconds:
            conversations.append(current_conversation)
            current_conversation = []
        current_conversation.append(entry)
        prev_timestamp = current_timestamp

    conversations.append(current_conversation)
    return conversations

def remove_single_message_conversations(conversations):
    """Return only the conversations that contain at least two messages."""
    kept = []
    for candidate in conversations:
        if len(candidate) >= 2:
            kept.append(candidate)
    return kept

# Read and write as UTF-8 explicitly so non-ASCII chat text does not depend
# on the platform's locale default encoding.
with open("one_liner.txt", "r", encoding="utf-8") as file:
    messages = file.readlines()

# Messages more than 30 minutes apart start a new conversation.
max_time_gap_seconds = 1800

conversations = group_messages_into_conversations(messages, max_time_gap_seconds)
filtered_conversations = remove_single_message_conversations(conversations)

output_filename = "conversations.txt"

# Each conversation is written as a "Conversation N:" header, one message per
# line, and a 40-dash separator line.
with open(output_filename, "w", encoding="utf-8") as output_file:
    for i, conversation in enumerate(filtered_conversations, start=1):
        output_file.write(f"Conversation {i}:\n")
        for message in conversation:
            output_file.write(message.strip() + "\n")
        output_file.write("-" * 40 + "\n")

print(f"Filtered conversations written to '{output_filename}'.")

## 4. Converting to OpenAI's Format

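To use the data with OpenAI models, we need to convert the WhatsApp format into OpenAI's chat format: a JSONL file with one `{"messages": [...]}` object per conversation, where each message has a `role` and a `content`. This section handles that conversion.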

Important Notes:

- Change `user_name` to the WhatsApp name of the person whose messaging style is going to be simulated.

- Change `"Your Name"` to your actual WhatsApp name.

- You can customize the system prompt by editing the `"You are {user_name}."` string near the end of `convert_conversation`.

In [None]:
import re
import json

def convert_conversation(conversation_lines, user_name, own_name="Your Name"):
    """Convert one conversation's chat lines into an OpenAI chat record.

    Only lines of the form ``[dd.dd.dd, dd:dd:dd] Sender: text`` (``d`` being
    a digit) are used; every other line (headers, separators) is ignored.
    The sender is everything between the timestamp and the first ": ", so
    names with any number of words are accepted.

    Args:
        conversation_lines: The lines of a single conversation.
        user_name: Sender whose messages get the "assistant" role (the person
            being simulated).
        own_name: Sender whose messages get the "user" role. Messages from
            any other sender get the "system" role.

    Returns:
        ``{"messages": [...]}`` with a leading system prompt, or ``None``
        unless both ``user_name`` and at least one other sender wrote a
        message in the conversation.
    """
    role_mapping = {own_name: "user", user_name: "assistant"}
    # Compiled once, outside the per-line loop.
    message_pattern = re.compile(
        r'\[(\d{2}\.\d{2}\.\d{2}, \d{2}:\d{2}:\d{2})\] ([^:]+): (.+)'
    )
    target_spoke = False  # a message from user_name was seen
    other_spoke = False   # a message from anyone else was seen
    messages = []

    for line in conversation_lines:
        # Header and separator lines do not match the pattern, so no separate
        # check is needed; a substring check for "Conversation" would also
        # drop real messages that merely contain that word.
        match = message_pattern.match(line)
        if not match:
            continue

        sender, content = match.group(2, 3)
        if sender == user_name:
            target_spoke = True
        else:
            other_spoke = True

        role = role_mapping.get(sender, "system")
        messages.append({"role": role, "content": content.strip()})

    if not (target_spoke and other_spoke):
        return None

    messages.insert(0, {"role": "system", "content": f"You are {user_name}."})
    return {"messages": messages}

# WhatsApp name of the person whose messages become the "assistant" turns.
user_name = "user name"

# Read and write as UTF-8 explicitly so non-ASCII chat text does not depend
# on the platform's locale default encoding.
with open("conversations.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Split only on the "Conversation N:" header lines. Splitting on the bare
# word "Conversation" would also cut messages that contain that word.
# The first element is the text before the first header and is discarded.
conversations = re.split(r"^Conversation \d+:\n", content, flags=re.MULTILINE)[1:]

with open("conversations.jsonl", "w", encoding="utf-8") as output_file:
    for conversation in conversations:
        conversation_lines = conversation.strip().split("\n")
        result = convert_conversation(conversation_lines, user_name)
        # Conversations that convert_conversation rejects (None) are skipped.
        if result:
            output_file.write(json.dumps(result) + "\n")


## 5. Splitting Data for Training and Validation

Before training a model, it's essential to split the data into training and validation sets. This ensures that we can evaluate the model's performance on unseen data.

In [None]:
import json
import random

def split_jsonl(input_file, train_output_file, val_output_file, split_ratio=0.9, seed=None):
    """Shuffle the records of a JSONL file and split them into two files.

    Blank lines are ignored. Every record is written back on its own
    newline-terminated line.

    Args:
        input_file: Path of the JSONL file to read.
        train_output_file: Path that receives the first
            ``int(record_count * split_ratio)`` shuffled records.
        val_output_file: Path that receives the remaining records.
        split_ratio: Fraction of the records that goes to the training file.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        # Strip the line endings here and re-add them on output: a final line
        # without a trailing newline would otherwise fuse with the following
        # record once shuffled away from the end of the list.
        records = [line.rstrip('\n') for line in f if line.strip()]

    if seed is None:
        random.shuffle(records)
    else:
        random.Random(seed).shuffle(records)

    split_index = int(len(records) * split_ratio)

    with open(train_output_file, 'w', encoding='utf-8') as train_f:
        train_f.writelines(record + '\n' for record in records[:split_index])

    with open(val_output_file, 'w', encoding='utf-8') as val_f:
        val_f.writelines(record + '\n' for record in records[split_index:])

# File names for the full dataset and for the two splits derived from it.
input_jsonl_file = "conversations.jsonl"
train_output_jsonl = "finetune_training.jsonl"
val_output_jsonl = "finetune_validation.jsonl"
split_ratio = 0.9

split_jsonl(
    input_jsonl_file,
    train_output_jsonl,
    val_output_jsonl,
    split_ratio=split_ratio,
)
print("Split complete. Training and validation JSONL files created.")