
Reading files

We will read the .txt files line by line and apply these filters:

    Remove lines containing a WhatsApp encryption notice
        ❌ Before: dd/mm/yyyy, hh:mm - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
        ✅ After: (Removed)

    Remove lines with <Media omitted>
        ❌ Before: dd/mm/yyyy, hh:mm - Person: <Media omitted>
        ✅ After: (Removed)

    Remove lines containing email addresses
        ❌ Before: dd/mm/yyyy, hh:mm - Person: example@gmail.com
        ✅ After: (Removed)

    Remove lines containing links
        ❌ Before: dd/mm/yyyy, hh:mm - Person: https://www.example.com/
        ✅ After: (Removed)

    Replace <This message was edited> with an empty string
        ❌ Before: dd/mm/yyyy, hh:mm - Person: hey, how are you? <This message was edited>
        ✅ After: dd/mm/yyyy, hh:mm - Person: hey, how are you?

    Remove lines with the text You deleted this message
        ❌ Before: dd/mm/yyyy, hh:mm - Person: You deleted this message
        ✅ After: (Removed)

    Remove lines with the text null
        ❌ Before: dd/mm/yyyy, hh:mm - Person: null
        ✅ After: (Removed)

    Remove lines with the text created group
        ❌ Before: dd/mm/yyyy, hh:mm - Person created group "group name"
        ✅ After: (Removed)

    Remove lines with the text added you
        ❌ Before: dd/mm/yyyy, hh:mm - Person added you
        ✅ After: (Removed)

    Replace tagging (@person) with an empty string

    ❌ Before: dd/mm/yyyy, hh:mm - Person: @person are you coming?
    ✅ After: dd/mm/yyyy, hh:mm - Person: are you coming?



In [2]:
import re 
import pandas as pd
import os

In [3]:
def read_whatsapp_chat(file_path : str ) -> pd.DataFrame:
    # desfine filtering pattens(remove lines contianing a whatsapp encryption notice)
    encryption_message = "Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more."
    # remove lines with <media ommited>
    media_pattern = "<Media omitted>"
    # remove lines containing email addresss
    email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}'
    # remove lines containg links
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Repalce <This message was edited> with an empty string
    edited_message = "<This message was edited>"
    # Remove lines with the text You deleted this meaasge
    deleted_message = "You deleted this message"
    # Remove lines with  text null
    null_message = "null"
    # Remove lines with the text created group
    created_group_message = "created group"
    # remove lines with text added you 
    added_you_to_group_message = "added you"
    # replace the taaging (@person) with an empty string 
    tagging_pattern = r'@[\w\._%+-]+'
    
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # apply filter to remove uneanted lines
    filtered_lines = []
    for line in lines:
        if (
            encryption_message not in line and
            deleted_message not in line and
            null_message != line.split(" ")[-1] and
            media_pattern not in line and
            created_group_message not in line and
            added_you_to_group_message not in line and
            not re.search(email_pattern, line) and
            not re.search(url_pattern, line)
        ):
            line = line.replace(edited_message, "").strip()
            line = re.sub(tagging_pattern, "", line).strip()
            filtered_lines.append(line)
    # Regular expresssion to match whatsapp message format
    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?:\s?[AP]M)?) - (.*?): (.*?)(?=\n\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?:\s?[AP]M)? -|$)'
    content = '\n'.join(filtered_lines)
    messages = re.findall(pattern, content, re.DOTALL)
    df = pd.DataFrame(messages, columns=['timestamp', 'sender', 'message'])
    
    if not df.empty:
        # Try parsing with multiple formats
        df['timestamp'] = pd.to_datetime(
            df['timestamp'],
            format='%m/%d/%y, %I:%M %p',  # For M/DD/YY, H:MM AM/PM
            errors='coerce'
        )
        # Try alternative format if parsing fails
        df['timestamp'] = df['timestamp'].combine_first(
            pd.to_datetime(
                df['timestamp'],
                format='%d/%m/%Y, %H:%M',  # For DD/MM/YYYY, HH:MM
                errors='coerce'
            )
        )
    return df


The all_chats dictionary holds the content of each file as a dataframe with three columns: timestamp, sender, and message.

In [4]:
from pathlib import Path

all_chats = {}
data_directory = Path("../Data/ptivate")
for file in data_directory.glob('*.txt'):
    file_name = file.stem
    all_chats[file_name] = read_whatsapp_chat(file)



Text sequence

The text should be merged into a single sequence to prepare it for the next step, where the BPE algorithm will be applied and the text will be encoded.


In [6]:
text_sequence = ""
for file_name, df in all_chats.items():
    if not df.empty:
        text_sequence += " ".join(df['message'].dropna().values) + " "

print(f"Total combined characters: {len(text_sequence)}")

# Save to file
output_path = "../output"
os.makedirs(output_path, exist_ok=True)

with open(os.path.join(output_path, "combined_text.tex"), "w", encoding='utf-8') as f:
    f.write(text_sequence)


Total combined characters: 12379
