## Reading files

In [None]:
import re
import pandas as pd

def read_whatsapp_chat(file_path: str) -> pd.DataFrame:
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Only lines that begin with a date stamp (optionally wrapped in square
    brackets) are considered; continuation lines of multi-line messages are
    discarded. A line is dropped entirely when it:

    * matches one of the system-notice phrases, a "... omitted" media
      placeholder, or a call notice (all case-insensitive),
    * contains an e-mail address or an ``http(s)://`` URL,
    * ends with ``' null'`` after cleaning, or
    * has no ``sender: message`` part after the timestamp.

    From the remaining lines the ``<This message was edited>`` marker and
    ``@mention`` tokens are removed before the line is split into fields.

    Args:
        file_path: Path to the exported chat file. It is read as UTF-8; a
            leading byte-order mark, if present, is ignored.

    Returns:
        A DataFrame with columns ``timestamp`` (parsed with
        ``pd.to_datetime(..., errors='coerce')``), ``sender`` and
        ``message``. Rows whose timestamp could not be parsed are dropped.
    """
    # --- LINES TO DROP (matched case-insensitively anywhere in the line) ---
    system_patterns = [
        r'end-to-end encrypted',
        r'You deleted this message',
        r'created group',
        r'added you',
        r'pinned a message',
        r'Click to call back',
    ]
    omitted_pattern = r'(image omitted|video omitted|sticker omitted|GIF omitted|audio omitted|video note omitted|voice note omitted|document omitted|<Media omitted>)'
    call_pattern = r'(Video call|Missed video call|Voice call)'
    skip_re = re.compile(
        '|'.join([*system_patterns, omitted_pattern, call_pattern]),
        re.IGNORECASE,
    )

    # --- CONTENT PATTERNS ---
    # NOTE: raw strings need a *single* backslash per regex escape; a doubled
    # backslash would match a literal backslash character instead.
    email_re = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
    url_re = re.compile(r'http[s]?://\S+')
    tagging_re = re.compile(r'@[\w]+')
    edited_tag = '<This message was edited>'

    # A kept line must start with "d/m/y," optionally preceded by "[".
    timestamp_start_re = re.compile(r'^\[?\d{1,2}/\d{1,2}/\d{2,4},')

    # Full message layout:
    #   [timestamp] sender: text      (bracketed style)
    #   timestamp - sender: text      (dash style)
    # The brackets and the "-"/"~" separator are all optional.
    message_re = re.compile(
        r'^\[?'
        r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?::\d{2})?(?:\s?[APap][Mm])?)'
        r'\]?\s?(?:-|~)?\s?'
        r'(.*?): (.*)$'
    )

    # Invisible directional / spacing marks that exports embed in lines.
    # U+202F (narrow no-break space) can sit between the time and AM/PM;
    # message_re tolerates its removal via the optional ``\s?``.
    invisible = dict.fromkeys(
        map(ord, '\u200e\u200f\u202f\u2068\u2069\u200b')
    )

    records = []

    # 'utf-8-sig' decodes plain UTF-8 identically but also swallows a BOM,
    # which would otherwise stop the first line from matching the timestamp.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.translate(invisible).strip()

            # Skip blanks and anything that is not the start of a message.
            if not line or not timestamp_start_re.match(line):
                continue

            # Skip system notices, media placeholders and call notices.
            if skip_re.search(line):
                continue

            # Skip lines containing e-mail addresses or URLs.
            if email_re.search(line) or url_re.search(line):
                continue

            # Remove the edited marker and @mentions.
            line = line.replace(edited_tag, '')
            line = tagging_re.sub('', line)

            # Skip lines whose content is a trailing "null".
            if line.endswith(' null'):
                continue

            # Parse each line on its own so a line without a "sender: " part
            # (e.g. a join/leave notice) cannot bleed into the next message.
            match = message_re.match(line.strip())
            if match:
                records.append(match.groups())

    df = pd.DataFrame(records, columns=['timestamp', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Drop rows whose timestamp failed to parse.
    df = df.dropna(subset=['timestamp'])

    return df
