# WhatsApp Chat Analysis

This notebook analyzes WhatsApp chat exports to extract insights from conversations.

In [1]:
import pandas as pd
import os
import re
from datetime import datetime

In [2]:
def extract_chat_name(filename):
    """Extract the chat name from the filename.

    Args:
        filename (str): The filename of the WhatsApp chat export

    Returns:
        str: The name of the chat (without 'WhatsApp Chat with ' prefix)
    """
    # Remove 'WhatsApp Chat with ' prefix and '.txt' extension
    chat_name = filename.replace('WhatsApp Chat with ', '').replace('.txt', '')
    return chat_name

In [3]:
def parse_whatsapp_message(line):
    """Parse a single line from a WhatsApp chat export into its components.

    Args:
        line (str): A line from the WhatsApp chat export

    Returns:
        dict: A dictionary containing the parsed components (datetime, sender, message)
        or None if the line couldn't be parsed
    """
    # Regular expression to match WhatsApp message format
    pattern = r'^(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}) - ([^:]+): (.+)$'

    match = re.match(pattern, line)
    if match:
        datetime_str, sender, message = match.groups()
        try:
            # Parse datetime string
            dt = datetime.strptime(datetime_str, '%d/%m/%Y, %H:%M')
            return {
                'datetime': dt,
                'sender': sender.strip(),
                'message': message.strip(),
                'message_type': 'Media omitted' if message.strip() == '<Media omitted>' else 'text'
            }
        except ValueError:
            return None
    return None

In [4]:
def read_whatsapp_chat(filepath):
    """Read and parse a WhatsApp chat export file.

    Args:
        filepath (str): Path to the WhatsApp chat export file

    Returns:
        list: List of dictionaries containing parsed messages
    """
    messages = []
    chat_name = extract_chat_name(os.path.basename(filepath))

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip empty lines
            if not line.strip():
                continue

            parsed = parse_whatsapp_message(line)
            if parsed:
                parsed['chat'] = chat_name
                messages.append(parsed)

    return messages

In [5]:
def process_all_chats(directory):
    """Process all WhatsApp chat export files in a directory.

    Args:
        directory (str): Path to directory containing WhatsApp chat exports

    Returns:
        pandas.DataFrame: DataFrame containing all messages from all chats
    """
    all_messages = []

    # Get all .txt files in the directory
    chat_files = [f for f in os.listdir(directory) if f.endswith('.txt')]

    for chat_file in chat_files:
        filepath = os.path.join(directory, chat_file)
        messages = read_whatsapp_chat(filepath)
        all_messages.extend(messages)

    # Convert to DataFrame
    df = pd.DataFrame(all_messages)

    # Add additional useful columns
    df['date'] = df['datetime'].dt.date
    df['time'] = df['datetime'].dt.time
    df['day_of_week'] = df['datetime'].dt.day_name()
    df['hour'] = df['datetime'].dt.hour
    df['month'] = df['datetime'].dt.month
    df['year'] = df['datetime'].dt.year

    return df

In [7]:
# Process all chats
messages_dir = './messages'
df = process_all_chats(messages_dir)

# Display basic information about the dataset
print(f"Total number of messages: {len(df)}")
print(f"\nNumber of messages per chat:")
print(df['chat'].value_counts())
print(f"\nNumber of messages per sender:")
print(df['sender'].value_counts().head())
print(f"\nDate range: from {df['datetime'].min()} to {df['datetime'].max()}")

Total number of messages: 85902

Number of messages per chat:
chat
Mikkel Allison                        14642
Nyakundi’s                             5795
Gideon_2                               5339
Ainslan Myers                          5196
Old humble abode                       4783
                                      ...  
+1 (301) 523-7477                        10
Tofu Veganers                             8
Andy JP Morgan Manager                    6
+44 7940 376134                           5
Temp Maid Right Cleaning Liverpool        2
Name: count, Length: 103, dtype: int64

Number of messages per sender:
sender
Bosire                32011
Mikkel Allison        12707
Gideon                 4584
Caroline               3796
Mr Stenneth Karbah     3487
Name: count, dtype: int64

Date range: from 2017-01-11 21:29:00 to 2025-02-11 07:43:00


In [8]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,datetime,sender,message,message_type,chat,date,time,day_of_week,hour,month,year
0,2024-07-21 00:45:00,Bosire,"Hey Guys, hope you're doing well! Created this...",text,🏡61 Adelaide - Tenants & Management,2024-07-21,00:45:00,Sunday,0,7,2024
1,2024-07-21 00:46:00,Beth Adelaide Road Liverpool Tenant,Thank you and hope your doing too,text,🏡61 Adelaide - Tenants & Management,2024-07-21,00:46:00,Sunday,0,7,2024
2,2024-07-28 13:33:00,Bosire,"Hey guys , there should be people coming for a...",text,🏡61 Adelaide - Tenants & Management,2024-07-28,13:33:00,Sunday,13,7,2024
3,2024-07-28 13:33:00,Bosire,https://calendar.app.google/WKNJy2E7sHsPZqVm6,text,🏡61 Adelaide - Tenants & Management,2024-07-28,13:33:00,Sunday,13,7,2024
4,2024-07-28 13:33:00,Bosire,To install a smart reader so you guys don't ha...,text,🏡61 Adelaide - Tenants & Management,2024-07-28,13:33:00,Sunday,13,7,2024
