In [4]:
from pathlib import Path
from loguru import logger
import pandas as pd
from datetime import datetime

# Folder with the preprocessed chat exports, relative to this notebook.
processed = Path("../../data/processed")
# The export to work on; the timestamp in the name has to match the file on disk.
datafile = processed.joinpath("whatsapp-20241023-154747.csv")
# Only a warning is logged here; execution continues either way.
if not datafile.exists():
    logger.warning(
        "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    )

Read in the file

In [5]:
# Parse the timestamp column while reading, so it is a datetime rather than a string.
df = pd.read_csv(datafile, parse_dates=["timestamp"])
# Every row except the last one; the truncated display then shows both ends of the frame.
df.iloc[:-1]

Unnamed: 0,timestamp,author,message
0,2024-05-04 13:57:55,Latina Wifey,Latina Wifey: ‎Messages and calls are end-to-e...
1,2024-05-04 13:57:57,Joppe Montezinos,Joppe Montezinos: In the train omw
2,2024-05-04 15:01:15,Joppe Montezinos,Joppe Montezinos: I think we will be back at 6
3,2024-05-04 15:01:29,Joppe Montezinos,Joppe Montezinos: Quinten was late cuz he had ...
4,2024-05-04 15:13:03,Joppe Montezinos,‎ Joppe Montezinos: ‎audio omitted
...,...,...,...
65310,2024-10-23 15:09:28,Latina Wifey,Latina Wifey: and ways to trust/block shoppers
65311,2024-10-23 15:09:40,Joppe Montezinos,Joppe Montezinos: block them if their name inc...
65312,2024-10-23 15:09:52,Joppe Montezinos,Joppe Montezinos: weird spending patterns
65313,2024-10-23 15:09:53,Joppe Montezinos,Joppe Montezinos: 😉


In [6]:


# Remove messages with "sticker omitted" or "image omitted".
# na=False makes a missing message count as "no match", so the mask is strictly
# boolean and can be negated safely.
is_media_placeholder = df['message'].str.contains("sticker omitted|image omitted", na=False)
# .copy() gives an independent frame: later cells add and overwrite columns on `df`,
# which on a plain slice can trigger SettingWithCopyWarning.
df = df[~is_media_placeholder].copy()

df.head(300)

Unnamed: 0,timestamp,author,message
0,2024-05-04 13:57:55,Latina Wifey,Latina Wifey: ‎Messages and calls are end-to-e...
1,2024-05-04 13:57:57,Joppe Montezinos,Joppe Montezinos: In the train omw
2,2024-05-04 15:01:15,Joppe Montezinos,Joppe Montezinos: I think we will be back at 6
3,2024-05-04 15:01:29,Joppe Montezinos,Joppe Montezinos: Quinten was late cuz he had ...
4,2024-05-04 15:13:03,Joppe Montezinos,‎ Joppe Montezinos: ‎audio omitted
...,...,...,...
385,2024-05-06 10:59:09,Latina Wifey,Latina Wifey: maybe its in your spam?
386,2024-05-06 10:59:13,Latina Wifey,Latina Wifey: cause it went to my spam
387,2024-05-06 10:59:36,Joppe Montezinos,Joppe Montezinos: yes got it
388,2024-05-06 10:59:37,Joppe Montezinos,Joppe Montezinos: my bad


Check the datatypes. Note the timestamp type!

In [7]:
df.dtypes


timestamp    datetime64[ns]
author               object
message              object
dtype: object

Sometimes, author names have a tilde in front of them, along with some unicode. Let's clean that.

In [8]:
import re
# A leading "~" followed by U+202F at the very start of the author name.
clean_tilde = r"^~\u202f"
tilde_prefix = re.compile(clean_tilde)
df["author"] = df["author"].map(lambda name: tilde_prefix.sub("", name))

Let's check how many unique authors we have

In [9]:
# Number of distinct authors; dropna=False counts a missing author as its own value.
df["author"].nunique(dropna=False)

2

Let's make the authors anonymous

In [10]:
import sys
sys.path.append("../../src/")
import json
from wa_cleaner.humanhasher import humanize


authors = df.author.unique()
# Map every original author name to its pseudonym.
anon = {name: humanize(name) for name in authors}

# invert the dictionary (pseudonym -> original name):
ref = {alias: name for name, alias in anon.items()}
# sort alphabetically:
ref_sorted = {alias: ref[alias] for alias in sorted(ref)}

# `anon` is keyed by `authors`, so its length can never differ from len(authors).
# What can go wrong is two authors receiving the same pseudonym: they collapse
# into a single entry when the mapping is inverted. Check that, and do it before
# anything is written to disk.
assert len(ref_sorted) == len(authors), "you lost some authors!"

# we save a reference file so we can look up the original author names if we want to
reference_file = processed / "anon_reference.json"
with open(reference_file, "w") as f:
    # save as json:
    json.dump(ref_sorted, f)


In [11]:
# Look up each row's pseudonym and keep it next to the original name for now.
pseudonyms = df["author"].map(anon)
df["anon_author"] = pseudonyms
df.head()

Unnamed: 0,timestamp,author,message,anon_author
0,2024-05-04 13:57:55,Latina Wifey,Latina Wifey: ‎Messages and calls are end-to-e...,giggling-termite
1,2024-05-04 13:57:57,Joppe Montezinos,Joppe Montezinos: In the train omw,cheerful-nightingale
2,2024-05-04 15:01:15,Joppe Montezinos,Joppe Montezinos: I think we will be back at 6,cheerful-nightingale
3,2024-05-04 15:01:29,Joppe Montezinos,Joppe Montezinos: Quinten was late cuz he had ...,cheerful-nightingale
4,2024-05-04 15:13:03,Joppe Montezinos,‎ Joppe Montezinos: ‎audio omitted,cheerful-nightingale


We can now drop the original author column

In [12]:
# Reassign instead of inplace=True: the cell then produces a new frame rather than
# mutating the one that earlier cells displayed.
df = df.drop(columns=["author"])

Check if it's gone

In [13]:
df.head()

Unnamed: 0,timestamp,message,anon_author
0,2024-05-04 13:57:55,Latina Wifey: ‎Messages and calls are end-to-e...,giggling-termite
1,2024-05-04 13:57:57,Joppe Montezinos: In the train omw,cheerful-nightingale
2,2024-05-04 15:01:15,Joppe Montezinos: I think we will be back at 6,cheerful-nightingale
3,2024-05-04 15:01:29,Joppe Montezinos: Quinten was late cuz he had ...,cheerful-nightingale
4,2024-05-04 15:13:03,‎ Joppe Montezinos: ‎audio omitted,cheerful-nightingale


And let's rename the column

In [14]:
# The pseudonym column takes over the name "author"; reassign instead of inplace=True.
df = df.rename(columns={"anon_author": "author"})

In [15]:
df.head()

Unnamed: 0,timestamp,message,author
0,2024-05-04 13:57:55,Latina Wifey: ‎Messages and calls are end-to-e...,giggling-termite
1,2024-05-04 13:57:57,Joppe Montezinos: In the train omw,cheerful-nightingale
2,2024-05-04 15:01:15,Joppe Montezinos: I think we will be back at 6,cheerful-nightingale
3,2024-05-04 15:01:29,Joppe Montezinos: Quinten was late cuz he had ...,cheerful-nightingale
4,2024-05-04 15:13:03,‎ Joppe Montezinos: ‎audio omitted,cheerful-nightingale


In my case, the first line is a header, saying messages are encrypted. Let's remove that. Your data might be different, so double check if you also want to remove the first line!

In [16]:
# Drop the row with index label 0 (the encryption notice in this export).
# errors="ignore" makes the cell safe to run again: once the row is gone a second
# run is a no-op instead of raising KeyError.
df = df.drop(index=[0], errors="ignore")

Let's check:

In [17]:
df.head()

Unnamed: 0,timestamp,message,author
1,2024-05-04 13:57:57,Joppe Montezinos: In the train omw,cheerful-nightingale
2,2024-05-04 15:01:15,Joppe Montezinos: I think we will be back at 6,cheerful-nightingale
3,2024-05-04 15:01:29,Joppe Montezinos: Quinten was late cuz he had ...,cheerful-nightingale
4,2024-05-04 15:13:03,‎ Joppe Montezinos: ‎audio omitted,cheerful-nightingale
6,2024-05-04 15:24:33,Latina Wifey: good morning cutie,giggling-termite


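Every message still starts with the original author name followed by a colon. Let's strip that prefix first, so the real names do not end up in the anonymised output. After that, let's find emojis in the text and add that as a feature.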

In [18]:
import re
import pandas as pd

# Fallback used when no explicit pattern is passed: everything up to the first colon.
ANY_PREFIX = re.compile(r'^([^:]+):\s*')


def clean_message(row, prefix_pattern=ANY_PREFIX):
    """Return the row's message without the leading "Name: " prefix.

    Args:
        row: Mapping-like object (for example a DataFrame row) with a 'message' entry.
        prefix_pattern: Compiled regex describing the prefix to remove. The default
            removes everything up to and including the first colon. That also eats
            part of a message that has no author prefix but does contain a colon
            (a time such as "10:30", a URL), so pass a stricter pattern when the
            author names are known.

    Returns:
        The message with the prefix removed, or the message unchanged when the
        pattern does not match. Non-string values are returned as they are.
    """
    message = row['message']
    if not isinstance(message, str):
        return message
    return prefix_pattern.sub('', message, count=1)


# Only strip a prefix that really is one of the original author names (the keys of
# `anon`), optionally preceded by the tilde / invisible marks / whitespace seen in the
# raw export. This keeps the cell safe to run twice: an already cleaned
# "see you at 10:30" stays intact instead of being cut down to "30".
# NOTE(review): assumes the prefix inside `message` matches the cleaned author names;
# verify with the count printed below (on the first run it should equal the row count).
known_names = "|".join(re.escape(name) for name in sorted(anon))
author_prefix = re.compile(rf"^[\u200e\u202f~\s]*(?:{known_names}):\s*")

cleaned = df.apply(lambda row: clean_message(row, prefix_pattern=author_prefix), axis=1)
print(f"Author prefix removed from {(cleaned != df['message']).sum()} of {len(df)} messages")

# Apply the cleaned text to the 'message' column
df['message'] = cleaned

# Show the result
print(df.head())

print("Cleaning completed")

            timestamp                                            message  \
1 2024-05-04 13:57:57                                   In the train omw   
2 2024-05-04 15:01:15                       I think we will be back at 6   
3 2024-05-04 15:01:29  Quinten was late cuz he had to stop by someone...   
4 2024-05-04 15:13:03                                     ‎audio omitted   
6 2024-05-04 15:24:33                                 good morning cutie   

                 author  
1  cheerful-nightingale  
2  cheerful-nightingale  
3  cheerful-nightingale  
4  cheerful-nightingale  
6      giggling-termite  
Cleaning completed


In [19]:
import re

# Character class built from explicit Unicode blocks that hold emoji.
# The former single range U+24C2..U+1F251 spanned most of the Basic Multilingual
# Plane (CJK, Hangul, ...), so ordinary non-Latin text was reported as emoji, while
# emoji from U+1F900 upwards were not matched at all.
# This is still an approximation: a few non-emoji symbols inside these blocks match too.
emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                           "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                           "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
                           "\U0001F000-\U0001F0FF"  # mahjong, domino and playing-card symbols
                           "\U00002600-\U000026FF"  # miscellaneous symbols
                           "\U00002700-\U000027BF"  # Dingbats
                           "\U00002B00-\U00002BFF"  # arrows, squares, star
                           "\U000024C2"             # circled M
                           "]+")


def has_emoji(text):
    """Return True when `text` contains at least one character matched by `emoji_pattern`.

    Non-string values (for example NaN for a missing message) count as "no emoji"
    instead of raising a TypeError.
    """
    return isinstance(text, str) and emoji_pattern.search(text) is not None

# One boolean per message: does the text contain an emoji?
df['has_emoji'] = df['message'].map(has_emoji)

In [20]:
print(df.head())

            timestamp                                            message  \
1 2024-05-04 13:57:57                                   In the train omw   
2 2024-05-04 15:01:15                       I think we will be back at 6   
3 2024-05-04 15:01:29  Quinten was late cuz he had to stop by someone...   
4 2024-05-04 15:13:03                                     ‎audio omitted   
6 2024-05-04 15:24:33                                 good morning cutie   

                 author  has_emoji  
1  cheerful-nightingale      False  
2  cheerful-nightingale      False  
3  cheerful-nightingale      False  
4  cheerful-nightingale      False  
6      giggling-termite      False  


Let's create a timestamp for a new, unique, filename.

In [21]:
# Second-resolution stamp (e.g. 20241023-154747) so every run writes a new file.
now = f"{datetime.now():%Y%m%d-%H%M%S}"
output = processed.joinpath(f"whatsapp-{now}.csv")

Let's save the file both as a csv and as a parquet file.
Parquet has some advantages:
- it's about 100x faster to read and write
- datatypes are preserved (e.g. the timestamp type). You will lose this in a csv file.
- file size is much smaller

The advantage of csv is that you can easily peek at the data in a text editor.

In [22]:
# Same base name for both files; only the suffix differs.
parquet_file = output.with_suffix(".parq")
# The row index is not written to either file.
df.to_csv(output, index=False)
df.to_parquet(parquet_file, index=False)

Now, go to `config.toml` and change the value of "current" to the name of the parquet file you just created.
This makes it easier to use the same file everywhere, without the need to continuously retype the name if you change it.