In [1]:
# Location of the WhatsApp chat text file that the loader cell at the end of the notebook reads.
# NOTE(review): absolute, machine-specific path — change it before running on another machine.
path = "/Users/danieljones/Documents/Data_for_projects/Whatsapp_Data.txt"

- For each line:
    - If the line starts with a date, we know it is the start of a new message
        - But we should ignore system notices such as "X was added to the group" or "X changed the subject to Y"
    - Otherwise it is a continuation line and should be appended to the previous message

In [2]:
import datetime
from pathlib import Path

import pandas as pd

### Checking how to write the date format

In [3]:
# Render the current UTC time in the bracketed "[DD/MM/YYYY, HH:MM:SS]" layout to
# check that the format string produces the expected shape.
# datetime.utcnow() is deprecated since Python 3.12, so request an aware UTC time instead;
# the formatted text is the same.
t = datetime.datetime.now(datetime.timezone.utc)
t.strftime("[%d/%m/%Y, %H:%M:%S]")

'[05/08/2020, 18:46:52]'

In [4]:
def get_date_from_line(line):
    """Parse the timestamp found at the very start of a chat line.

    Only the first 22 characters are examined; they must look like
    ``[DD/MM/YYYY, HH:MM:SS]``.

    Args:
        line: One line of chat text.

    Returns:
        A naive ``datetime.datetime`` built from the leading timestamp.

    Raises:
        ValueError: If the first 22 characters do not match the timestamp layout.
    """
    timestamp_format = "[%d/%m/%Y, %H:%M:%S]"
    prefix = line[:22]
    return datetime.datetime.strptime(prefix, timestamp_format)

In [5]:
# A system notice ("changed the subject") still begins with a timestamp, so it parses fine.
line = "[28/06/2014, 19:03:01] Tom changed the subject to “Hello everyone"
get_date_from_line(line)

datetime.datetime(2014, 6, 28, 19, 3, 1)

In [6]:
# Text without a leading timestamp makes get_date_from_line raise ValueError;
# catch it here so the cell shows the failure without stopping the notebook.
line = "you eat beans and cheese"

try:
    get_date_from_line(line)
except ValueError:
    print("This line failed with a ValueError")

This line failed with a ValueError


### Append multiple lines together

In [7]:
# Example line: the text is wrapped in double quotes and ends with a newline
# (presumably how a raw line looks when read from the export file — confirm).
'"[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout ”"\n'

'"[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout ”"\n'

In [8]:
# Hand-made sample: two system notices, then two messages that each spill onto a second line.
# Every entry keeps its surrounding double quotes.
lines = [
    '"[28/06/2014, 19:03:01] Tom changed the subject to “Hello everyone"',
    '"[28/06/2014, 19:03:02] Tom changed the subject to “Berko"',
    '"[28/07/2014, 11:33:30] Tom: Henry how do you eat that? "',
    '"pasta beans and cheese 3 meals a day"',
    '"[28/07/2014, 11:33:30] Tom: Dan youre in great health "',
    '"tuna and rice 3 meals a day"',
]

In [9]:
def does_line_start_with_date(line):
    """Report whether ``line`` opens with a parseable timestamp.

    Args:
        line: One line of chat text.

    Returns:
        True when ``get_date_from_line`` parses the line, False when it
        raises ``ValueError``.
    """
    try:
        get_date_from_line(line)
    except ValueError:
        return False
    else:
        return True

In [10]:
# line[1:] drops the leading double quote so the timestamp sits at the start of the string.
for line in lines:
    print(does_line_start_with_date(line[1:]))

True
True
True
False
True
False


In [11]:
def get_combined_lines(lines, separator=""):
    """Merge continuation lines into the message they belong to.

    A line that starts with a timestamp opens a new message; every following
    line without a timestamp is appended to that message.

    Args:
        lines: Iterable of chat lines, in file order.
        separator: Text placed between a message and each of its continuation
            lines. The default ``""`` glues them together directly (the
            original behaviour); pass ``"\n"`` or ``" "`` to keep the line
            boundaries visible in multi-line messages.

    Returns:
        A list with one string per message. Lines that appear before the first
        timestamped line are merged into a single leading entry. Entries made
        up only of empty lines are dropped.
    """
    combined_lines = []
    current_parts = []  # pieces of the message currently being assembled
    for line in lines:
        if does_line_start_with_date(line):
            # A new message begins: emit the previous one unless it is empty.
            if any(current_parts):
                combined_lines.append(separator.join(current_parts))
            current_parts = [line]
        else:
            current_parts.append(line)
    # Emit whatever was still being assembled when the input ran out.
    if any(current_parts):
        combined_lines.append(separator.join(current_parts))
    return combined_lines

In [12]:
# The sample entries still begin with a double quote, so none is recognised as starting
# with a date and everything collapses into a single string.
get_combined_lines(lines)

['"[28/06/2014, 19:03:01] Tom changed the subject to “Hello everyone""[28/06/2014, 19:03:02] Tom changed the subject to “Berko""[28/07/2014, 11:33:30] Tom: Henry how do you eat that? ""pasta beans and cheese 3 meals a day""[28/07/2014, 11:33:30] Tom: Dan youre in great health ""tuna and rice 3 meals a day"']

In [13]:
# Same check on just the last message and its continuation line.
get_combined_lines(lines[-2:])

['"[28/07/2014, 11:33:30] Tom: Dan youre in great health ""tuna and rice 3 meals a day"']

### Run on all the data

In [None]:
def load_whatsapp_messages_to_dataframe(path_to_messages, encoding="utf-8"):
    """Read a WhatsApp chat text file and return its messages as a DataFrame.

    Args:
        path_to_messages: Path of the text file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding
            (the chat text contains non-ASCII characters such as curly quotes).

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``time``, ``author``
        and ``message``, one row per message that could be split.
        Entries that cannot be split are printed and left out.
    """
    with open(path_to_messages, "rt", encoding=encoding) as f:
        # Trim surrounding whitespace, then drop the first and last character.
        # NOTE(review): this assumes every raw line is wrapped in one pair of
        # double quotes — confirm against the actual file.
        lines = [raw_line.strip()[1:-1] for raw_line in f]
    combined_lines = get_combined_lines(lines)
    processed_lines = []
    for line in combined_lines:
        try:
            processed_lines.append(_split_line_into_components(line))
        except ValueError:
            # Raised when the entry has no "NAME: MESSAGE" part (for example a
            # "changed the subject" notice). Show it rather than dropping it silently.
            print(line)
    df = pd.DataFrame(processed_lines, columns=["date", "time", "author", "message"])
    return df

def _split_line_into_components(line):
    """
    It looks like the lines all follow the exact same structure:
    [DD/MM/YYYY, HH:MM:SS] NAME: MESSAGE
    """
    date = line[1:11]
    time = line[13:21]
    date_and_time = line[1:21]
    name, message = line[23:].split(": ", 1)
    message = message.strip()
    return date, time, name, message


df = load_whatsapp_messages_to_dataframe(path)

# Save the parsed table next to the input file, but under a .csv name.
# Writing to `path` itself would overwrite the raw chat text with the CSV,
# so the notebook could not be re-run on the original data.
csv_path = Path(path).with_suffix(".csv")
df.to_csv(csv_path, index=False)
