In [1]:
# Location of the raw chat export that is parsed further down.
# NOTE: absolute path on the author's machine - adjust before running elsewhere.
path = "/Users/danieljones/Documents/Codecademy/Data for projects/Sit_Off/Sit Off Whatsapp Data.txt"

- For each line:
    - If line starts with a date then we know it is the start of a message
        - But we should ignore things like "X was added to the group" or "X changed the subject to Y"
    - Otherwise it should be appended to the previous message

In [2]:
import datetime
import pandas as pd

### Check how to write the date format

In [44]:
# Timezone-aware "now" in UTC; datetime.utcnow() is deprecated since
# Python 3.12 and returns a naive object.
t = datetime.datetime.now(datetime.timezone.utc)
# Render it in the bracketed layout to check the format string by eye.
t.strftime("[%d/%m/%Y, %H:%M:%S]")

'[26/07/2020, 14:47:33]'

In [4]:
def get_date_from_line(line):
    """Parse the ``[DD/MM/YYYY, HH:MM:SS]`` stamp at the start of a chat line.

    Only the first 22 characters of ``line`` are examined; anything after
    them is ignored.

    Returns the stamp as a ``datetime.datetime``.
    Raises ``ValueError`` when those characters do not match the layout.
    """
    timestamp_layout = "[%d/%m/%Y, %H:%M:%S]"
    leading_text = line[0:22]
    parsed = datetime.datetime.strptime(leading_text, timestamp_layout)
    return parsed

In [5]:
# A dated line: only its first 22 characters are parsed, so the text after
# the closing bracket does not matter here.
line = "[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout"
get_date_from_line(line)

datetime.datetime(2014, 6, 28, 19, 3, 1)

In [6]:
# A line without a leading timestamp: parsing raises ValueError, which is
# the signal that can be used to recognise continuation lines.
line = "you eat beans and cheese"

try:
    get_date_from_line(line)
except ValueError:
    print("This line failed with a ValueError")

This line failed with a ValueError


### Let's understand how to append multiple lines together

In [None]:
'"[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout ”"\n'

In [20]:
# Hand-written sample: every entry is wrapped in literal double quotes, and
# the entries without a timestamp are meant to continue the message above them.
lines = [
    '"[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout"',
    '"[28/06/2014, 19:03:02] Tommy turn changed the subject to “Creamfields"',
    '"[28/07/2014, 11:33:30] Tommy turn: Bragg you are in terrible health "',
    '"You eat pasta beans and cheese 3 meals a day"',
    '"[28/07/2014, 11:33:30] Tommy turn: Dan you are in great health "',
    '"You eat tuna and rice 3 meals a day"',
]

In [21]:
def does_line_start_with_date(line):
    """Report whether ``line`` opens with a parseable timestamp.

    Delegates to ``get_date_from_line``: a successful parse gives ``True``,
    a ``ValueError`` gives ``False``. Other exceptions are not caught.
    """
    try:
        get_date_from_line(line)
    except ValueError:
        return False
    else:
        return True

In [25]:
# line[1:] drops the leading double quote of each sample entry so that the
# timestamp (if any) starts at index 0 for the check.
for line in lines:
    print(does_line_start_with_date(line[1:]))

True
True
True
False
True
False


In [22]:
def get_combined_lines(lines, separator=""):
    """Merge continuation lines into the message that precedes them.

    A line for which ``does_line_start_with_date`` is true opens a new
    message; every other line is appended to the message currently being
    built. Undated lines seen before the first dated line are merged into
    a message of their own.

    Parameters
    ----------
    lines : iterable of str
        Lines in file order.
    separator : str, optional
        Text placed between a message and each of its continuation lines.
        The default (empty string) keeps the original behaviour of plain
        concatenation; pass a space or a newline to stop words on adjacent
        lines from fusing together.

    Returns
    -------
    list of str
        One string per message, in input order. Messages made up solely of
        empty lines are omitted.
    """
    combined_lines = []
    pending = []  # pieces of the message currently being assembled

    def flush():
        # Emit the pending message unless every piece is empty, then start
        # collecting afresh.
        if any(pending):
            combined_lines.append(separator.join(pending))
        pending.clear()

    for line in lines:
        if does_line_start_with_date(line):
            # A dated line closes whatever was being built before it.
            flush()
        pending.append(line)
    flush()
    return combined_lines

In [23]:
# NOTE: the sample entries still carry their wrapping double quotes here, so
# none of them is detected as starting with a date and everything is merged
# into a single string.
get_combined_lines(lines)

['"[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout""[28/06/2014, 19:03:02] Tommy turn changed the subject to “Creamfields""[28/07/2014, 11:33:30] Tommy turn: Bragg you are in terrible health ""You eat pasta beans and cheese 3 meals a day""[28/07/2014, 11:33:30] Tommy turn: Dan you are in great health ""You eat tuna and rice 3 meals a day"']

In [37]:
# Same call on only the last two sample entries (still quoted); they likewise
# come back as one merged string.
get_combined_lines(lines[-2:])

['"[28/07/2014, 11:33:30] Tommy turn: Dan you are in great health ""You eat tuna and rice 3 meals a day"']

### Run on all the data

In [42]:
def load_whatsapp_messages_to_dataframe(path_to_messages, encoding="utf-8"):
    """Read a chat export and return its messages as a DataFrame.

    Parameters
    ----------
    path_to_messages : str or path-like
        Text file to read.
    encoding : str, optional
        Encoding used to decode the file. Given explicitly (default
        ``"utf-8"``) so that non-ASCII text such as curly quotes decodes the
        same way regardless of the platform's locale.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``time``, ``author`` and ``message``, one row per
        message that could be split into those parts.

    Notes
    -----
    Combined lines that ``_split_line_into_components`` rejects with
    ``ValueError`` are printed and left out of the result.
    """
    with open(path_to_messages, "rt", encoding=encoding) as f:
        # Trim surrounding whitespace, then drop the first and last character
        # of every line (presumably the double quotes wrapping each line of
        # this export - TODO confirm against the raw file).
        lines = [line.strip()[1:-1] for line in f]
    combined_lines = get_combined_lines(lines)
    processed_lines = []
    for line in combined_lines:
        try:
            processed_lines.append(_split_line_into_components(line))
        except ValueError:
            # Could not be split into author and message; echo it for
            # inspection and skip it.
            print(line)
    return pd.DataFrame(processed_lines, columns=["date", "time", "author", "message"])

def _split_line_into_components(line):
    """
    It looks like the lines all follow the exact same structure:
    [DD/MM/YYYY, HH:MM:SS] NAME: MESSAGE
    """
    date = line[1:11]
    time = line[13:21]
    date_and_time = line[1:21]
    name, message = line[23:].split(": ", 1)
    message = message.strip()
    return date, time, name, message


# Parse the whole export; lines that cannot be split into author and message
# are printed by the loader.
df = load_whatsapp_messages_to_dataframe(path)
# Save the parsed messages as CSV without the index column.
df.to_csv("/Users/danieljones/Documents/Codecademy/Data for projects/Sit_Off/Sit_Off_Whatsapp_Data_Processed.csv", index=False)
# (raw input file name: "Sit Off Whatsapp Data.txt")


[28/06/2014, 19:03:01] Tommy turn changed the subject to “Hideout ”
[11/07/2014, 15:45:00] Tommy turn was added
[11/07/2014, 15:45:02] You were added
[11/07/2014, 15:45:03] Jamie was added
[11/07/2014, 15:45:03] Iain was added
[11/07/2014, 15:45:03] Edd Rocca was added
[11/07/2014, 15:45:04] Bobby was added
[11/07/2014, 15:45:04] Ollie was added
[11/07/2014, 15:45:04] Charlie Rider was added
[11/07/2014, 15:45:04] Huggggo was added
[11/07/2014, 15:45:04] Chris Kat was added
[11/07/2014, 15:45:04] Adam was added
[26/07/2014, 01:01:24] Henry B was added
[10/08/2014, 12:02:52] Henry B changed this group's icon
[10/08/2014, 19:48:11] Chris Kat changed this group's icon
[10/08/2014, 19:52:37] Chris Kat changed this group's icon
[10/08/2014, 22:05:16] Tommy turn changed this group's icon
[11/08/2014, 21:03:27] Chris Kat changed this group's icon
[11/08/2014, 23:20:38] Dommy Fraz was added
[11/08/2014, 23:20:38] Chris M was added
[12/08/2014, 14:19:49] Tommy turn changed the subject to “.”
[1

In [43]:
# Display the parsed messages (the rendered view below is truncated).
df

Unnamed: 0,date,time,author,message
0,11/07/2014,16:12:19,Charlie Rider,"Might make an appearance, me aunts down so mig..."
1,11/07/2014,16:41:17,Adam,Anyone wonna buy this pre workout at all I've ...
2,11/07/2014,16:41:20,Adam,image omitted
3,11/07/2014,16:45:33,Dan Jones,image omitted
4,11/07/2014,16:47:43,Dan Jones,Thanks for the belated add tom lol but i misse...
...,...,...,...,...
44772,16/06/2020,13:04:45,Dan Jones,HB Goooooosage!
44773,16/06/2020,13:04:52,Charlie Rider,Happy birthday hardy!!
44774,16/06/2020,13:05:00,Adam,Harrrddyyyyyy HBD!!
44775,16/06/2020,13:42:54,Jamie,Happy birthday mateeee
