In [15]:
import pandas as pd
import numpy as np
import re
import dateparser
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [16]:
def read_file(file):
    '''Reads Whatsapp text file into a list of strings''' 
    x = open(file,'r', encoding = 'utf-8') #Opens the text file into variable x but the variable cannot be explored yet
    y = x.read() #By now it becomes a huge chunk of string that we need to separate line by line
    content = y.splitlines() #The splitline method converts the chunk of string into a list of strings
    return content

In [19]:
chat = read_file('Chat.txt')
len(chat)

3198

In [20]:
join = [line for line in chat if  "joined using this" in line]
join

["04/08/2022, 2:12 pm - You joined using this group's invite link",
 "04/08/2022, 2:54 pm - +234 803 568 5324 joined using this group's invite link",
 "04/08/2022, 2:57 pm - Benji joined using this group's invite link",
 "04/08/2022, 3:16 pm - Mash joined using this group's invite link",
 "04/08/2022, 3:19 pm - +234 706 767 8201 joined using this group's invite link",
 "04/08/2022, 3:22 pm - +234 806 234 7970 joined using this group's invite link",
 "04/08/2022, 3:40 pm - +234 803 570 5272 joined using this group's invite link",
 "04/08/2022, 4:18 pm - +234 813 038 2293 joined using this group's invite link",
 "04/08/2022, 4:26 pm - +234 912 185 6837 joined using this group's invite link",
 "04/08/2022, 4:32 pm - +234 813 890 9679 joined using this group's invite link",
 "04/08/2022, 6:28 pm - +234 806 827 3129 joined using this group's invite link",
 "04/08/2022, 6:32 pm - +234 806 884 2448 joined using this group's invite link",
 "04/08/2022, 8:09 pm - +234 816 258 3465 joined using 

We will clean our chat data by removing all messages in join and also by removing all empty lines (lines that contain no message) by running the code below:

In [21]:
#Remove new lines
chat = [line.strip() for line in chat]
print("length of chat is:")
print(len(chat))
#Clean out the join notification lines
clean_chat = [line for line in chat if not "joined using this" in line]
#Further cleaning
#Remove empty lines
clean_chat = [line for line in clean_chat if len(line) > 1]
print("length of clean_chat is:")
print(len(clean_chat))

length of chat is:
3198
length of clean_chat is:
2869


In [22]:
#Drop 'left-ers'
left = [line for line in clean_chat if line.endswith("left")]
left

['19/08/2022, 12:16 am - Benji left',
 '09/09/2022, 7:52 am - +234 806 010 4979 left',
 '23/09/2022, 6:26 pm - +234 803 568 5324 left']

In [23]:
#Clean out the left notification lines
clean_chat = [line for line in clean_chat if not line.endswith("left")]
print(len(clean_chat))

2866


In [24]:
#Merge messages that belong together
msgs = [] #message container
pos = 0 #counter for position of msgs in the container
"""
Flow:
For every line, see if it matches the expression which is starting with the format "number(s)+slash" eg "12/"
If it does, it is a new line of conversion as they begin with dates, add it to msgs container
Else, it is a continuation of the previous line, add it to the previous line and append to msgs, then pop previous line.
"""
for line in clean_chat:
    if re.findall("\A\d+[/]", line):
        msgs.append(line)
        pos += 1
    else:
        take = msgs[pos-1] + ". " + line
        msgs.append(take)
        msgs.pop(pos-1)
len(msgs)

2023

In [25]:
msgs[0:10]

['04/08/2022, 2:12 pm - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.',
 '04/08/2022, 12:05 pm - +234 813 215 5650 created group "CodePlateau4.0DataScience"',
 '04/08/2022, 3:02 pm - +234 814 838 1223 added +234 802 253 0435',
 '04/08/2022, 3:18 pm - +234 814 838 1223 added +234 806 892 9315',
 '04/08/2022, 3:41 pm - +234 803 570 5272: Please, can you send the document for today?',
 '04/08/2022, 4:20 pm - +234 806 451 1186: Calm down',
 '04/08/2022, 4:28 pm - +234 806 234 7970: Thanks for the link and for adding me',
 '04/08/2022, 10:11 pm - Jabezy Code Plateau: <Media omitted>',
 "04/08/2022, 10:12 pm - +234 813 215 5650 changed this group's icon",
 "05/08/2022, 6:14 am - +234 814 838 1223 changed this group's icon"]

Next, we will need to extract Date, Time, Name and Message Content from our msgs data using the codes below:

In [26]:
time = [msgs[i].split(',')[1].split('-')[0] for i in range(len(msgs))]
time = [s.strip(' ') for s in time] # Remove spacing
print("length of time is:")
print(len(time))
time

length of time is:
2023


['2:12 pm',
 '12:05 pm',
 '3:02 pm',
 '3:18 pm',
 '3:41 pm',
 '4:20 pm',
 '4:28 pm',
 '10:11 pm',
 '10:12 pm',
 '6:14 am',
 '10:56 am',
 '11:01 am',
 '11:02 am',
 '11:03 am',
 '11:05 am',
 '11:19 am',
 '11:56 am',
 '12:09 pm',
 '2:48 pm',
 '5:41 pm',
 '5:44 pm',
 '7:00 pm',
 '7:02 pm',
 '7:07 pm',
 '7:20 pm',
 '7:39 pm',
 '7:39 pm',
 '7:39 pm',
 '7:43 pm',
 '7:45 pm',
 '7:54 pm',
 '7:59 pm',
 '8:05 pm',
 '8:05 pm',
 '8:06 pm',
 '8:06 pm',
 '8:53 pm',
 '6:44 am',
 '2:17 pm',
 '2:17 pm',
 '2:17 pm',
 '2:38 pm',
 '2:42 pm',
 '2:50 pm',
 '2:53 pm',
 '2:53 pm',
 '2:54 pm',
 '3:02 pm',
 '3:04 pm',
 '3:58 pm',
 '3:58 pm',
 '11:22 am',
 '8:14 am',
 '8:15 am',
 '9:30 am',
 '9:36 am',
 '9:41 am',
 '10:06 am',
 '10:07 am',
 '10:07 am',
 '1:11 pm',
 '1:35 pm',
 '2:35 pm',
 '2:39 pm',
 '3:04 pm',
 '3:51 pm',
 '3:51 pm',
 '4:15 pm',
 '5:29 pm',
 '5:41 pm',
 '5:44 pm',
 '9:23 pm',
 '12:37 pm',
 '12:37 pm',
 '12:56 pm',
 '12:59 pm',
 '1:00 pm',
 '1:00 pm',
 '2:07 pm',
 '2:07 pm',
 '2:09 pm',
 '2:14 pm

In [27]:
date = [msgs[i].split(',')[0] for i in range(len(msgs))]
len(date)
name = [msgs[i].split('-')[1].split(':')[0] for i in range(len(msgs))]
len(name)
content = []
for i in range(len(msgs)):
  try:
    content.append(msgs[i].split(':')[2])
  except IndexError:
    content.append('Missing Text')
len(content)

2023

Now we can finally use the pandas library to merge our date, time, name and content data into a Dataframe named df using the code below:

In [28]:
df = pd.DataFrame(list(zip(date, time, name, content)), columns = ['Date', 'Time', 'Name', 'Content'])
df

Unnamed: 0,Date,Time,Name,Content
0,04/08/2022,2:12 pm,Messages and calls are end,Missing Text
1,04/08/2022,12:05 pm,"+234 813 215 5650 created group ""CodePlateau4...",Missing Text
2,04/08/2022,3:02 pm,+234 814 838 1223 added +234 802 253 0435,Missing Text
3,04/08/2022,3:18 pm,+234 814 838 1223 added +234 806 892 9315,Missing Text
4,04/08/2022,3:41 pm,+234 803 570 5272,"Please, can you send the document for today?"
...,...,...,...,...
2018,24/10/2022,7:23 pm,+234 803 570 5272,Oho! U wan do us the more we see the less we ...
2019,24/10/2022,7:23 pm,+234 816 739 8525,<Media omitted>
2020,24/10/2022,7:24 pm,+234 816 739 8525,<Media omitted>
2021,24/10/2022,7:24 pm,+234 701 936 6781,Na why we no want make u see sef...


In [29]:
df = df[df["Content"]!='Missing Text']
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Date,Time,Name,Content
0,04/08/2022,3:41 pm,+234 803 570 5272,"Please, can you send the document for today?"
1,04/08/2022,4:20 pm,+234 806 451 1186,Calm down
2,04/08/2022,4:28 pm,+234 806 234 7970,Thanks for the link and for adding me
3,04/08/2022,10:11 pm,Jabezy Code Plateau,<Media omitted>
4,05/08/2022,10:56 am,+234 802 253 0435,<Media omitted>
...,...,...,...,...
2010,24/10/2022,7:23 pm,+234 803 570 5272,Oho! U wan do us the more we see the less we ...
2011,24/10/2022,7:23 pm,+234 816 739 8525,<Media omitted>
2012,24/10/2022,7:24 pm,+234 816 739 8525,<Media omitted>
2013,24/10/2022,7:24 pm,+234 701 936 6781,Na why we no want make u see sef...


We will create additional columns by taking advantage of built-in functions in pandas. First, let us create a Datetime column by merging Date and Time columns and using the pd.to_datetime function:

In [30]:
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['DateTime']

0      2022-04-08 15:41:00
1      2022-04-08 16:20:00
2      2022-04-08 16:28:00
3      2022-04-08 22:11:00
4      2022-05-08 10:56:00
               ...        
2010   2022-10-24 19:23:00
2011   2022-10-24 19:23:00
2012   2022-10-24 19:24:00
2013   2022-10-24 19:24:00
2014   2022-10-24 19:24:00
Name: DateTime, Length: 2015, dtype: datetime64[ns]

In [31]:
df['weekday'] = df['DateTime'].apply(lambda x: x.day_name())
df

Unnamed: 0,Date,Time,Name,Content,DateTime,weekday
0,04/08/2022,3:41 pm,+234 803 570 5272,"Please, can you send the document for today?",2022-04-08 15:41:00,Friday
1,04/08/2022,4:20 pm,+234 806 451 1186,Calm down,2022-04-08 16:20:00,Friday
2,04/08/2022,4:28 pm,+234 806 234 7970,Thanks for the link and for adding me,2022-04-08 16:28:00,Friday
3,04/08/2022,10:11 pm,Jabezy Code Plateau,<Media omitted>,2022-04-08 22:11:00,Friday
4,05/08/2022,10:56 am,+234 802 253 0435,<Media omitted>,2022-05-08 10:56:00,Sunday
...,...,...,...,...,...,...
2010,24/10/2022,7:23 pm,+234 803 570 5272,Oho! U wan do us the more we see the less we ...,2022-10-24 19:23:00,Monday
2011,24/10/2022,7:23 pm,+234 816 739 8525,<Media omitted>,2022-10-24 19:23:00,Monday
2012,24/10/2022,7:24 pm,+234 816 739 8525,<Media omitted>,2022-10-24 19:24:00,Monday
2013,24/10/2022,7:24 pm,+234 701 936 6781,Na why we no want make u see sef...,2022-10-24 19:24:00,Monday


We will split our Content column to create new columns showing the number of letters and words contained in each message. We will call these columns Letter_count and Word_count respectively.

In [32]:
df['Letter_Count'] = df['Content'].apply(lambda s : len(s))
df['Word_Count'] = df['Content'].apply(lambda s : len(s.split(' ')))

In [33]:
df['Hour'] = df['Time'].apply(lambda x : x.split(':')[0]) 
# The first token of a value in the Time Column contains the hour (Eg., "12" in "12:15")

In [34]:
#print first five rows of our dataframe
df.head()

Unnamed: 0,Date,Time,Name,Content,DateTime,weekday,Letter_Count,Word_Count,Hour
0,04/08/2022,3:41 pm,+234 803 570 5272,"Please, can you send the document for today?",2022-04-08 15:41:00,Friday,45,9,3
1,04/08/2022,4:20 pm,+234 806 451 1186,Calm down,2022-04-08 16:20:00,Friday,10,3,4
2,04/08/2022,4:28 pm,+234 806 234 7970,Thanks for the link and for adding me,2022-04-08 16:28:00,Friday,38,9,4
3,04/08/2022,10:11 pm,Jabezy Code Plateau,<Media omitted>,2022-04-08 22:11:00,Friday,16,3,10
4,05/08/2022,10:56 am,+234 802 253 0435,<Media omitted>,2022-05-08 10:56:00,Sunday,16,3,10


In [35]:
#saving to csv format
df.to_csv("WhatsappChat.csv")