In [28]:
import numpy as np
import pandas as pd
import re

In [29]:
f = open('WhatsApp Chat with AOC BACKEND.txt',encoding='utf-8')
data = f.read()

In [30]:
pattern = '\d{1,2}/\d{1,2}/\d{2},\s\d{1,2}:\d{2}\s'

messages = re.split(pattern, data)[1:]
dates = re.findall(pattern, data)


In [31]:
df = pd.DataFrame({'user_message': messages, 'message_date': dates})

In [32]:
df.head()

Unnamed: 0,user_message,message_date
0,PM - Messages and calls are end-to-end encrypt...,"1/16/23, 8:45"
1,"AM - Sir Eut Online Meeting created group ""AOC...","1/6/23, 12:14"
2,PM - You joined using this group's invite link\n,"1/16/23, 8:45"
3,PM - Umar Aslam joined using this group's invi...,"1/16/23, 8:49"
4,PM - Sir Eut Online Meeting: wasif u thr\n,"1/16/23, 9:00"


In [33]:
df['user_message'][0:5]

0    PM - Messages and calls are end-to-end encrypt...
1    AM - Sir Eut Online Meeting created group "AOC...
2     PM - You joined using this group's invite link\n
3    PM - Umar Aslam joined using this group's invi...
4           PM - Sir Eut Online Meeting: wasif u thr\n
Name: user_message, dtype: object

In [34]:
df['user_message'] = df['user_message'].map(lambda x: x.lstrip('PM - '))
df['user_message'] = df['user_message'].map(lambda x: x.lstrip('AM - '))

In [35]:
df.head()

Unnamed: 0,user_message,message_date
0,essages and calls are end-to-end encrypted. No...,"1/16/23, 8:45"
1,"Sir Eut Online Meeting created group ""AOC BACK...","1/6/23, 12:14"
2,You joined using this group's invite link\n,"1/16/23, 8:45"
3,Umar Aslam joined using this group's invite li...,"1/16/23, 8:49"
4,Sir Eut Online Meeting: wasif u thr\n,"1/16/23, 9:00"


In [36]:
# convert message_date type
df['message_date'] = pd.to_datetime(df['message_date'], infer_datetime_format=True)
df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %H:%M')
df.rename(columns={'message_date': 'date'}, inplace=True)

In [37]:
df.head(2)

Unnamed: 0,user_message,date
0,essages and calls are end-to-end encrypted. No...,2023-01-16 08:45:00
1,"Sir Eut Online Meeting created group ""AOC BACK...",2023-01-06 12:14:00


In [38]:
users = []
messages = []
for message in df['user_message']:
    entry = re.split('([\w\W]+?):\s', message)
    if entry[1:]:  # user name
        users.append(entry[1])
        messages.append(" ".join(entry[2:]))
    else:
        users.append('group_notification')
        messages.append(entry[0])

df['user'] = users
df['message'] = messages
df.drop(columns=['user_message'], inplace=True)

df['only_date'] = df['date'].dt.date
df['year'] = df['date'].dt.year
df['month_num'] = df['date'].dt.month
df['month'] = df['date'].dt.month_name()
df['day'] = df['date'].dt.day
df['day_name'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

period = []
for hour in df[['day_name', 'hour']]['hour']:
    if hour == 23:
        period.append(str(hour) + "-" + str('00'))
    elif hour == 0:
        period.append(str('00') + "-" + str(hour + 1))
    else:
        period.append(str(hour) + "-" + str(hour + 1))

df['period'] = period

In [39]:
df.head()

Unnamed: 0,date,user,message,only_date,year,month_num,month,day,day_name,hour,minute,period
0,2023-01-16 08:45:00,group_notification,essages and calls are end-to-end encrypted. No...,2023-01-16,2023,1,January,16,Monday,8,45,8-9
1,2023-01-06 12:14:00,group_notification,"Sir Eut Online Meeting created group ""AOC BACK...",2023-01-06,2023,1,January,6,Friday,12,14,12-13
2,2023-01-16 08:45:00,group_notification,You joined using this group's invite link\n,2023-01-16,2023,1,January,16,Monday,8,45,8-9
3,2023-01-16 08:49:00,group_notification,Umar Aslam joined using this group's invite li...,2023-01-16,2023,1,January,16,Monday,8,49,8-9
4,2023-01-16 09:00:00,Sir Eut Online Meeting,wasif u thr\n,2023-01-16,2023,1,January,16,Monday,9,0,9-10


# 
# code analyzing

In [41]:
df['user'][30:100]

30         Umar Aslam
31          Wasif Eut
32          Wasif Eut
33         Umar Aslam
34     Syed Afaq Shah
           ...       
95         Umar Aslam
96          Wasif Eut
97    +92 333 5010840
98          Wasif Eut
99     Syed Afaq Shah
Name: user, Length: 70, dtype: object

In [42]:
# fetch the number of messages
num_messages = df.shape[0]
num_messages

189

In [54]:
# fetch the total number of words
words = []
for message in df['message']:
    #print(message.split()) 
    words.extend(message.split())
words

['essages',
 'and',
 'calls',
 'are',
 'end-to-end',
 'encrypted.',
 'No',
 'one',
 'outside',
 'of',
 'this',
 'chat,',
 'not',
 'even',
 'WhatsApp,',
 'can',
 'read',
 'or',
 'listen',
 'to',
 'them.',
 'Tap',
 'to',
 'learn',
 'more.',
 'Sir',
 'Eut',
 'Online',
 'Meeting',
 'created',
 'group',
 '"AOC',
 'BACKEND"',
 'You',
 'joined',
 'using',
 'this',
 "group's",
 'invite',
 'link',
 'Umar',
 'Aslam',
 'joined',
 'using',
 'this',
 "group's",
 'invite',
 'link',
 'wasif',
 'u',
 'thr',
 'Yes',
 'sir',
 'im',
 'waiting',
 'for',
 '30',
 'min,',
 'sent',
 'tried',
 'many',
 'times',
 'with',
 'google',
 'link',
 'so',
 'that',
 'u',
 'let',
 'me',
 'in',
 'but',
 'no',
 'response,',
 'can',
 'u',
 'please',
 'make',
 'sure',
 'u',
 'let',
 'everyone',
 'immediately',
 'when',
 'thy',
 'request',
 'join',
 'in',
 'still',
 'waiting',
 'join',
 'please',
 'now',
 'umar,',
 'syed',
 'and',
 'noor',
 'please',
 'join',
 'meeting',
 'now',
 'Yes',
 'sir',
 'Yes',
 'sir',
 'break',
 'bre

In [61]:
# fetch number of media messages
num_media_messages = df[df['message'] == '<Media omitted>\n'].shape[0]

In [64]:
num_media_messages

4

In [67]:
#fetch number of links shared
from urlextract import URLExtract
ex = URLExtract()
links = []
for mess in df['message']:
    print(ex.find_urls(mess))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['https://meet.google.com/jsy-yueq-mrv']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['https://meet.google.com/jsy-yueq-mrv']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['https://admin.vuestorefronts.com/']
['https://nouthemes.net/html/martfury/homepage-4.html']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['premium109.web-hosting.com', 'precisetec.ca']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [68]:
# most busy user
x = df['user'].value_counts().head()

In [70]:
x

Wasif Eut                 47
Umar Aslam                46
💻Noor Saeed🗜️             46
Syed Afaq Shah            21
Sir Eut Online Meeting    14
Name: user, dtype: int64

In [71]:
 df = round((df['user'].value_counts() / df.shape[0]) * 100, 2).reset_index().rename(
        columns={'index': 'name', 'user': 'percent'})

In [74]:
df

Unnamed: 0,name,percent
0,Wasif Eut,24.87
1,Umar Aslam,24.34
2,💻Noor Saeed🗜️,24.34
3,Syed Afaq Shah,11.11
4,Sir Eut Online Meeting,7.41
5,group_notification,3.7
6,+92 333 5010840,2.12
7,+92 346 9184015,2.12
