In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
import seaborn as sns
import emoji
import re
from collections import Counter
from wordcloud import WordCloud, STOPWORDS

In [12]:
## Loading chat data
file = open("chat.txt")

In [13]:
data = file.read()

In [14]:
## First have to split the data based on the date and time.
## I will use the regex to do so
pattern = "\d{1,2}/\d{1,2}/\d{1,2},\s\d{1,2}:\d{1,2}\s.M"

In [15]:
## Finding the all messages.
message = re.split(pattern, data)[1:]
## First line is empty string

In [16]:
## Now finding the all dates and times
dates = re.findall(pattern, data)

In [17]:
## Checking the size of dates and messges
len(dates), len(message)

(984, 984)

In [18]:
## Not putting both in data frame.
df = pd.DataFrame({'date':dates, "message":message})

In [19]:
# df.shape

In [20]:
df.head()

Unnamed: 0,date,message
0,"8/10/22, 9:25 AM",- Messages and calls are end-to-end encrypted...
1,"7/21/22, 5:42 AM","- +91 6283 642 395 created group ""20BCS_WM-70..."
2,"8/10/22, 9:25 AM",- You joined using this group's invite link\n
3,"8/11/22, 6:27 PM","- GAZI Abbas Sir Java: Dear all, \nHard copy ..."
4,"8/12/22, 8:41 AM","- +91 80917 73465: Respected All,\n\nWith ref..."


In [21]:
## Converting the date columns into date times
## Cleaning the date columns
df['date'] = pd.to_datetime(df['date'].str.replace(',',''))

In [22]:
## Extracting the date and time individually
df['year'] = df['date'].dt.year

In [23]:
## Extracing the month in the form of month string.
df['month'] = df['date'].dt.month_name()

In [24]:
## Date
df['day'] = df['date'].dt.day
df['day_name'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

In [25]:
## Now splitting the sender and their message respectively
df.head()

Unnamed: 0,date,message,year,month,day,day_name,hour,minute
0,2022-08-10 09:25:00,- Messages and calls are end-to-end encrypted...,2022,August,10,Wednesday,9,25
1,2022-07-21 05:42:00,"- +91 6283 642 395 created group ""20BCS_WM-70...",2022,July,21,Thursday,5,42
2,2022-08-10 09:25:00,- You joined using this group's invite link\n,2022,August,10,Wednesday,9,25
3,2022-08-11 18:27:00,"- GAZI Abbas Sir Java: Dear all, \nHard copy ...",2022,August,11,Thursday,18,27
4,2022-08-12 08:41:00,"- +91 80917 73465: Respected All,\n\nWith ref...",2022,August,12,Friday,8,41


In [26]:
## Now splitting the message with respece to sender and their messages.
print(df['message'][3])
print(df['message'][2])
print(df['message'][40])

 - GAZI Abbas Sir Java: Dear all, 
Hard copy of Lab file is mandatory for the students having physical classes. So kindly instruct the students to maintain hard copy for Lab file. Print outs

 - You joined using this group's invite link

 - +91 70189 60040: Open with your university e-mail IDs



In [27]:
## Function to find mobile number and names
def find_sender(x):
    pattern = "^\s-\s([\w\s\d+]+):"
    res = re.search(pattern, x)
    if res == None:
        return ""
    else:
        return res[1]

In [28]:
df['sender'] = df['message'].apply(lambda x : find_sender(x))

In [29]:
## The message which have no sender means
## they are function message like, group creation, member added
## So removing such kind of message
indices = df[df['sender'] == ""].index
df.drop(indices, inplace = True)

In [30]:
## Now seperating the text from message
pattern = "^\s-\s[\w\s\d+]+:"
df['text'] = df['message'].apply(lambda x : re.split(pattern, x)[1])\
            .str.replace('\\n','').str.strip().str.lower()

  .str.replace('\\n','').str.strip().str.lower()


In [31]:
df.drop(['date','message'], axis=1, inplace=True)

In [32]:
df.head()

Unnamed: 0,year,month,day,day_name,hour,minute,sender,text
3,2022,August,11,Thursday,18,27,GAZI Abbas Sir Java,"dear all, hard copy of lab file is mandatory f..."
4,2022,August,12,Friday,8,41,+91 80917 73465,"respected all,with reference to the list recei..."
5,2022,August,12,Friday,8,41,+91 80917 73465,<media omitted>
6,2022,August,12,Friday,8,41,+91 80917 73465,if any student have not got the creditionals f...
7,2022,August,12,Friday,8,41,+91 80917 73465,<media omitted>


In [46]:
busy_user = round(df['sender'].value_counts().head(10)/df.shape[0] * 100, 2).reset_index()
busy_user.rename(columns={'index':'Sender','sender':'Percentage'})

Unnamed: 0,index,sender
0,Utkarsh,18.14
1,+91 83508 02892,16.41
2,+91 6283 339 161,8.86
3,+91 89582 48182,5.72
4,+91 80917 73465,4.32
5,+91 89798 79570,4.1
6,GAZI Abbas Sir Java,3.67
7,+91 73524 58669,3.67
8,+91 89558 55533,3.46
9,+91 96604 89963,2.48


Unnamed: 0,year,month,day,day_name,hour,minute,sender,text
3,2022,August,11,Thursday,18,27,GAZI Abbas Sir Java,"dear all, hard copy of lab file is mandatory f..."
4,2022,August,12,Friday,8,41,+91 80917 73465,"respected all,with reference to the list recei..."
5,2022,August,12,Friday,8,41,+91 80917 73465,<media omitted>
6,2022,August,12,Friday,8,41,+91 80917 73465,if any student have not got the creditionals f...
7,2022,August,12,Friday,8,41,+91 80917 73465,<media omitted>


In [33]:
def most_busy_user_plot(df):
    return df['sender'].value_counts()[ :10].plot(kind='bar')

In [34]:
def most_busy_user_perc(df):
    return df['sender'].value_counts()[:10]/df['sender'].shape[0] * 100

In [35]:
wordcloud = WordCloud(width = 600, height =400,
                background_color ='white',
                stopwords = set(STOPWORDS),
                min_font_size = 10).generate("".join(total_words(df)))

# plot the WordCloud image                      
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 4)
plt.show()

NameError: name 'total_words' is not defined