# Dataset Extraction

Importing the required libraries

In [1]:
import re
import pandas as pd
import numpy as np
import emoji
import plotly.express as px
from collections import Counter
import matplotlib.pyplot as plt
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [2]:
%matplotlib inline

Checking each line if it starts with date and time to identify each unique message in the text file

In [3]:
def startsWithDateAndTime(s):
#     regex pattern for all the dates
    pattern = '^([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [4]:
def FindAuthor(s):
# sending the second part of the message after split so to check if it's the same message or if it's a new message
  s = s.split(":")
  if len(s)==2:
    return True
  else:
    return False

In [5]:
def getDataPoint(line):
    splitLine = line.split(' - ') 
#     first part before '-' is datetime
    dateTime = splitLine[0]
    date, time = dateTime.split(', ')
    
#     second part is the message
    message = ' '.join(splitLine[1:])
    if FindAuthor(message): 
        splitMessage = message.split(': ') 
        author = splitMessage[0] 
        message = ' '.join(splitMessage[1:])
    else:
        author = None
    return date, time, author, message

In [6]:
# List to keep track of data so it can be used by a Pandas dataframe
parsedData = [] 

conversation = 'chat.txt'
with open(conversation, encoding="utf-8") as fp:
    fp.readline()
    # Skipping first line of the file because contains information related to something about end-to-end encryption
    
    messageBuffer = [] 
    date, time, author = None, None, None
    while True:
        line = fp.readline() 
        if not line: 
            break
        line = line.strip() 
        if startsWithDateAndTime(line): 
            if len(messageBuffer) > 0: 
                parsedData.append([date, time, author, ' '.join(messageBuffer)]) 
            messageBuffer.clear() 
            date, time, author, message = getDataPoint(line) 
            messageBuffer.append(message) 
        else:
            messageBuffer.append(line)

In [7]:
df = pd.DataFrame(parsedData, columns = ['Date', 'Time','Author', 'Message'])

In [8]:
df.head()

Unnamed: 0,Date,Time,Author,Message
0,24/01/2020,8:25 pm,,"Tanay Kamath (TSEC, CS) created group ""CODERS👨..."
1,26/01/2020,4:19 pm,,You joined using this group's invite link
2,26/01/2020,4:20 pm,,+91 99871 38558 joined using this group's invi...
3,26/01/2020,4:20 pm,,+91 91680 38866 joined using this group's invi...
4,26/01/2020,4:22 pm,,+91 72762 35231 joined using this group's invi...


In [9]:
# checking out number of unique authors of the messages
df['Author'].unique()

array([None, '+91 96536 93868', 'Dheeraj Lalwani (TSEC, CS)',
       '+91 99201 75875', '+91 95949 08570', '+91 79778 76844',
       '+91 90499 38860', 'Tanay Kamath (TSEC, CS)', 'Saket (TSEC, CS)',
       '+91 77568 95072', 'Rohit Pathak (TSEC, CS)', '+91 75078 05454',
       'Darshan Rander (TSEC, IT)', '+91 79774 68083', '+91 70394 60876',
       '+91 96191 55044', '+91 90678 93300', 'Mohit Varma (TSEC, CS)',
       '+91 79770 56210', 'Chirag Sharma (TSEC, CS)',
       'Vivek Iyer (TSEC, Biomed)', 'Tushar Nankani', '+91 81696 22410',
       '+91 89764 07509', '+91 78758 66747', 'Ankit (TSEC, CS)',
       '+91 86556 33169', '+91 76663 28147', '+91 88284 70904',
       '+91 97698 67348', 'Vivek (TSEC, CS)', 'Hardik Raheja (TSEC, CS)',
       '+91 91680 38866', 'Mittul Dasani (TSEC, CS)',
       'Kartik Soneji (TSEC, CS)', '+91 77180 43697', '+91 99676 84479',
       'Shreya (TSEC, IT)', '+91 96190 16721', '+91 89833 85127',
       '+91 99675 58551', '+91 90822 59476', 'Prithvi Rohira 

In [10]:
# loading the cleaned dataset into the csv file
df.to_csv('Whatsapp_Chat_Table.csv')

In [11]:
# checking out random 10 samples from the dataset
df.sample(10)

Unnamed: 0,Date,Time,Author,Message
9117,19/07/2020,2:16 pm,"Tanay Kamath (TSEC, CS)",Srsly needed
5291,25/04/2020,5:39 pm,"Dheeraj Lalwani (TSEC, CS)",Who is Dumped?
3516,20/03/2020,8:19 pm,,"Darshan Rander (TSEC, IT): Mashup tomorrow : M..."
10144,20/08/2020,2:23 pm,"Harsh Kapadia (TSEC IT, SE)","It's okay, it happens... First time Ige heard ..."
6789,28/05/2020,3:28 am,"Dheeraj Lalwani (TSEC, CS)","But, it's a Django project"
414,29/01/2020,11:29 pm,+91 99201 75875,It's a replacement for return 0 Return 0 termi...
1847,23/02/2020,12:00 pm,+91 79770 56210,I am getting Runtime error Dk why and i am get...
6928,03/06/2020,10:34 pm,"Pratik K (TSEC CS, SE)",+100
7530,13/06/2020,12:19 am,+91 77180 43697,<Media omitted>
9394,06/08/2020,11:16 pm,"Tanay Kamath (TSEC, CS)",same


In [12]:
# dropping the rows with null or None values
df.dropna()

Unnamed: 0,Date,Time,Author,Message
112,27/01/2020,7:31 pm,+91 96536 93868,<Media omitted>
113,27/01/2020,7:31 pm,+91 96536 93868,Give it a try ....
114,27/01/2020,7:31 pm,"Dheeraj Lalwani (TSEC, CS)",Alright
115,27/01/2020,7:32 pm,"Dheeraj Lalwani (TSEC, CS)",We can make this a trend
116,27/01/2020,7:32 pm,+91 96536 93868,Sure
...,...,...,...,...
13648,02/10/2020,2:05 am,"Darshan Rander (TSEC, IT)",7mins ig
13649,02/10/2020,2:05 am,"Darshan Rander (TSEC, IT)",MCQs mark kiya
13650,02/10/2020,2:05 am,"Darshan Rander (TSEC, IT)",Sign-in kiya😂😅
13651,02/10/2020,2:11 am,"Tanay Kamath (TSEC, CS)",Incognito se na?
