### Import libraries

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def startsWithDateTime(s):
    pattern = '^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)(\d{2}|\d{4}), ([0-9])|([0-9]):([0-9][0-9]) '
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [3]:
# def startsWithAuthor(s):
#     patterns = [
#         '([\w()\[\]-]+:)'#'([\w]+):',                        # First Name
#         '([\w]+[\s]+([\w()\[\]-]+)):',              # First Name + Last Name
#         '([\w]+[\s]+[\w]+[\s]+([\w()\[\]-]+)):',    # First Name + Middle Name + Last Name
#         '([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
#         '([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
#         '([+]\d{2} \d{4} \d{7})'           # Mobile Number (Europe)
#     ]
#     pattern = '^' + '|'.join(patterns)
#     result = re.match(pattern, s)
#     if result:
#         return True
#     return False
def startsWithAuthor(s):
    """
        This function is used to verify the string(s) contains 'Author' or not.
        
        Parameters:
            s: String
        
        Returns:
            True if it contains author name otherwise False
    """
    
    pattern = '^([\w()\[\]-]+):|([\w]+[\s]+([\w()\[\]-]+)):'
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [4]:
def getDataPoint(line):
    """
        Use to extract the date, time, author and message from line.
        
        Parameters: 
            line (from txt file)
        
        Returns:
            date, time, author, message        
    """
    splitLine = line.split(' - ') # splitLine = ['18/06/17, 22:47', 'Loki: Why do you have 2 numbers, Banner?']
    
    dateTime = splitLine[0] # dateTime = '18/06/17, 22:47'
    
    date, time = dateTime.split(', ') # date = '18/06/17'; time = '22:47'
    
    message = ' '.join(splitLine[1:]) # message = 'Loki: Why do you have 2 numbers, Banner?'
    
    if startsWithAuthor(message): # True
        splitMessage = message.split(': ') # splitMessage = ['Loki', 'Why do you have 2 numbers, Banner?']
        author = splitMessage[0] # author = 'Loki'
        message = ' '.join(splitMessage[1:]) # message = 'Why do you have 2 numbers, Banner?'
    else:
        author = None
    return date, time, author, message

In [5]:
"""
In this cell, I am reading the text format file of WhatsApp chat.
"""
filename="WhatsApp Chat with Quarantine time📱📱.txt"

with open(filename, encoding="utf-8") as f:
    file_contents = [x.rstrip() for x in f]

In [6]:
"""

"""

data = [] # List to keep track of data so it can be used by a Pandas dataframe
    
messageBuffer = [] # Buffer to capture intermediate output for multi-line messages
date, time, author = None, None, None # Intermediate variables to keep track of the current message being processed
    
for line in file_contents:
    line = line.strip() # Guarding against erroneous leading and trailing whitespaces

    if startsWithDateTime(line): # If a line starts with a Date Time pattern, then this indicates the beginning of a new message
        if len(messageBuffer) > 0: # Check if the message buffer contains characters from previous iterations
            data.append([date, time, author, ' '.join(messageBuffer)]) # Save the tokens from the previous message in parsedData
        messageBuffer.clear() # Clear the message buffer so that it can be used for the next message
        date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
        messageBuffer.append(message) # Append message to buffer
    else:
        messageBuffer.append(line) # If a line doesn't start with a Date Time pattern, then it is part of a multi-line message. So, just append to buffer

In [7]:
df = pd.DataFrame(data, columns=['Date', 'Time', 'Author', 'Message'])
df.head()

Unnamed: 0,Date,Time,Author,Message
0,14/03/19,9:05 am,,Messages and calls are end-to-end encrypted. N...
1,14/03/19,9:05 am,,"You created group ""For some new"""
2,14/03/19,9:06 am,,Prem: https://www.theverge.com/2019/3/13/18264...
3,14/03/19,9:07 am,,Prem: https://www.cnet.com/news/cant-update-st...
4,14/03/19,9:09 am,,You added Soumyajit(Iiitg)


In [8]:
df.describe()

Unnamed: 0,Date,Time,Author,Message
count,5283,5283,0.0,5283
unique,338,1111,0.0,3614
top,20/03/20,7:45 pm,,Prem: <Media omitted>
freq,240,70,,240


In [9]:
df['Author'].value_counts(dropna=False)

NaN    5283
Name: Author, dtype: int64

In [10]:
df[df['Author'].isnull()][0:30]

Unnamed: 0,Date,Time,Author,Message
0,14/03/19,9:05 am,,Messages and calls are end-to-end encrypted. N...
1,14/03/19,9:05 am,,"You created group ""For some new"""
2,14/03/19,9:06 am,,Prem: https://www.theverge.com/2019/3/13/18264...
3,14/03/19,9:07 am,,Prem: https://www.cnet.com/news/cant-update-st...
4,14/03/19,9:09 am,,You added Soumyajit(Iiitg)
5,14/03/19,10:27 pm,,Prem: <Media omitted>
6,15/03/19,2:07 pm,,"Prem: *NOTICE* सरकार के निर्देश अनुसार, भारत स..."
7,15/03/19,4:27 pm,,Prem: https://youtu.be/SUDEmOeQT7U
8,15/03/19,4:29 pm,,Prem: You deleted this message
9,15/03/19,4:33 pm,,Raj Ranjan(Iiitg): Kyun
