In [2]:
import re
import pandas as pd

In [3]:
class ExtractDataFrame:
    
    '''
    This module will help parsing the whatsapp chats data.
        Parameters:
            File_path (string): The path of the chats files
        Functions:
            load_file(object) -> File pointer
            is_newEntry(object, string) -> Boolean
    '''
    
    def __init__(self, file_path):
        self.path = file_path
        self.data = []
        
    def load_file(self):
        '''
        This function loads the chat file
        '''
        file = open(self.path, 'r', encoding='utf-8')
        return file
    
    def is_newEntry(self, line: str) -> bool:
        '''
        This function returns if the line is a new message or continuation of the previous one
        '''
        date_time = '([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'
        test = re.match(date_time, line)
        if test is not None:
            return True
        else:
            return False
        
    def seperateData(self, line: str) -> tuple:
        '''
        This function cleans the line and seperates the author, date, time and message from the text
        '''
        entry_data = line.split(' - ')
        date, time = entry_data[0].split(', ')
        authMsg = entry_data[1].split(':')
        if len(authMsg) > 1:
            author = authMsg[0]
            message = ' '.join(authMsg[1:])
            return (date, time, author, message)
        else:
            return None
    
    def process(self):
        '''
        This functions aggregates all the data from different lines
        '''
        f = self.load_file()
        f.readline()
        full_message = []
        while True:
            line = f.readline()
            if not line:
                break
                
            if self.is_newEntry(line):
                
                if len(full_message)>0:
                    temp = ' '.join(full_message)
                    modified_replaced = temp.replace('\n', ' ')
                    self.data.append([date, time, author, modified_replaced])
                    
                full_message.clear()
                received = self.seperateData(line)
                if received is not None:
                    date, time, author, message = received
                    full_message.append(message)
            else:
                full_message.append(line)

        f.close()
        
    def dataframe(self) -> object:
        '''
        This function returns processed data in Pandas dataframe
        '''
        df = pd.DataFrame(self.data, columns=['Date', 'Time', 'Author', 'Message'])
        df['Date'] = pd.to_datetime(df.Date)
        return df
        

In [4]:
chats = ExtractDataFrame('WhatsApp Chat with Shahbad Roots - Family.txt')

In [5]:
chats.process()

In [6]:
chats.dataframe()

Unnamed: 0,Date,Time,Author,Message
0,2019-03-25,05:33,didi new,<Media omitted>
1,2019-03-25,07:33,didi new,<Media omitted>
2,2019-03-25,08:35,Babbooji,<Media omitted>
3,2019-03-25,08:35,Babbooji,*🙏🌹जय श्री महाकाल 🌹🙏* *श्री महाकालेश्वर ज्यो...
4,2019-03-25,09:37,didi new,<Media omitted>
...,...,...,...,...
8939,2020-03-11,05:46,didi new,<Media omitted>
8940,2020-03-11,05:46,didi new,<Media omitted>
8941,2020-03-11,08:08,Babbooji,<Media omitted>
8942,2020-03-11,08:15,Babbooji,<Media omitted>
