In [52]:
import emoji, emot, re, stylecloud

import pandas as pd
import plotly.graph_objs as go

from bertopic import BERTopic

from collections import Counter
from datetime import timedelta
# from emosent import get_emoji_sentiment_rank
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from umap import UMAP
from whatstk import WhatsAppChat, FigureBuilder




In [53]:
def rawToDf(file, key):
    '''Parse an exported chat .txt file into a DataFrame.

    The file is read as UTF-8, its lines are joined with single spaces (so a
    multi-line message becomes one string), and the text is cut at every
    timestamp prefix matching the pattern selected by ``key``.

    Parameters
    ----------
    file : str or path-like
        Path of the exported chat text file.
    key : str
        Timestamp style to parse: '12hr' or '24hr'. A 'custom' entry also
        exists, but its patterns are empty strings in this function and
        presumably have to be filled in before it is usable.

    Returns
    -------
    pandas.DataFrame
        One row per timestamped entry with the columns ``date_time``
        (datetime64), ``user`` and ``message``. Entries without a
        ``name: `` prefix get the user "group_notification" and keep their
        whole text as the message.

    Raises
    ------
    KeyError
        If ``key`` is not one of the keys of the format tables.
    '''

    # Raw strings keep the regex escapes (\d, \s) away from Python's own
    # string-escape processing, which warns about them on recent versions.
    split_formats = {
        '12hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom' : ''
    }
    # NOTE(review): the patterns above accept 2-4 digit years while these
    # formats use %Y -- confirm the exports always carry 4-digit years.
    datetime_formats = {
        '12hr' : '%d/%m/%Y, %I:%M %p - ',
        '24hr' : '%d/%m/%Y, %H:%M - ',
        'custom': ''
    }

    # Only the read needs the file handle; everything else works on the string.
    with open(file, 'r', encoding='utf-8') as raw_data:
        # Join the lines into one string because a message can span several lines.
        raw_string = ' '.join(raw_data.read().split('\n'))

    # Text between timestamps ("user: message" or a notification). The first
    # piece is whatever precedes the first timestamp, so it is discarded.
    user_msg = re.split(split_formats[key], raw_string)[1:]
    # The timestamp prefixes themselves, in the same order as user_msg.
    date_time = re.findall(split_formats[key], raw_string)

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # The format string covers the whole matched prefix, including the
    # trailing " - " separator.
    df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_formats[key])

    usernames = []
    msgs = []
    for entry in df['user_msg']:
        # maxsplit=1: cut only at the first "name: " so that any later
        # ": " inside the message body stays part of the message. Without it
        # the body of such a message came back as an empty string.
        parts = re.split(r'([\w\W]+?):\s', entry, maxsplit=1)
        if parts[1:]:  # user-typed message
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:  # other notifications in the group (someone was added, left, ...)
            usernames.append("group_notification")
            msgs.append(parts[0])

    df['user'] = usernames
    df['message'] = msgs

    # The combined column is no longer needed once it has been split.
    return df.drop(columns='user_msg')

In [57]:
# Parse the exported chat, whose timestamps use the 12-hour style.
df = rawToDf(file='sample.txt', key='12hr')

# Column names, dtypes and non-null counts of the parsed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9358 entries, 0 to 9357
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date_time  9358 non-null   datetime64[ns]
 1   user       9358 non-null   object        
 2   message    9358 non-null   object        
dtypes: datetime64[ns](1), object(2)
memory usage: 219.5+ KB


In [58]:
# Preview 10 random rows; no random_state is passed, so the rows differ per run.
df.sample(n=10)

Unnamed: 0,date_time,user,message
2843,2023-07-29 21:43:00,Davie,<Media omitted>
4124,2023-09-09 15:14:00,Agnes,Safe
2081,2023-07-09 12:44:00,Davie,<Media omitted>
8683,2024-01-21 15:18:00,Mom ❤️🌹,Safe travels
3557,2023-08-21 12:01:00,Davie,We will talk kesho kaluu
5219,2023-10-14 15:25:00,Agnes,Okay
3081,2023-08-04 12:24:00,Agnes,Eeeh utakuja kuwauliza hivo monday
8662,2024-01-21 12:04:00,Davie,Me nishatoka church already
7724,2023-12-22 12:49:00,Davie,
5289,2023-10-15 19:41:00,Agnes,<Media omitted>
