In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from collections import Counter

In [2]:
def seperate_date_time(x):
    date = pd.to_datetime(x.split(', ')[0], format="%m/%d/%y")
    time = x.split(', ')[1].split(' - ')[0]
    return date, time

In [3]:
def createDf(file_data, dt):
    patterns = {
        '12 Hour':'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24 Hour':'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
    }
    
    data = ' '.join(file_data.read().split('\n'))
    user_message = re.split(patterns[dt], data)[1:]
    date_time = re.findall(patterns[dt], data)
    
    df = pd.DataFrame({'UserMessage': user_message, 'DateTime': date_time})
    df[['Date','Time']] = df['DateTime'].apply(lambda x: seperate_date_time(x)).to_list()
    users = []
    messages = []
    for message in df['UserMessage']:
        entry = re.split('([\w\W]+?):\s', message)
        if entry[1:]:
            users.append(entry[1])
            messages.append(entry[2])
        else:
            users.append('group_notification')
            messages.append(entry[0])

    df['User'] = users
    df['Message'] = messages
    df.drop(columns=['UserMessage', 'DateTime'], inplace=True)
    return df

In [4]:
f = open('WhatsApp Chat with Say GT (original) 12 hour format.txt', 'r', encoding='utf-8')
df = createDf(f, '12 Hour')

In [5]:
f = open('WhatsApp Chat with Say GT (original) 24 hour format.txt', 'r', encoding='utf-8')
df = pd.DataFrame(createDf(f, '24 Hour'))

In [6]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month_name()
df['MonthNum'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayName'] = df['Date'].dt.day_name()
df['Hour'] = df['Time'].apply(lambda x: x.split(':')[0])
df['Minute'] = df['Time'].apply(lambda x: (x.split(':')[1]).split(' ')[0])

#Uncomment while working with 12 hour file format
# df['Meridian'] = df['Time'].apply(lambda x: (x.split(':')[1]).split(' ')[1])  

df.head()

Unnamed: 0,Date,Time,User,Message,Year,Month,MonthNum,Day,DayName,Hour,Minute
0,2022-05-09,15:01,group_notification,Messages and calls are end-to-end encrypted. N...,2022,May,5,9,Monday,15,1
1,2022-05-26,19:21,Clg Divyansh,Robotics ke notes,2022,May,5,26,Thursday,19,21
2,2022-05-26,19:44,Clg Kushagra Ahire,MST me jo numerical aaye the uske solution bhe...,2022,May,5,26,Thursday,19,44
3,2022-05-26,19:55,Clg Abbas Amjhera,<Media omitted>,2022,May,5,26,Thursday,19,55
4,2022-05-26,20:05,+91 6239 413 783,Bhai notes bhej do yrr koi 🙏🏻,2022,May,5,26,Thursday,20,5


# Chatting From

In [111]:
def chat_from(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]
    unique_years = df['Year'].unique()
    start_year = unique_years[0]
    msg_count = df.groupby(['Date']).count()['Message']
    avg_msg = round(msg_count.mean(),2)
    return start_year, avg_msg

In [112]:
chat_from('Overall', df)

(2022, 22.53)

# Most Talkative

In [134]:
def most_talkative(df):
    df = df[df['User'] != 'group_notification']
    user = df['User'].value_counts()
    username = user.index[0]
    avg_msg = round(len(df)/user[username], 2)
    return username, avg_msg

In [135]:
most_talkative(df)

('Clg Ashvin birla', 6.64)

# Hourly Timeline

In [7]:
def hourly_timeline(selected_user, df, format):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  
  new_df['Message'] = [1] * new_df.shape[0]
  if format == '12 Hour':
    new_df['Hour'] = new_df['Hour'].astype(str) + ' ' + new_df['Meridian'].astype(str)
    new_df = new_df.groupby('Hour').sum().reset_index()
  else:
    new_df = new_df.groupby('Hour').sum().reset_index()
  return new_df

In [8]:
hourly_timeline('Overall', df, '24 Hour')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Message'] = [1] * new_df.shape[0]


Unnamed: 0,Hour,Message,Year,MonthNum,Day
0,0,22,44484,141,497
1,1,14,28308,97,260
2,2,3,6066,18,85
3,3,5,10110,31,128
4,4,1,2022,6,2
5,7,2,4044,12,18
6,8,8,16176,49,214
7,9,43,86946,286,444
8,10,49,99078,303,804
9,11,103,208266,721,1347


# Daily Timeline

In [9]:
def daily_timeline(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  
  new_df['Message'] = [1] * new_df.shape[0]
  new_df = new_df.groupby('Date').sum().reset_index()
  return new_df

In [10]:
daily_timeline('Overall', df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Message'] = [1] * new_df.shape[0]


Unnamed: 0,Date,Message,Year,MonthNum,Day
0,2022-05-26,16,32352,80,416
1,2022-05-27,14,28308,70,378
2,2022-05-28,5,10110,25,140
3,2022-05-29,8,16176,40,232
4,2022-05-30,37,74814,185,1110
5,2022-05-31,5,10110,25,155
6,2022-06-01,172,347784,1032,172
7,2022-06-02,9,18198,54,18
8,2022-06-03,15,30330,90,45
9,2022-06-07,2,4044,12,14


# Weekly Timeline

In [53]:
def weekly_timeline(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  
  new_df['Message'] = [1] * new_df.shape[0]
  days = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
  new_df = new_df.groupby('DayName').sum().reindex(days)
  new_df['Days'] = ['1 - Monday', '2 - Tuesday', '3 - Wednesday', '4 - Thursday', '5 - Friday', '6 - Saturday', '8 - Sunday']
  return new_df

In [54]:
weekly_timeline('Overall', df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Message'] = [1] * new_df.shape[0]


Unnamed: 0_level_0,Message,Year,MonthNum,Day,Days
DayName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Monday,178,359916,1093,4117,1 - Monday
Tuesday,96,194112,720,946,2 - Tuesday
Wednesday,328,663216,2215,2471,3 - Wednesday
Thursday,185,374070,1265,2268,4 - Thursday
Friday,356,719832,2313,3779,5 - Friday
Saturday,123,248706,844,1944,6 - Saturday
Sunday,161,325542,1182,1849,8 - Sunday


# Monthly Timeline

In [92]:
def monthly_timeline(selected_user, df):
    if selected_user != 'Overall':
      df = df[df['User'] == selected_user]
    new_df = df[df['User'] != 'group_notification']
    new_df  = df.groupby(['Year','Month','MonthNum']).count()
    new_df.reset_index(inplace=True)
    time = []
    for i in range(new_df.shape[0]):
        time.append(str(new_df['MonthNum'][i]) + " - " + str(new_df['Year'][i]) + " - " + new_df['Month'][i])
    new_df['Time'] = time
    return new_df

In [93]:
monthly_timeline('Overall', df)

Unnamed: 0,Year,Month,MonthNum,Date,Time,User,Message,Day,DayName,Hour,Minute
0,2022,August,8,410,8 - 2022 - August,410,410,410,410,410,410
1,2022,July,7,343,7 - 2022 - July,343,343,343,343,343,343
2,2022,June,6,603,6 - 2022 - June,603,603,603,603,603,603
3,2022,May,5,86,5 - 2022 - May,86,86,86,86,86,86


# Most Common Word

In [168]:
def most_common_words(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  new_df = new_df[~new_df['Message'].str.contains('<Media omitted>')]
  f = open('stop_hinglish.txt')
  stop_words = f.read()

  words = []

  for message in new_df['Message']:
    for word in message.lower().split():
      if word not in stop_words:
        words.append(word)
  
  most_common_df = pd.DataFrame(Counter(words).most_common(20), columns=['Message', 'Count'])
  most_common_df.sort_values('Count')
  return most_common_df

In [169]:
most_common_words('Overall', df)

Unnamed: 0,Message,Count
0,message,69
1,deleted,67
2,tcs,42
3,form,25
4,jain,23
5,mail,20
6,digital,19
7,drive,18
8,option,17
9,apply,17
