In [68]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from collections import Counter
from wordcloud import WordCloud
import warnings
import emoji

In [69]:
def seperate_date_time(x):
    date = pd.to_datetime(x.split(', ')[0])
    time = x.split(', ')[1].split(' - ')[0]
    return date, time

In [70]:
def createDf(file_data, dt):
    patterns = {
        '12 Hour':'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24 Hour':'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
    }
    
    data = ' '.join(file_data.read().split('\n'))
    user_message = re.split(patterns[dt], data)[1:]
    date_time = re.findall(patterns[dt], data)
    
    df = pd.DataFrame({'UserMessage': user_message, 'DateTime': date_time})
    df[['Date','Time']] = df['DateTime'].apply(lambda x: seperate_date_time(x)).to_list()
    users = []
    messages = []
    for message in df['UserMessage']:
        entry = re.split('([\w\W]+?):\s', message)
        if entry[1:]:
            users.append(entry[1])
            messages.append(entry[2])
        else:
            users.append('group_notification')
            messages.append(entry[0])

    df['User'] = users
    df['Message'] = messages
    df.drop(columns=['UserMessage', 'DateTime'], inplace=True)
    return df

In [71]:
# f = open('chats/Sample_WhatsApp_Chat_12hr.txt', 'r', encoding='utf-8')
# df = createDf(f, '12 Hour')

In [72]:
f = open('chats/Sample_WhatsApp_Chat_24hr.txt', 'r', encoding='utf-8')
df = pd.DataFrame(createDf(f, '24 Hour'))

In [73]:
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month_name()
df['MonthNum'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['WeekNum'] = df['Date'].dt.isocalendar().week
df['Day'] = df['Date'].dt.day
df['DayName'] = df['Date'].dt.day_name()
df['Hour'] = df['Time'].apply(lambda x: x.split(':')[0])
df['Minute'] = df['Time'].apply(lambda x: (x.split(':')[1]).split(' ')[0])

#Uncomment while working with 12 hour file format
# df['Meridian'] = df['Time'].apply(lambda x: (x.split(':')[1]).split(' ')[1]).str.upper()

df.head()

Unnamed: 0,Date,Time,User,Message,Year,Month,MonthNum,DayOfWeek,WeekNum,Day,DayName,Hour,Minute
0,2025-03-27,11:00,Kim Mason,"Great thoughts shared today!"" """,2025,March,3,3,13,27,Thursday,11,0
1,2025-03-27,22:15,Jessica Williams,"Congratulations on your achievement! 🎉"" """,2025,March,3,3,13,27,Thursday,22,15
2,2025-03-27,02:32,Jessica Gonzalez,"That’s wonderful news!"" """,2025,March,3,3,13,27,Thursday,2,32
3,2025-03-27,07:26,Joshua Fisher,,2025,March,3,3,13,27,Thursday,7,26
4,2025-03-27,21:47,Nancy Brown,"Here’s the update on the event:"" """,2025,March,3,3,13,27,Thursday,21,47


# Fetch Stats

In [74]:
def fetch_stats(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  num_messages = df.shape[0]
  diff_days = (df.Date.iloc[len(df)-1] - df.Date.iloc[0]).days
  words = []
  for message in df['Message']:
    words.extend(message.split())

  num_media_messages = df[df['Message'] == '<Media omitted>\n'].shape[0]
  return num_messages, len(words), num_media_messages, diff_days

In [75]:
fetch_stats('Overall', df)

(150, 732, 0, 14)

# Removed/Left User

In [76]:
def seperate(x, seperator):
    if seperator == 'removed':
        if x.find(seperator) != -1:
            return x.split(seperator)[1]
        else:
            return x.split('left')[0]
    else:
        if x.find(seperator) != -1:
            return x.split(seperator)[1]
        else:
            return x.split('joined')[0]

In [77]:
seperate('Dhakad Svvv removed +91 94240 96950', 'removed')

' +91 94240 96950'

In [78]:
seperate('You left', 'removed')

'You '

In [79]:
seperate('Dhakad Svvv added +91 98937 63178', 'added')

' +91 98937 63178'

In [80]:
seperate('Hrishikesh Svvv joined using this group', 'added')

'Hrishikesh Svvv '

In [81]:
def removed_left(df):
    df = df[df['User']=='group_notification']
    df = df[df['Message'].str.contains('changed|deleted|encrypted|created') == False]
    removedLeft = df[df['Message'].str.contains('removed|left') ==  True]
    addedJoined = df[df['Message'].str.contains('added|joined') == True]
    removedLeft['Status'] = 0
    addedJoined['Status'] = 1
    removedLeft['User'] = removedLeft.Message.apply(lambda x: seperate(x, 'removed'))
    addedJoined['User'] = addedJoined.Message.apply(lambda x: seperate(x, 'added'))
    new_df = pd.concat([removedLeft, addedJoined])
    new_df.drop(columns=['Date','Time', 'Message', 'Year', 'Month', 'MonthNum', 'DayOfWeek', 'WeekNum', 'Day', 'Hour', 'DayName', 'Minute'], inplace=True, axis=1)
    new_df['User'] = new_df['User'].str.lower()
    new_df['User'] = new_df['User'].str.split(', ')
    new_df = new_df.explode('User')
    new_df['User'] = new_df['User'].str.split(' and ')
    new_df = new_df.explode('User')
    new_df['User'] = new_df['User'].str.strip()
    new_df = new_df.groupby('User')['Status'].count().reset_index()
    new_df['Status'] = new_df.Status.apply(lambda x: x%2 == 0)
    return len(new_df[new_df.Status==True])

In [82]:
removed_left(df)

0

# Chatting From

In [83]:
def chat_from(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]
    unique_years = df['Year'].unique()
    start_year = unique_years[0]
    msg_count = df.groupby(['Date']).count()['Message']
    avg_msg = round(msg_count.mean(),2)
    return start_year, avg_msg

In [84]:
chat_from('Overall', df)

(2025, 10.0)

# Most Talkative

In [85]:
def most_talkative(df):
    df = df[df['User'] != 'group_notification']
    user = df['User'].value_counts()
    username = user.index[0]
    avg_msg = round(user[username]/len(df)*100, 2)
    return username, avg_msg

In [86]:
most_talkative(df)

('Kim Mason', 5.33)

# Influencer

In [87]:
def influencer(df):
    df = df[df['User'] != 'group_notification']
    new_df = df.groupby(['User'])
    inf_dict = {}
    for name, group in new_df:
        count = 0
        for i in group['Message']:
            if '<Media omitted>' in i:
                count += 1
        inf_dict[name]=count

    if not inf_dict:
        return None, 0.0, 0
        
    name = max(inf_dict, key=inf_dict.get)
    name_df = df[df['User']==name]

    total_msgs = name_df.shape[0]
    if total_msgs == 0:
        return name, 0.0, inf_dict[name]

    percent = (inf_dict[name]/name_df.shape[0])*100
    return name,round(percent,2), inf_dict[name]

In [88]:
influencer(df)

(('Jessica Gonzalez',), 0.0, 2)

# Long Winded

In [89]:
def long_winded(df):
    df = df[df['User'] != 'group_notification']
    msg_len = []
    for i in df['Message']:
        msg_len.append(len(i))
    df['Message_len'] = msg_len
    new_df = df.sort_values(by=['Message_len'],ascending=False)
    name = new_df.iloc[0]['User']
    user_df = df[df['User']==name]
    avg_msg_len = int(user_df['Message_len'].mean())
    mean_character = user_df[user_df['Message_len'] > avg_msg_len]
    percentage = round((mean_character.shape[0] / user_df.shape[0])*100, 2)
    return name, avg_msg_len, percentage

In [90]:
long_winded(df)

('Tim Lee', 28, 62.5)

# Professor

In [91]:
df.sample(15)

Unnamed: 0,Date,Time,User,Message,Year,Month,MonthNum,DayOfWeek,WeekNum,Day,DayName,Hour,Minute
52,2025-04-01,03:01,Mr. Jesse Bryan,"Here’s the update on the event:"" """,2025,April,4,1,14,1,Tuesday,3,1
57,2025-04-01,00:32,Donna Dominguez,"<Media omitted>"" """,2025,April,4,1,14,1,Tuesday,0,32
60,2025-04-02,18:38,Kim Mason,"Will join the meeting at 5 PM."" """,2025,April,4,2,14,2,Wednesday,18,38
94,2025-04-05,10:08,Jeanne Short,"🙏 जय श्री राम 🙏"" """,2025,April,4,5,14,5,Saturday,10,8
139,2025-04-09,10:51,Kelly Fox,"Will join the meeting at 5 PM."" """,2025,April,4,2,15,9,Wednesday,10,51
51,2025-04-01,13:46,Shannon Jones,"Let's meet at the temple tomorrow."" """,2025,April,4,1,14,1,Tuesday,13,46
58,2025-04-01,18:53,Sara Wallace,"🙏 जय श्री राम 🙏"" """,2025,April,4,1,14,1,Tuesday,18,53
33,2025-03-30,22:56,Casey Mills,"Thank you 🙏"" """,2025,March,3,6,13,30,Sunday,22,56
43,2025-03-31,16:44,Joshua Fisher,"Okay noted."" """,2025,March,3,0,14,31,Monday,16,44
83,2025-04-04,05:25,Joshua Fisher,"Can you forward the message again?"" """,2025,April,4,4,14,4,Friday,5,25


# Emoji Lover

In [92]:
def emojized_user(df):
    df=df[df['User']!='group_notification']
    emoji_user = {}
    new_df = df.groupby(['User'])
    for i in df['User'].unique():
        count = 0
        group = new_df.get_group(i)
        for j in group['Message']:
            for k in j:
                if emoji.is_emoji(k):
                    count += 1
        emoji_user[i]=count   
    name = max(emoji_user, key=emoji_user.get)
    total = emoji_user.values()
    percent = round((emoji_user[name]/sum(total))*100, 2)
    return name, percent

In [93]:
emojized_user(df)

  group = new_df.get_group(i)
  group = new_df.get_group(i)


('Matthew Oneill', 10.17)

# Early Bird

In [94]:
def early_bird(df, format):
    new_df = df[df['User']!='group_notification']
    if format == '12 Hour':
        new_df = new_df[((new_df['Meridian'] == 'AM') & (pd.to_numeric(new_df['Hour']) > 7)) | (new_df['Meridian'] == 'PM')]
    elif format == '24 Hour':
        new_df = df[pd.to_numeric(df['Hour']) > 7]
    user = new_df['User'].value_counts()
    username = user.index[0]
    avg_msg = round(user[username]/len(new_df)*100, 2)
    return username, avg_msg

In [95]:
# early_bird(df, '12 Hour')

In [96]:
early_bird(df, '24 Hour')

('Nancy Brown', 6.93)

# Night Owl

In [97]:
def night_owl(df, format):
    new_df = df[df['User']!='group_notification']
    if format == '12 Hour':
        new_df = new_df[((new_df['Meridian'] == 'AM') & (pd.to_numeric(new_df['Hour']) < 7)) | (new_df['Meridian'] == 'PM')]
    elif format == '24 Hour':
        new_df = new_df[(pd.to_numeric(new_df['Hour']) < 6) | (pd.to_numeric(new_df['Hour']) > 11)]
    user = new_df['User'].value_counts()
    username = user.index[0]
    avg_msg = round(user[username]/len(new_df)*100, 2)
    return username, avg_msg

In [98]:
# night_owl(df, '12 Hour')

In [99]:
night_owl(df, '24 Hour')

('Jessica Williams', 6.78)

# Most Shared Links

In [100]:
def get_urls(df, selected_user):
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]
    df = df[df['User'] != 'group_notification']
    urls_list = []
    url = []
    for i in df['Message']:
        o = re.findall('(https://.*)|(http://.*)',i)
        if len(o) != 0:
            urls_list.append(o[0][0].split(' ')[0])
    if len(urls_list) != 0:
        for i in urls_list:
            o = re.search('.*[.](com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)',i)
            if o != None:
                url.append(o.group())
        url_df = pd.DataFrame(Counter(url).most_common(len(Counter(url))), columns=['Urls', 'Count'])
        return url_df
    else:
        return pd.DataFrame()

In [101]:
get_urls(df, 'Overall')

# Most Shared Emojis

In [102]:
def get_emojis(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]
    emojis = []
    description = []
    for message in df['Message']:
        all_emoji = emoji.distinct_emoji_list(message)
        emojis.extend([emoji.emojize(is_emoji) for is_emoji in all_emoji])
    emoji_df = pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))), columns=['Emoji', 'Count'])
    for i in emoji_df['Emoji']:
        description.append(emoji.demojize(i))
    emoji_df['Description'] = description
    emoji_df['Description'] = emoji_df['Description'].apply(lambda x: x.strip(':'))
    emoji_df['EmojiDescription'] = emoji_df['Emoji'] + ' - ' + emoji_df['Description']
    return emoji_df

In [103]:
get_emojis('Overall', df)

Unnamed: 0,Emoji,Count,Description,EmojiDescription
0,🙏,20,folded_hands,🙏 - folded_hands
1,🎂,13,birthday_cake,🎂 - birthday_cake
2,🌸,8,cherry_blossom,🌸 - cherry_blossom
3,🎉,7,party_popper,🎉 - party_popper


# Hourly Timeline

In [104]:
def hourly_timeline(selected_user, df, format):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  
  new_df['Message'] = [1] * new_df.shape[0]
  if format == '12 Hour':
    new_df['Hour'] = new_df['Hour'].apply(lambda x: ('0'+str(x)) if (len(x)<2) else x )
    new_df['Hour'] = new_df['Meridian'].astype(str) + ' ' + new_df['Hour'].astype(str) 
  new_df = new_df.groupby('Hour')['Message'].sum().reset_index()
  return new_df

In [105]:
hourly_timeline('Overall', df, '24 Hour')

Unnamed: 0,Hour,Message
0,0,7
1,1,6
2,2,6
3,3,3
4,4,8
5,5,6
6,6,5
7,7,8
8,8,3
9,9,4


# Daily Timeline

In [106]:
def daily_timeline(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  new_df = new_df.groupby('Date')['Message'].count().reset_index()
  return new_df

In [107]:
daily_timeline('Overall', df)

Unnamed: 0,Date,Message
0,2025-03-27,10
1,2025-03-28,10
2,2025-03-29,10
3,2025-03-30,10
4,2025-03-31,10
5,2025-04-01,10
6,2025-04-02,10
7,2025-04-03,10
8,2025-04-04,10
9,2025-04-05,10


# Weekly Timeline

In [108]:
def weekly_timeline(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  new_df  = df.groupby(['Year','Month','WeekNum'], sort=False)['Message'].count().reset_index()
  new_df['Week'] = new_df.apply(
        lambda row: f"Week 0{row['WeekNum']} - {row['Month']} - {row['Year']}"
        if row['WeekNum'] < 10 else f"Week {row['WeekNum']} - {row['Month']} - {row['Year']}",
        axis=1
    )
  new_df.sort_values(['WeekNum', 'Month', 'Year'], inplace=True)
  return new_df

In [109]:
weekly_timeline('Overall', df)

Unnamed: 0,Year,Month,WeekNum,Message,Week
0,2025,March,13,40,Week 13 - March - 2025
2,2025,April,14,60,Week 14 - April - 2025
1,2025,March,14,10,Week 14 - March - 2025
3,2025,April,15,40,Week 15 - April - 2025


In [110]:
weekly_timeline('+91 6239 413 783', df)

Unnamed: 0,Year,Month,WeekNum,Message,Week


# Monthly Timeline

In [111]:
def monthly_timeline(selected_user, df):
    if selected_user != 'Overall':
      df = df[df['User'] == selected_user]
    new_df = df[df['User'] != 'group_notification']
    new_df  = df.groupby(['Year','Month','MonthNum'], sort=False)['Message'].count().reset_index()
    month = []
    for i in range(new_df.shape[0]):
        month.append(str(new_df['MonthNum'][i]) + " - " + new_df['Month'][i] + " - " + str(new_df['Year'][i]))
    new_df['Months'] = month
    new_df['Months'] = new_df[['MonthNum', 'Months']].apply(lambda x: "Month 0"+x['Months'] if x['MonthNum']<10 else "Month "+x['Months'], axis=1)
    new_df.sort_values(['MonthNum', 'Year'], inplace=True)
    return new_df

In [112]:
monthly_timeline('Overall', df)

Unnamed: 0,Year,Month,MonthNum,Message,Months
0,2025,March,3,50,Month 03 - March - 2025
1,2025,April,4,100,Month 04 - April - 2025


# Most Busy Day

In [113]:
def most_busy_day(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  
  new_df = new_df.groupby(['DayName', 'DayOfWeek'], sort=False)['Message'].count().reset_index()
  new_df.sort_values('DayOfWeek', inplace=True)
  new_df['Days'] = new_df['DayOfWeek'].astype(str) + " - " + new_df['DayName']
  return new_df

In [114]:
most_busy_day('Overall', df)

Unnamed: 0,DayName,DayOfWeek,Message,Days
4,Monday,0,20,0 - Monday
5,Tuesday,1,20,1 - Tuesday
6,Wednesday,2,20,2 - Wednesday
0,Thursday,3,30,3 - Thursday
1,Friday,4,20,4 - Friday
2,Saturday,5,20,5 - Saturday
3,Sunday,6,20,6 - Sunday


# Most Busy Month

In [115]:
def most_busy_month(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  
  new_df = new_df.groupby(['MonthNum', 'Month'], sort=False)['Message'].count().reset_index()
  new_df.sort_values('MonthNum', inplace=True)
  new_df['Months'] = new_df['MonthNum'].astype(str) + " - " + new_df['Month']
  new_df['Months'] = new_df[['MonthNum', 'Months']].apply(lambda x: "Month 0"+x['Months'] if x['MonthNum']<10 else "Month "+x['Months'], axis=1)
  return new_df

In [116]:
most_busy_month('Overall', df)
# temp['Months']

Unnamed: 0,MonthNum,Month,Message,Months
0,3,March,50,Month 03 - March
1,4,April,100,Month 04 - April


# User Chat Percentage

In [117]:
def user_chat_percentage(df):
    df = df[df['User'] != 'group_notification']
    user = df['User'].value_counts()
    new_df = pd.DataFrame ({ 'User': user.index, 'Message': user})
    new_df['Percentage'] = new_df['Message'].apply(lambda x: round(x/len(df)*100, 2))
    new_df['User'] = new_df['Percentage'].astype(str) + "% - " + new_df['User']
    new_df.drop('Message', axis=1, inplace=True)
    new_df.reset_index(drop=True,inplace=True)
    return new_df

In [118]:
user_chat_percentage(df)

Unnamed: 0,User,Percentage
0,5.33% - Kim Mason,5.33
1,5.33% - Jessica Gonzalez,5.33
2,5.33% - Joshua Fisher,5.33
3,5.33% - Nancy Brown,5.33
4,5.33% - Stacie Perez,5.33
5,5.33% - Meghan Bowman,5.33
6,5.33% - Mary Wright,5.33
7,5.33% - Matthew Oneill,5.33
8,5.33% - Tim Lee,5.33
9,5.33% - Jessica Williams,5.33


# Most Common Word

In [119]:
def most_common_words(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  new_df = new_df[~new_df['Message'].str.contains('<Media omitted>')]
  f = open('stop_hinglish.txt')
  stop_words = f.read()

  words = []

  for message in new_df['Message']:
    message = re.sub('[^A-Za-z\s]','',message)
    if message == '':
        pass
    else:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)
  
  most_common_df = pd.DataFrame(Counter(words).most_common(20), columns=['Message', 'Count'])
  most_common_df.sort_values('Count')
  return most_common_df

In [120]:
most_common_words('Overall', df)

Unnamed: 0,Message,Count
0,great,13
1,thoughts,13
2,shared,13
3,today,13
4,happy,13
5,birthday,13
6,lovely,13
7,memories,13
8,trip,13
9,noted,13


# Word Cloud

In [121]:
def create_wordcloud(selected_user, df):
  if selected_user != 'Overall':
    df = df[df['User'] == selected_user]
  new_df = df[df['User'] != 'group_notification']
  new_df = new_df[new_df['Message'] != '<Media omitted>\n']
  
  f = open('stop_hinglish.txt')
  stop_words = f.read()

  def remove_stop_words(message):
    y = []
    for word in message.lower().split():
      if word not in stop_words:
        y.append(word)
    return " ".join(y)

  wc = WordCloud(width=500, height=500, min_font_size=10)
  new_df['Message'] = new_df['Message'].apply(remove_stop_words)
  df_wc = wc.generate(new_df['Message'].str.cat(sep=" "))
  return df_wc

In [122]:
create_wordcloud('Overall', df)

<wordcloud.wordcloud.WordCloud at 0x29c9fd85b40>

In [123]:
pd.to_datetime('25-04-2022')

  pd.to_datetime('25-04-2022')


Timestamp('2022-04-25 00:00:00')

# Sentimental Analysis

# Generating Words.xlsx

In [124]:
def remove_emojis(data):
    return emoji.demojize(data)

In [125]:
new_df = df[df['User']!='group_notification']
new_df = new_df[new_df['Message']!='<Media omitted> ']
new_df = new_df[new_df['Message']!='This message was deleted'] 
# new_df['Message'] = new_df['Message'].apply(remove_emojis)

In [126]:
new_df

Unnamed: 0,Date,Time,User,Message,Year,Month,MonthNum,DayOfWeek,WeekNum,Day,DayName,Hour,Minute
0,2025-03-27,11:00,Kim Mason,"Great thoughts shared today!"" """,2025,March,3,3,13,27,Thursday,11,00
1,2025-03-27,22:15,Jessica Williams,"Congratulations on your achievement! 🎉"" """,2025,March,3,3,13,27,Thursday,22,15
2,2025-03-27,02:32,Jessica Gonzalez,"That’s wonderful news!"" """,2025,March,3,3,13,27,Thursday,02,32
3,2025-03-27,07:26,Joshua Fisher,,2025,March,3,3,13,27,Thursday,07,26
4,2025-03-27,21:47,Nancy Brown,"Here’s the update on the event:"" """,2025,March,3,3,13,27,Thursday,21,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,2025-04-10,23:20,Stacie Perez,"Let's meet at the temple tomorrow."" """,2025,April,4,3,15,10,Thursday,23,20
146,2025-04-10,04:22,Meghan Bowman,"Great thoughts shared today!"" """,2025,April,4,3,15,10,Thursday,04,22
147,2025-04-10,15:00,Mary Wright,"Lovely memories from the trip!"" """,2025,April,4,3,15,10,Thursday,15,00
148,2025-04-10,18:05,Matthew Oneill,"Happy birthday! 🎂"" """,2025,April,4,3,15,10,Thursday,18,05


In [127]:
import re
from nltk.stem import PorterStemmer
def text_transformation(words_list):
  corpus = []
  stemmer = PorterStemmer()
  for item in words_list:
    new_item = item.lower()
    new_item = re.sub(r'[^a-z\s]', ' ', new_item)
    if 'http://' in new_item or 'https://' in new_item:
      continue
    words = new_item.split()
    for word in words:
      word = stemmer.stem(word)
      if len(word) > 1:
        corpus.append(word)
  return list(set(corpus))

In [128]:
words = text_transformation(new_df['Message'])
words.sort()
words

['achiev',
 'again',
 'at',
 'birthday',
 'can',
 'congratul',
 'event',
 'forward',
 'from',
 'great',
 'happi',
 'here',
 'join',
 'let',
 'love',
 'media',
 'meet',
 'memori',
 'messag',
 'news',
 'note',
 'okay',
 'omit',
 'on',
 'pm',
 'share',
 'templ',
 'thank',
 'that',
 'the',
 'thought',
 'today',
 'tomorrow',
 'trip',
 'updat',
 'will',
 'wonder',
 'you',
 'your']

In [129]:
list1 = text_transformation(new_df['Message'])

In [130]:
list2 = text_transformation(new_df['Message'])

In [131]:
list3 = text_transformation(new_df['Message'])

In [132]:
list4 = text_transformation(new_df['Message'])

In [133]:
words = list(set(list1 + list2 + list3 + list4))
words.sort()

In [134]:
words = pd.DataFrame(words, columns=['Word'])
words['Label'] = 1
words.to_excel("Words.xlsx", index=False)

In [135]:
words_df = pd.read_excel('Words.xlsx')
words_df.sample(5)

Unnamed: 0,Word,Label
15,media,1
35,will,1
24,pm,1
25,share,1
2,at,1


In [136]:
words_df.isnull().sum()

Word     0
Label    0
dtype: int64

In [137]:
words_df.dropna(inplace=True)

In [138]:
X = words_df['Word']
y = words_df['Label']

In [139]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [140]:
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [141]:
corpus = []
# for word in X_train:
for word in X:
    corpus.append(str(word))
corpus

['achiev',
 'again',
 'at',
 'birthday',
 'can',
 'congratul',
 'event',
 'forward',
 'from',
 'great',
 'happi',
 'here',
 'join',
 'let',
 'love',
 'media',
 'meet',
 'memori',
 'messag',
 'news',
 'note',
 'okay',
 'omit',
 'on',
 'pm',
 'share',
 'templ',
 'thank',
 'that',
 'the',
 'thought',
 'today',
 'tomorrow',
 'trip',
 'updat',
 'will',
 'wonder',
 'you',
 'your']

In [142]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
X = cv.fit_transform(corpus)
# y = y_train

In [143]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [144]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report

In [145]:
rfc = RandomForestClassifier()
rfc.fit(X,y)

In [146]:
# test_corpus = []
# for word in X_test:
#     test_corpus.append(str(word))
# test_corpus

In [147]:
# X_test = cv.transform(X_test)

In [148]:
# predictions = rfc.predict(X_test)

In [149]:
# accuracy_score(y_test, predictions)

In [150]:
prediction = rfc.predict(cv.transform(['kala', 'teri', 'bc'])).tolist()
total = len(prediction)
negative = total - prediction.count(1.0)
positive = total - negative
negative_per = round(negative / total * 100, 2)
positive_per = round(positive / total * 100, 2)
total, negative, positive, negative_per, positive_per

(3, 0, 3, 0.0, 100.0)

In [151]:
words_df[words_df['Word'] == 'bc']

Unnamed: 0,Word,Label


In [152]:
words_df[words_df['Word'] == 'teri']

Unnamed: 0,Word,Label


In [153]:
words_df[words_df['Word'] == 'kala']

Unnamed: 0,Word,Label


In [154]:
import pickle
with open('semtimental_analysis_model.pickle', 'wb') as f:
    pickle.dump(rfc, f)

In [155]:
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(cv, f)

In [156]:
model = pickle.load(open("semtimental_analysis_model.pickle", "rb"))
vectorizer = pickle.load(open("vectorizer.pkl", "rb"))

In [157]:
new_df = df[df['Message']!='<Media omitted>\n']
new_df = new_df[new_df['Message']!='This message was deleted']
new_df['Message'] = new_df['Message'].apply(remove_emojis)
words = text_transformation(new_df['Message'])

In [158]:
words

['hand',
 'wonder',
 'on',
 'meet',
 'today',
 'join',
 'updat',
 'cherri',
 'achiev',
 'here',
 'the',
 'happi',
 'fold',
 'tomorrow',
 'you',
 'will',
 'again',
 'your',
 'at',
 'thank',
 'templ',
 'cake',
 'news',
 'birthday',
 'media',
 'great',
 'parti',
 'trip',
 'that',
 'blossom',
 'memori',
 'thought',
 'love',
 'note',
 'popper',
 'omit',
 'messag',
 'event',
 'let',
 'share',
 'from',
 'congratul',
 'pm',
 'forward',
 'okay',
 'can']

In [159]:
prediction = model.predict(vectorizer.transform(words)).tolist()

In [160]:
def sentimental_analysis(df):
    new_df = df[df['Message']!='<Media omitted>\n']
    new_df = new_df[new_df['Message']!='This message was deleted']
    new_df['Message'] = new_df['Message'].apply(remove_emojis)
    words = text_transformation(new_df['Message'])
    prediction = model.predict(vectorizer.transform(words)).tolist()
    total = len(prediction)
    negative = total - prediction.count(1.0)
    positive = total - negative
    negative_per = round(negative / total * 100, 2)
    positive_per = round(positive / total * 100, 2)
    return negative_per, positive_per

In [161]:
sentimental_analysis(df)

(0.0, 100.0)