In [None]:
import pandas as pd
import numpy as np
import sqlite3
import emoji

In [None]:
# Load every row of the `message` table into a DataFrame.
# The connection is closed explicitly once the read finishes: sqlite3's own
# `with connection:` form only manages transactions and would leave the
# database file handle open for the lifetime of the kernel.
conn = sqlite3.connect("chat_denormalized.db")
try:
    msg_df = pd.read_sql("select * FROM message", conn)
finally:
    conn.close()
# Work on a `text` column that starts out as a copy of `attributedText`.
msg_df['text'] = msg_df['attributedText']
msg_df.head()

In [None]:
# Replace null text with an empty string.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form acts on an intermediate object, emits a
# FutureWarning on pandas 2.2+ and leaves msg_df untouched under Copy-on-Write.
msg_df['text'] = msg_df['text'].fillna('')

In [None]:
# Count the messages sent by each Sender (non-null `text` values), most active first.
# Selecting `text` before aggregating avoids counting every other column only
# to throw the result away.
msg_df.groupby('Sender')['text'].count().sort_values(ascending=False)


In [None]:
# Labels for the associated_message_type codes that have a known meaning.
message_types = dict([
    (0, 'Text'),
    (2, 'Sticker?'),
    (3, 'iMessage App'),
    (1000, 'Photo'),
])


def _message_type_label(code):
    """Return the MessageType label for one associated_message_type code.

    Codes in 2000-2005 or 3000-3005 fall back to 'Reaction'; every other code
    without an entry in message_types falls back to 'Unknown'.
    """
    in_reaction_range = 2000 <= code <= 2005 or 3000 <= code <= 3005
    fallback = 'Reaction' if in_reaction_range else 'Unknown'
    return message_types.get(code, fallback)


# Derive the MessageType column from associated_message_type.
msg_df['MessageType'] = msg_df['associated_message_type'].apply(_message_type_label)

In [None]:
# Number of non-null `text` values for each MessageType.
msg_df['text'].groupby(msg_df['MessageType']).count()

In [None]:
# Map of reaction code to its display name: 2000-2005 are the reactions
# themselves, 3000-3005 are the corresponding "removed" events.
reaction_names = dict(zip(
    range(2000, 2006),
    ['Loved', 'Liked', 'Disliked', 'Laughed at', 'Emphasized', 'Questioned'],
))
reaction_names.update(zip(
    range(3000, 3006),
    [
        'Removed a heart',
        'Removed a like',
        'Removed a dislike',
        'Removed a laugh',
        'Removed an emphasis',
        'Removed a question',
    ],
))
# Add a ReactionType column by looking each associated_message_type up in
# reaction_names; codes without an entry become NaN.
msg_df['ReactionType'] = msg_df['associated_message_type'].map(reaction_names)

In [None]:
# How many times each reaction was used (non-null `text` values per ReactionType).
msg_df['text'].groupby(msg_df['ReactionType']).count()

In [None]:
# Rank Senders by how many reaction rows they sent, highest first.
# Only rows that carry a ReactionType are kept, so Senders without any
# reaction do not appear in the result.
(
    msg_df.loc[msg_df['ReactionType'].notna()]
    .groupby('Sender')['ReactionType']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# For every reaction code, print its name followed by the per-Sender count of
# rows with that code (highest first), then a blank separator line.
for code, label in reaction_names.items():
    print(label)
    rows_for_code = msg_df[msg_df['ReactionType'].notnull() & (msg_df['associated_message_type'] == code)]
    print(rows_for_code.groupby('Sender')['text'].count().sort_values(ascending=False))
    print()

In [None]:
# Across all messages, rank Senders by total words sent (top 20).
# A message's word count is its number of whitespace-separated tokens.
msg_df['word_count'] = msg_df['text'].map(lambda message: len(message.split()))
(
    msg_df.groupby('Sender')['word_count']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)


In [None]:
# Across all messages, rank Senders by total emojis sent (top 20).
# emoji.emoji_count counts whole emoji. Testing each code point with
# emoji.is_emoji miscounts emoji made of several code points: flags (two
# regional indicators), skin-tone variants and ZWJ sequences.
msg_df['emoji_count'] = msg_df['text'].apply(emoji.emoji_count)
msg_df.groupby('Sender')['emoji_count'].sum().sort_values(ascending=False).head(20)


In [None]:
# Get all group chat messages: rows whose display_name is a non-empty string.
# Null display_name values are treated as empty; a bare `!= ''` comparison is
# True for nulls and would let those rows through.
group_chat_messages = msg_df[msg_df['display_name'].fillna('') != '']
# Top 15 most active group chats by (display_name, group_id), counting
# non-null `text` values. Only `text` is aggregated rather than every column.
group_chat_messages.groupby(['display_name', 'group_id'])['text'].count().sort_values(ascending=False).head(15)

In [None]:
# Get all messages for the "up!" group chat, selected by group_id.
# NOTE(review): an earlier note in this notebook gave the id as
# F10BB39C-461D-4E72-8D4C-8718998118CF, which is not the id filtered on below.
# The filter is left unchanged; confirm it points at the intended chat.
group_up = group_chat_messages[group_chat_messages['group_id'] == 'D82BE2BE-CCCB-43B9-9DCB-49A05EC1D495']
# Most active members of the group: non-null `text` values per Sender, highest
# first. Only `text` is aggregated rather than every column.
group_up.groupby('Sender')['text'].count().sort_values(ascending=False)


In [None]:
# Within the "up" group, rank Senders by number of reaction rows sent.
(
    group_up.loc[group_up['ReactionType'].notna()]
    .groupby('Sender')['ReactionType']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# For group "up", count reaction rows per (text, Sender) pair, highest first.
# NOTE(review): the grouping uses the reaction row's own `text` and `Sender`,
# so this ranks identical reaction rows per sender; whether that equals "the
# most reacted message" depends on what `text` holds for reactions - confirm.
# Only ReactionType is aggregated rather than every column.
group_up[group_up['ReactionType'].notnull()].groupby(['text', 'Sender'])['ReactionType'].count().sort_values(ascending=False)

In [None]:
# Within the up! group, total word_count per Sender, largest first.
group_up['word_count'].groupby(group_up['Sender']).sum().sort_values(ascending=False)


In [None]:
# Within the up group, total emoji_count per Sender, largest first.
group_up['emoji_count'].groupby(group_up['Sender']).sum().sort_values(ascending=False)

In [None]:
# Find the most popular emoji in the up group, ordered by count.
group_up_emoji = group_up[group_up['emoji_count'] > 0]
# emoji.emoji_list yields each complete emoji; walking the string one code
# point at a time breaks flags, skin-tone variants and ZWJ sequences apart.
group_up_emoji['text'].apply(lambda x: [match['emoji'] for match in emoji.emoji_list(x)]).explode().value_counts()


In [None]:
# For the 20 Senders with the most messages, show each one's most-used emoji
# (as its demojized name) across all chats.
# Only `text` is counted when ranking Senders rather than every column.
top_20_senders = msg_df.groupby('Sender')['text'].count().sort_values(ascending=False).head(20).index
top_20_senders_emoji = msg_df[msg_df['Sender'].isin(top_20_senders)]
top_20_senders_emoji = top_20_senders_emoji[top_20_senders_emoji['emoji_count'] > 0]
# emoji.emoji_list yields complete emoji, so multi-code-point emoji (flags,
# skin tones, ZWJ sequences) are named as one unit instead of per code point.
# value_counts orders each Sender's emoji by frequency; head(1) keeps the top one.
top_20_senders_emoji['text'].apply(lambda x: [emoji.demojize(match['emoji']) for match in emoji.emoji_list(x)]).explode().groupby(top_20_senders_emoji['Sender']).value_counts().groupby('Sender').head(1)


In [None]:
# Top 5 emojis used by a single Sender across all chats, ordered by count.
# NOTE(review): the variable names and the original description refer to
# Stephen Jayakar, but the filter selects 'Andy Chang'. The filter is left
# unchanged; confirm which Sender is intended.
stephen_emoji = msg_df[msg_df['Sender'] == 'Andy Chang']
stephen_emoji = stephen_emoji[stephen_emoji['emoji_count'] > 0]
# emoji.emoji_list yields complete emoji; per-code-point iteration splits
# flags, skin-tone variants and ZWJ sequences into pieces.
stephen_emoji['text'].apply(lambda x: [match['emoji'] for match in emoji.emoji_list(x)]).explode().value_counts().head(5)


In [None]:
# Find the top 20 emojis used across all chats, ordered by count.
all_emoji = msg_df[msg_df['emoji_count'] > 0]
# emoji.emoji_list yields complete emoji; per-code-point iteration splits
# flags, skin-tone variants and ZWJ sequences into pieces.
all_emoji['text'].apply(lambda x: [match['emoji'] for match in emoji.emoji_list(x)]).explode().value_counts().head(20)


In [None]:
# TODO: Probably need to fix emoji encoding for some text

In [None]:
# Top 20 most used whitespace-separated tokens across all chats, by count.
all_words = msg_df.loc[msg_df['word_count'].gt(0)]
(
    all_words['text']
    .map(lambda message: message.split())
    .explode()
    .value_counts()
    .head(20)
)


In [None]:
# Top 20 most used words across all chats, ignoring reaction messages and
# English stop words (matched case-insensitively), ordered by count.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Keep messages that contain at least one word and are not reactions.
all_words = msg_df[(msg_df['word_count'] > 0) & (msg_df['MessageType'] != 'Reaction')]
(
    all_words['text']
    .apply(lambda message: [token for token in message.split() if token.lower() not in stop_words])
    .explode()
    .value_counts()
    .head(20)
)

In [None]:
# Find the top 50 profane words across all chats, ordered by count, excluding
# reaction messages.
from better_profanity import profanity
profane_words = msg_df[msg_df['word_count'] > 0]
profane_words = profane_words[profane_words['MessageType'] != 'Reaction']
# One lower-cased word per row.
lowered_words = profane_words['text'].apply(lambda x: [word.lower() for word in x.split()]).explode()
# Run the profanity check once per distinct word instead of once per
# occurrence; the check gives the same answer for the same word, so the
# counts are unchanged while the number of calls drops to the vocabulary size.
flagged_words = {word for word in lowered_words.dropna().unique() if profanity.contains_profanity(word)}
lowered_words[lowered_words.isin(flagged_words)].value_counts().head(50)


In [None]:
# Top 50 words from profanity_list across all chats, excluding reaction messages.
profane_words = msg_df[msg_df['word_count'] > 0]
profane_words = profane_words[profane_words['MessageType'] != 'Reaction']
# profanity_list is not assigned in any cell visible in this notebook, so on a
# fresh kernel this cell raised NameError. A list already present in the
# kernel is kept as-is; otherwise a fallback is derived from the chat
# vocabulary with better_profanity.
try:
    profanity_list
except NameError:
    from better_profanity import profanity
    # NOTE(review): fallback only - replace with the curated list if one exists.
    distinct_words = {word.lower() for text in profane_words['text'] for word in text.split()}
    profanity_list = {word for word in distinct_words if profanity.contains_profanity(word)}
profane_words['text'].apply(lambda x: [word.lower() for word in x.split() if word.lower() in profanity_list]).explode().value_counts().head(50)

In [None]:
# Within the up group, total number of profanity_list words used per Sender
# (reaction messages excluded), largest first.
profane_words_up = group_up[(group_up['word_count'] > 0) & (group_up['MessageType'] != 'Reaction')]
(
    profane_words_up['text']
    .apply(lambda message: [token.lower() for token in message.split() if token.lower() in profanity_list])
    .explode()
    .groupby(profane_words_up['Sender'])
    .value_counts()
    .groupby('Sender')
    .sum()
    .sort_values(ascending=False)
)


In [None]:
# Within the up group, show for each Sender the profanity_list word they use
# most often and how many times (reaction messages excluded).
profane_words_up = group_up[(group_up['word_count'] > 0) & (group_up['MessageType'] != 'Reaction')]
(
    profane_words_up['text']
    .apply(lambda message: [token.lower() for token in message.split() if token.lower() in profanity_list])
    .explode()
    .groupby(profane_words_up['Sender'])
    .value_counts()
    .groupby('Sender')
    .head(1)
)


In [None]:
# TODO export to file