In [None]:
import json
import os
from datetime import date, datetime, timedelta
from os import walk

import emoji
import numpy as np
import pandas as pd
import regex
from matplotlib import dates as mdates
from matplotlib import pyplot as plt

In [None]:
# %matplotlib tk
# Default size (width, height) for every figure created in this notebook.
plt.rcParams['figure.figsize'] = [17, 5]

# Directory that the message_<n>.json export files are read from.
chatdir = ''

In [None]:
def daterange(start_date, end_date):
    """Yield every day from start_date up to, but not including, end_date.

    Nothing is yielded when end_date is not later than start_date.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(offset)
        offset += 1
        
        
def extract_symbols(string):
    """Split a string into its symbols (extended grapheme clusters).

    Comparable to list(string), except that a grapheme made up of several
    characters is returned as one element instead of being torn apart.
    """
    grapheme_pattern = regex.compile(r'\X')
    return grapheme_pattern.findall(string)


def plot(x, y, plot_type='line'):
    """Draw y over x on the current axes as a line chart or a bar chart.

    plot_type must be 'line' or 'bar'; any other value only prints a
    message and draws nothing.
    """
    if plot_type not in ('line', 'bar'):
        print('Invalid plot_type')
        return

    if plot_type == 'bar':
        # Bars only get horizontal grid lines.
        plt.grid(axis='y', zorder=0)
        plt.bar(x, y, zorder=3)
    else:
        plt.grid(zorder=0)
        plt.plot(x, y, zorder=3)


def plot_dict_items(items, plot_type='line', xlim_left=None, xlim_right=None):
    """Plot an iterable of (x, y) pairs, e.g. the items of a dict.

    items      -- iterable of 2-tuples; the first elements become x, the second y
    plot_type  -- 'line' or 'bar', forwarded to plot()
    xlim_left, xlim_right -- optional x-axis limits; applied when at least
                             one of them is not None

    When items is empty a message is printed and nothing is drawn.
    """
    pairs = list(items)
    if not pairs:
        # zip(*[]) produces nothing, so the x/y unpacking below would raise
        # ValueError; there is nothing to draw anyway.
        print('No items to plot')
        return

    if (xlim_left is not None) or (xlim_right is not None):
        plt.xlim(xlim_left, xlim_right)

    x, y = zip(*pairs)
    plot(x, y, plot_type)
    
    
def plot_pie(values, legend=None):
    """Draw a pie chart of values, labelling each wedge with its percentage
    and, in parentheses, the absolute amount it stands for.

    legend -- optional labels passed to plt.legend().
    """
    total = sum(values)

    def wedge_label(pct):
        # pct is the wedge's share in percent; scale it back to the absolute value.
        return '{:.2f}%\n({:.0f})'.format(pct, pct * total / 100)

    plt.pie(values, autopct=wedge_label)
    if legend is None:
        return
    plt.legend(legend)

In [None]:
# Only the first entry produced by walk() is used: the names of the files
# located directly inside chatdir.
(_, _, filenames) = next(walk(chatdir))
filenames

In [None]:
# Load every message_<n>.json file from chatdir into one DataFrame, one row per message.
msg_columns = ('Time', 'Sender', 'Content', 'Type', 'Sticker', 'Photos', 'Videos', 'Audio', 'Share')
extracted_data = []

# Only the numbered message_<n>.json files are chat data; any other file that
# happens to sit in chatdir must not inflate the number of files to read.
message_numbers = sorted(
    int(match.group(1))
    for match in (regex.fullmatch(r'message_(\d+)\.json', name) for name in filenames)
    if match
)
if not message_numbers:
    raise FileNotFoundError(f'No message_<n>.json files found in {chatdir!r}')

for i in message_numbers:
    # os.path.join works whether or not chatdir ends with a path separator.
    message_path = os.path.join(chatdir, 'message_' + str(i) + '.json')
    with open(message_path, 'r', encoding='raw_unicode_escape') as f:
        # Reading with raw_unicode_escape turns the \uXXXX escapes of the file
        # into characters; re-encoding with the same codec and decoding the
        # bytes as UTF-8 then re-reads them as UTF-8 text.
        # NOTE(review): presumably this undoes mis-encoded text in the export - confirm.
        raw_data = json.loads(f.read().encode('raw_unicode_escape').decode())

    for msg in raw_data['messages']:
        # timestamp_ms is in milliseconds; fromtimestamp() expects seconds.
        timestamp = datetime.fromtimestamp(msg.get('timestamp_ms', 0) // 1000)
        sender = msg.get('sender_name', None)
        msg_content = msg.get('content', None)
        msg_type = msg.get('type', None)
        sticker = msg.get('sticker', None)
        photos = msg.get('photos', None)
        videos = msg.get('videos', None)
        audio = msg.get('audio_files', None)
        share = msg.get('share', None)

        extracted_data.append((timestamp, sender, msg_content, msg_type, sticker, photos, videos, audio, share))

# The rows were collected file by file in stored order; flip the whole list.
# NOTE(review): assumes the export lists messages newest-first - confirm.
extracted_data.reverse()
data = pd.DataFrame(extracted_data, columns=msg_columns)
data

In [None]:
# for row in extracted_data:
#     print(f'<{row[0]}> {row[1]}: {row[2]}')

In [None]:
# Earliest message timestamp; its calendar date is where the zero-filled
# daily series of the plot_*_daily helpers start.
earliest_datetime = data['Time'].min()
earliest_date = earliest_datetime.date()
print(f'Earliest datetime: {earliest_datetime}')

# x-axis window for the daily plots (read as globals by the plotting helpers).
xlim_left = date(2019, 10, 28)
xlim_right = date.today()

In [None]:
# colors = ('r', 'b', 'g')
# Map every participant name to a (currently empty) dict of per-person properties.
# raw_data still holds the file parsed last by the loading cell, so the
# participant list is taken from that file.
people = {}
i = 0
for participant in raw_data['participants']:
    people[participant['name']] = {
        # can contain properties
#         'color': colors[i]
    }
    i += 1

In [None]:
# Overall message count, then a pie chart with one wedge per participant
# sized by the number of rows sent by that participant.
print(f'Total messages: {len(data)}')
msgs_per_person = [len(data[data.Sender == person]) for person in people]
plot_pie(msgs_per_person, legend=people)

In [None]:
# Total number of characters over all messages with non-empty content,
# followed by the same sum per participant shown as a pie chart.
charsum = sum(len(msg) for msg in data['Content'] if msg)
charsums = [sum(len(msg) for msg in data['Content'][data.Sender == person] if msg) for person in people]
print(f'Total characters: {charsum}')
plot_pie(charsums, legend=people)

In [None]:
def count_emojis(data):
    """Count the emoji symbols in the 'Content' column of data.

    Every message is split into graphemes; a grapheme counts as one emoji
    when at least one of its characters is listed in the emoji package's
    lookup table. Rows whose content is None are skipped.

    Returns the total number of emoji graphemes as an int.
    """
    # The emoji package has exposed its lookup table under different names:
    # EMOJI_DATA in 2.x, UNICODE_EMOJI before that. In 1.x UNICODE_EMOJI is
    # keyed by language code, so the per-character test needs the 'en' table;
    # in older releases it is already the flat emoji table.
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        emoji_table = emoji_table.get('en', emoji_table)

    emojis = 0
    for msg in data['Content']:
        if msg is None:
            continue

        # Work on graphemes so a multi-character emoji is counted once.
        for symbol in extract_symbols(msg):
            if any(char in emoji_table for char in symbol):
                emojis += 1

    return emojis

In [None]:
# Emoji total for the whole chat, then the per-participant split as a pie chart.
print(f'Total emojis: {count_emojis(data)}')
emojis_per_person = [count_emojis(data[data.Sender == person]) for person in people]
plot_pie(emojis_per_person, legend=people)

In [None]:
# Average message length and average word length over all messages that have content.
total_msg_len = 0
msg_count = 0
total_word_len = 0
word_count = 0
for msg in data['Content']:
    if msg is None:
        continue

    total_msg_len += len(msg)
    msg_count += 1

    # Raw string: '\w' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    for word in regex.findall(r'\w+', msg):
        total_word_len += len(word)
        word_count += 1

# Guard both divisions so a chat without text messages (or without any word)
# reports 0.00 instead of raising ZeroDivisionError.
print('Average message length: {:.2f} chars'.format(total_msg_len / msg_count if msg_count else 0))
print('Average word length: {:.2f} chars'.format(total_word_len / word_count if word_count else 0))

In [None]:
def plot_msgs_daily(data, plot_type='line'):
    """Plot the number of messages per day found in data['Time'].

    Days between the global earliest_date and today (exclusive) that have no
    message are shown as zero. The global xlim_left / xlim_right set the
    visible x range.
    """
    per_day = {}
    for stamp in data['Time']:
        day = stamp.date()
        per_day[day] = per_day.get(day, 0) + 1

    # Zero-fill the days nobody wrote on, so the curve has no gaps.
    for day in daterange(earliest_date, date.today()):
        per_day.setdefault(day, 0)

    points = sorted(per_day.items())
    plot_dict_items(points, plot_type, xlim_left, xlim_right)
    plt.title('Daily messages')

In [None]:
# Figure 1: daily message totals of the whole chat, drawn as bars.
plt.figure(1)
plot_msgs_daily(data, plot_type='bar')

In [None]:
# Figure 2: one daily-message line per participant, all drawn into the same figure.
plt.figure(2)
for person in people:
    plot_msgs_daily(data[data.Sender == person])

plt.legend(people.keys());

In [None]:
def plot_chars_daily(data, plot_type='line'):
    """Plot the number of characters written per day.

    Messages without content add nothing to their day. Days between the
    global earliest_date and today (exclusive) without any message are shown
    as zero; the global xlim_left / xlim_right set the visible x range.
    """
    chars_per_day = {}
    for stamp, text in zip(data['Time'], data['Content']):
        day = stamp.date()
        added = len(text) if text else 0
        chars_per_day[day] = chars_per_day.get(day, 0) + added

    # Zero-fill the days without any message.
    for day in daterange(earliest_date, date.today()):
        chars_per_day.setdefault(day, 0)

    points = sorted(chars_per_day.items())
    plot_dict_items(points, plot_type, xlim_left, xlim_right)
    plt.title('Daily characters')

In [None]:
# Figure 3: daily character totals of the whole chat, drawn as bars.
plt.figure(3)
plot_chars_daily(data, plot_type='bar')

In [None]:
# Figure 4: one daily-character line per participant, all drawn into the same figure.
plt.figure(4)
for person in people:
    plot_chars_daily(data[data.Sender == person])
    
plt.legend(people.keys());

In [None]:
def plot_count_daily(data, snippet, ignore_case=True):
    """Plot how often snippet occurs per day as a whole word.

    data        -- frame with 'Time' and 'Content' columns
    snippet     -- literal text to look for; it is matched between word
                   boundaries and regex metacharacters in it are escaped
    ignore_case -- when True both snippet and messages are lower-cased
                   before matching (the title then shows the lower-cased snippet)

    Messages without content are skipped. Days between the global
    earliest_date and today (exclusive) without a counted message are shown
    as zero; the global xlim_left / xlim_right set the visible x range.
    """
    # Normalise the snippet and build the pattern once instead of per message.
    if ignore_case:
        snippet = snippet.lower()
    # Escape the snippet so characters such as '.', '?' or '(' are matched
    # literally instead of being interpreted as regex syntax.
    pattern = regex.compile(f'\\b{regex.escape(snippet)}\\b')

    freq = {}
    for msg_datetime, msg_content in zip(data['Time'], data['Content']):
        if not msg_content:
            continue
        if ignore_case:
            msg_content = msg_content.lower()

        msg_date = msg_datetime.date()
        freq[msg_date] = freq.get(msg_date, 0) + len(pattern.findall(msg_content))

    # Zero-fill the days without a counted message.
    for day in daterange(earliest_date, date.today()):
        if day not in freq:
            freq[day] = 0

    plot_dict_items(sorted(freq.items()), xlim_left=xlim_left, xlim_right=xlim_right)
    plt.title(f'Daily occurrence of "{snippet}"')

In [None]:
# Daily whole-word occurrences of "lol" across the whole chat.
plot_count_daily(data, 'lol')

In [None]:
def plot_count_daily_multi(data, snippets, ignore_case=True):
    """Overlay the daily occurrence curves of several snippets in one plot.

    Each snippet is drawn with plot_count_daily(); the curves are told apart
    by a legend listing the snippets in the order they were drawn.
    """
    for term in snippets:
        plot_count_daily(data, term, ignore_case)

    # Every plot_count_daily call sets a title for its own snippet; with
    # several curves that title is cleared and the legend labels them instead.
    plt.legend(snippets)
    plt.title(None)

In [None]:
# Compare the daily usage of several spellings of "yes".
snippets = ['yes', 'yesh', 'esh']
plot_count_daily_multi(data, snippets)

In [None]:
# Compare the daily usage of several spellings of "no".
snippets = ['no', 'nah', 'nop', 'nope']
plot_count_daily_multi(data, snippets)

In [None]:
# Compare the daily usage of "meh" and its variants.
snippets = ['meh', 'hmeh', 'shmesh']
plot_count_daily_multi(data, snippets)

In [None]:
# Compare the daily usage of laughs of different lengths; each one is matched
# as a whole word, so 'haha' does not count inside 'hahaha'.
snippets = ['haha', 'hahah', 'hahaha']
plot_count_daily_multi(data, snippets)

In [None]:
# Compare the daily usage of "lol" and "xd" (matching is case-insensitive by default).
snippets = ['lol', 'xd']
plot_count_daily_multi(data, snippets)

In [None]:
def plot_count_daily_person(data, snippet, ignore_case=True):
    """Stacked bar chart of the daily whole-word occurrences of snippet,
    with one bar segment per participant in the global people dict.

    data        -- frame with 'Time', 'Sender' and 'Content' columns
    snippet     -- literal text to look for; it is matched between word
                   boundaries and regex metacharacters in it are escaped
    ignore_case -- when True both snippet and messages are lower-cased
                   before matching

    The global earliest_date, xlim_left and xlim_right define the zero-filled
    date range and the visible x range.
    """
    plt.grid(zorder=0)

    if (xlim_left is not None) or (xlim_right is not None):
        plt.xlim(xlim_left, xlim_right)

    # Normalise the snippet and build the pattern once instead of per message.
    if ignore_case:
        snippet = snippet.lower()
    # Escape the snippet so regex metacharacters in it are matched literally.
    pattern = regex.compile(f'\\b{regex.escape(snippet)}\\b')

    # First pass: per-person {date: matches} for the days they wrote text on.
    freqs = {}
    for person in people:
        freq = freqs[person] = {}
        person_data = data[data.Sender == person]

        for msg_datetime, msg_content in zip(person_data['Time'], person_data['Content']):
            if not msg_content:
                continue
            if ignore_case:
                msg_content = msg_content.lower()

            msg_date = msg_datetime.date()
            freq[msg_date] = freq.get(msg_date, 0) + len(pattern.findall(msg_content))

    # All people must share one date axis: daterange() stops before today, so
    # a person who wrote today would otherwise have one more date than a
    # person who did not, and the stacked bars could not be added up.
    all_days = set(daterange(earliest_date, date.today()))
    for freq in freqs.values():
        all_days.update(freq)
    dates = sorted(all_days)

    # Second pass: stack every person's bars on top of the previous ones.
    heights = [0] * len(dates)
    for person in people:
        values = [freqs[person].get(day, 0) for day in dates]
        plt.bar(dates, values, bottom=heights, zorder=3)
        heights = np.add(heights, values)

    plt.title(f'Daily occurrence of "{snippet}"')
    plt.legend(people.keys())

In [None]:
# Stacked per-person daily counts of the word "hahah".
plot_count_daily_person(data, 'hahah')

In [None]:
# Stacked per-person daily counts of the word "hmeh".
plot_count_daily_person(data, 'hmeh')

In [None]:
def plot_messages(data):
    """Bar chart of the 50 most frequent messages.

    Messages are compared by their exact content; rows whose content is None
    are ignored.
    """
    tally = {}
    for text in data['Content']:
        if text is not None:
            tally[text] = tally.get(text, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    plot_dict_items(ranked[:50], plot_type='bar')
    # NOTE(review): autofmt_xdate is presumably used here to rotate the text
    # tick labels so they do not overlap - confirm.
    plt.gcf().autofmt_xdate()

In [None]:
# Most frequent whole messages across the entire chat.
plot_messages(data)

In [None]:
def plot_words(data, char_count=0):
    """Bar chart of the 50 most frequent words (case-insensitive).

    data       -- frame with a 'Content' column; None entries are skipped
    char_count -- when non-zero, only words of exactly this length are
                  counted; 0 counts every word

    When no word passes the filter a message is printed and nothing is drawn.
    """
    freq = {}
    for msg in data['Content']:
        if msg is None:
            continue

        # Raw string: '\w' inside a plain literal is an invalid escape sequence.
        for word in regex.findall(r'\w+', msg):
            if char_count == 0 or len(word) == char_count:
                key = word.lower()
                freq[key] = freq.get(key, 0) + 1

    if not freq:
        # Same handling as plot_words_person: without this guard the x/y
        # unpacking in plot_dict_items fails on an empty list.
        print(f'No words with {char_count} letters')
        return

    plot_dict_items(sorted(freq.items(), key=lambda e: e[1], reverse=True)[:50], plot_type='bar')
    plt.gcf().autofmt_xdate()

In [None]:
# Figure 5: the most frequent words of the whole chat, regardless of length.
plt.figure(5)
plot_words(data)

In [None]:
def _count_words(messages, char_count):
    """Return {lower-cased word: occurrences} for the words in messages.

    None entries are skipped; when char_count is non-zero only words of
    exactly that length are counted.
    """
    freq = {}
    for msg in messages:
        if msg is None:
            continue

        # Raw string: '\w' inside a plain literal is an invalid escape sequence.
        for word in regex.findall(r'\w+', msg):
            if char_count == 0 or len(word) == char_count:
                key = word.lower()
                freq[key] = freq.get(key, 0) + 1
    return freq


def plot_words_person(data, char_count=0):
    """Stacked bar chart of the 50 most frequent words, split by participant.

    data       -- frame with 'Sender' and 'Content' columns
    char_count -- when non-zero, only words of exactly this length are
                  considered; 0 considers every word

    The 50 words are chosen from the totals over all of data; each
    participant in the global people dict contributes one bar segment per
    word. When no word passes the filter a message is printed instead.
    """
    plt.grid(axis='y', zorder=0)

    total_freq = _count_words(data['Content'], char_count)
    if not total_freq:
        print(f'No words with {char_count} letters')
        return

    ranked = sorted(total_freq.items(), key=lambda e: e[1], reverse=True)[:50]
    words = [word for word, _ in ranked]
    heights = [0] * len(words)

    for person in people:
        person_freq = _count_words(data[data.Sender == person]['Content'], char_count)

        # Stack this person's counts on top of the previous participants' bars.
        values = [person_freq.get(word, 0) for word in words]
        plt.bar(words, values, bottom=heights, zorder=3)
        heights = np.add(heights, values)

    plt.gcf().autofmt_xdate()
    plt.title('Most common words' if char_count == 0 else f'{char_count} letter words')
    plt.legend(people.keys())

In [None]:
# Most frequent words of any length, split by participant.
plot_words_person(data)

In [None]:
# One per-person word chart for every word length from 1 to 25 letters;
# plt.show() is called after each chart before the next one is drawn.
for i in range(1,26):
    plot_words_person(data, char_count=i)
    plt.show()

In [None]:
def count_talk_starts(data, cooldowns):
    """Count, per sender, how many messages were sent after a silence.

    data      -- mapping/frame with parallel 'Time' and 'Sender' sequences,
                 walked in the order given
    cooldowns -- iterable of silence lengths in minutes

    A message counts for a cooldown when more than that many minutes passed
    since the previous message. The first message is measured against
    1970-01-01.

    Returns {sender: {cooldown: count}}; only combinations that occurred at
    least once are present.
    """
    counts = {}
    previous = datetime(1970,1,1)

    for stamp, sender in zip(data['Time'], data['Sender']):
        gap = stamp - previous

        for cooldown in cooldowns:
            if gap.total_seconds() > cooldown * 60:
                per_sender = counts.setdefault(sender, {})
                per_sender[cooldown] = per_sender.get(cooldown, 0) + 1

        previous = stamp

    return counts

In [None]:
# Cooldowns from 20 to 800 minutes in steps of 10: for each one, how many
# messages every sender sent after a longer silence. One curve per sender.
counts = count_talk_starts(data, range(20,801,10))
for person in counts:
    plot_dict_items(counts[person].items())
    
plt.title('Conversation initiations')
plt.xlabel('Minutes after last message')
plt.ylabel('Number of initiations')
plt.legend(counts.keys());

In [None]:
def count_occurrences(data, snippet, ignore_case=True):
    """Count the non-overlapping occurrences of snippet as a substring over
    all messages in data['Content'] that have content.

    With ignore_case both sides are lower-cased before counting.
    """
    texts = (msg for msg in data['Content'] if msg)
    if not ignore_case:
        return sum(text.count(snippet) for text in texts)

    needle = snippet.lower()
    return sum(text.lower().count(needle) for text in texts)
    
    
def count_char_occurrences(data, charlist):
    """Return a list of (symbol, occurrences) pairs, one per symbol
    (grapheme) of charlist, in the order they appear in charlist.

    Occurrences are counted with count_occurrences() and its defaults.
    """
    pairs = []
    for symbol in extract_symbols(charlist):
        pairs.append((symbol, count_occurrences(data, symbol)))
    return pairs

In [None]:
for el,count in count_char_occurrences(data, '\u2764❤️'):
    print(f'{el} {count}')
# ❤ matches all of the hearts
# ❤️ matches only some of them... Facebook distributes these oddly

In [None]:
def count_char_occ_person(data, charlist):
    """Build a table with how often each symbol of charlist was used.

    The returned DataFrame has one row per symbol and the columns 'Symbol',
    one column per participant of the global people dict, and 'Total' (the
    sum over all participants).
    """
    per_person_counts = [
        [count for _, count in count_char_occurrences(data[data.Sender == person], charlist)]
        for person in people
    ]

    symbols = extract_symbols(charlist)
    # zip(*...) turns the per-person lists into per-symbol columns to sum up.
    totals = [sum(column) for column in zip(*per_person_counts)]

    header = ('Symbol', *people.keys(), 'Total')
    rows = zip(symbols, *per_person_counts, totals)
    return pd.DataFrame(rows, columns=header)

In [None]:
# Per-person usage table for heart and affection symbols.
count_char_occ_person(data, '\u2764💕💞💖💝💓💗❣️💟🥰😘😍')

In [None]:
# Per-person usage table for smiling and playful faces.
count_char_occ_person(data, '🙂😀😃😄😆😂🤣😅☺️😊😁😇🙃😉😗😙😚😋😛😝😜🤪😌🤗🤭🤤🤓😎🤩🥳🤠🤑😈😺😸😹😻😼😽')

In [None]:
# Per-person usage table for sad and worried faces.
count_char_occ_person(data, '😕🙁☹️😟😒😞😔😣😖😫😩🥺😢😭😥😓😿🙍🙍‍♀️🙍‍♂️')

In [None]:
# Per-person usage table for angry faces.
count_char_occ_person(data, '😠😡🤬😤👿😾👺🙎🙎‍♀️🙎‍♂️')

In [None]:
# Per-person usage table for sceptical, surprised, unwell and sleepy faces.
count_char_occ_person(data, '🤨🧐🤔😶😐😑😯😦😮😧😲😳😬😨😰😱🙀🤯😵🥴🤕😷🤧🤒🤢🤮🥵🥶🥱😪😴')

In [None]:
# Per-person usage table for miscellaneous faces and creature symbols.
count_char_occ_person(data, '🤫🤥🤐👹🤡💩👻💀☠️👽👾🤖🎃')

In [None]:
# Per-person usage table for hand gestures and body-part symbols.
count_char_occ_person(data, '🙏🤲👐🙌👏👋✋🤚🖐🖖🤟🤘✌️🤞👌🤙🤏🖕👊✊🤛🤜👍👎👈👉👆👇☝️🤝💪✍️🦶🦵🦿🦾')

In [None]:
# Per-person usage table for person-gesture symbols, including their gendered variants.
count_char_occ_person(data, '🤦🤦‍♀️🤦‍♂️💁💁‍♀️💁‍♂️🤷🤷‍♀️🤷‍♂️🙋🙋🙋‍♀️🙋‍♂️🙅🙅‍♀️🙅‍♂️🙆🙆‍♀️🙆‍♂️🙇🙇‍♀️🙇‍♂️🤰🤱🧑‍🍼👩‍🍼👨‍🍼')