In [1]:
import pandas as pd
import json
import glob
import re
from functools import partial
from nltk.tokenize import word_tokenize

In [2]:
# Display name of the account owner. json_to_df compares each chat
# participant's name against this value; the participant whose name differs
# is recorded as the "friend" of that conversation.
ME = "Павел Богданов"

### Reading data

In [3]:
def json_to_df(filepath, friend_name=True):
    """
    Read one Facebook Messenger JSON file into a pandas DataFrame.

    Only one-to-one conversations are converted: if the file does not list
    exactly two participants (e.g. a group chat), None is returned.

    Works only on the "Messages" json files downloaded through
    Facebook's "Download Your Information" section.

    Before parsing, every escape of the form backslash-u00XX whose byte value
    XX is 0x80 or above is replaced by the raw byte XX, so that the sequence
    can be decoded as UTF-8. Escapes for ASCII bytes (quotes, backslashes,
    control characters) are left untouched and decoded by the JSON parser.

    Parameters
    ----------
    filepath : string
        Filepath to the JSON file.
    friend_name : boolean, default True
        If True, adds an additional column "friend_name" to the df, holding
        the name of the participant that is not ``ME``. When several names
        differ from ``ME`` the last one is used; when none does (or a
        participant has no name) the column is not added.

    Returns
    -------
    result : DataFrame or None
        One row per message, or None when the chat does not have exactly
        two participants.
    """
    # Only bytes >= 0x80 can belong to a mis-escaped multi-byte UTF-8
    # sequence. Rewriting ASCII escapes such as \u0022 (quote) or \u005c
    # (backslash) into raw bytes would corrupt the JSON document itself.
    fix_mojibake_escapes = partial(
        re.compile(rb'\\u00([89a-fA-F][\da-fA-F])').sub,
        lambda m: bytes.fromhex(m.group(1).decode()),
    )

    # Need to read as binary to repair the escapes before decoding.
    with open(filepath, 'rb') as file:
        raw = file.read()

    repaired = fix_mojibake_escapes(raw)
    # strict=False tolerates raw control characters inside JSON strings.
    data = json.loads(repaired.decode('utf8'), strict=False)

    # No group chats!
    if len(data['participants']) != 2:
        return None

    result = pd.DataFrame.from_dict(data['messages'])

    # Additional column: the participant who is not the account owner.
    if friend_name:
        for participant in data['participants']:
            name = participant.get('name')
            if name is not None and name != ME:
                result['friend_name'] = name
    return result

In [5]:
# Every conversation folder in the inbox contributes its message_1.json file.
all_files = glob.glob("messages/inbox/*/message_1.json")
# json_to_df yields None for chats without exactly two participants;
# pd.concat skips None entries, so only one-to-one chats end up in `data`.
data = pd.concat(map(json_to_df, all_files), ignore_index=True, sort=False)

### Processing data

In [18]:
# Convert the millisecond epoch column to datetimes, order the messages
# chronologically, keep a single row per timestamp so the index is unique,
# and expose the result as an index named "timestamp".
data = (
    data
    .assign(timestamp_ms=pd.to_datetime(data['timestamp_ms'], unit='ms'))
    .sort_values('timestamp_ms')
    .drop_duplicates('timestamp_ms')
    .set_index('timestamp_ms', verify_integrity=True)
    .rename_axis('timestamp')
)

In [7]:
# Keep only "Generic" and "Share" messages and the columns used downstream.
# Selecting rows and columns in a single .loc call yields an independent
# frame, so the following steps never write into a slice of `data`.
mess = (
    data.loc[
        data['type'].isin(['Generic', 'Share']),
        ['sender_name', 'content', 'friend_name', 'type', 'share', 'reactions'],
    ]
    # Low-cardinality string columns are stored as categoricals.
    .astype({'sender_name': 'category', 'friend_name': 'category', 'type': 'category'})
    # Normalise case before any word-level processing.
    .assign(content=lambda frame: frame['content'].str.lower())
    # Drop messages without text directly on the frame. This replaces the
    # former "fill NaN with '0', then filter out '0'" workaround, which
    # needed numpy (never imported here) and also discarded genuine
    # messages whose whole text was "0".
    .dropna(subset=['content'])
)

### Remove links

In [8]:
# Strip http/https URLs from the message text. regex=True must be explicit:
# since pandas 2.0 Series.str.replace treats the pattern as a literal string
# by default, which would leave every link in place.
mess['content'] = mess['content'].str.replace(r'(https?:\/\/\S*)', '', regex=True)

### Space split and tokenize

In [9]:
# Two independent copies of `mess`, differing only in how "content" is
# broken into words. assign() returns a new frame, so `mess` is untouched.
# The former `<frame>.content.dropna(inplace=True)` calls were removed: they
# acted on a temporary Series and never changed the frames.

# Plain whitespace split: punctuation stays attached to the words.
space_mess = mess.assign(content=mess['content'].str.split())

# NLTK tokenizer: punctuation becomes separate tokens.
tokenized_mess = mess.assign(content=mess['content'].apply(word_tokenize))

### Word split

In [11]:
def word_split(dataframe):
    """
    Flatten per-message word lists into one row per word.

    Parameters
    ----------
    dataframe : DataFrame
        Must provide the columns "sender_name", "content" and "timestamp",
        where each "content" value is an iterable of words.

    Returns
    -------
    result : DataFrame
        Columns "sender_name", "word" and "timestamp"; every word of a
        message is paired with that message's sender and timestamp.
        Messages with an empty "content" contribute no rows.
    """
    senders = dataframe['sender_name']
    contents = dataframe['content']
    stamps = dataframe['timestamp']

    # Walk the three columns in parallel and emit one record per word.
    records = [
        (sender, word, stamp)
        for sender, words, stamp in zip(senders, contents, stamps)
        for word in words
    ]

    return pd.DataFrame(records, columns=['sender_name', 'word', 'timestamp'])

In [13]:
# word_split reads "timestamp" as a regular column, so move it out of the index.
space_mess = space_mess.reset_index()
tokenized_mess = tokenized_mess.reset_index()

# One row per word for each of the two splitting strategies.
words_by_space, words_by_tokenizer = (
    word_split(frame) for frame in (space_mess, tokenized_mess)
)

In [16]:
# Persist both word tables as CSV files without the row index.
for csv_name, words in (
    ("words_by_space", words_by_space),
    ("words_by_tokenizer", words_by_tokenizer),
):
    words.to_csv(csv_name, index=False)