# Data pre-processing and anonymization

## <i>or What I've done to the data before sending it to you</i>

In [1]:
import json
import os
import shutil
import re

from functools import partial
from collections import Counter
from faker import Faker

In [2]:
def get_size_and_files(directory):
    """Measure everything stored below ``directory``.

    The tree is walked recursively with ``os.walk``; every regular entry
    reported in the ``files`` list of a walk step is counted once and its
    size (as given by ``os.path.getsize``) is added to the total.

    Args:
        directory: Path of the folder to measure.

    Returns:
        A ``(total_size_in_bytes, number_of_files)`` tuple. A folder that
        is empty or does not exist yields ``(0, 0)``.
    """
    file_paths = [
        os.path.join(parent, entry)
        for parent, _subdirs, entries in os.walk(directory)
        for entry in entries
    ]
    total_bytes = sum(os.path.getsize(file_path) for file_path in file_paths)
    return total_bytes, len(file_paths)

In [3]:
# Record the (size in bytes, file count) of the untouched export so it can be
# compared with the result once the clean-up has run.
before = get_size_and_files('messages')

We can delete all folders in "messages" that are not "inbox".

In [4]:
# Strip the export down to the conversations themselves:
#   * directly under "messages", every folder except "inbox" is removed;
#   * directly under "messages/inbox", every (conversation) folder is kept;
#   * anywhere deeper, every folder not called "inbox" is removed.
# The inbox path is built with os.path.join so the comparison works with both
# "\" and "/" separators. The previous hard-coded 'messages\inbox' literal only
# matched on Windows; elsewhere it never matched and the conversation folders
# themselves were deleted.
inbox_path = os.path.join('messages', 'inbox')
for root, dirs, files in os.walk("messages", topdown=True):
    if os.path.normpath(root) == inbox_path:
        # These are the conversation folders - leave them alone.
        continue
    for name in dirs:
        if name != 'inbox':
            shutil.rmtree(os.path.join(root, name))
    # With topdown=True, editing `dirs` in place controls where os.walk goes
    # next; drop the folders that no longer exist.
    dirs[:] = [name for name in dirs if name == 'inbox']

Save the path to each conversation's message file and delete all the folders inside each person's folder (files, pictures, audio, etc.).

In [5]:
# Collect the path of every conversation file and, in the same pass, remove
# the short-named sub-folders found along the way.
all_files = []
for root, dirs, files in os.walk("messages/inbox/", topdown=True):
    # A folder can hold at most one entry with this exact name.
    if 'message_1.json' in files:
        all_files.append(os.path.join(root, 'message_1.json'))

    # Folders with fewer than 10 characters in their name are deleted
    # (presumably the attachment folders; conversation folders are assumed to
    # have longer names - TODO confirm against the export).
    short_named = [name for name in dirs if len(name) < 10]
    for name in short_named:
        shutil.rmtree(os.path.join(root, name))

Facebook's data export is incorrectly encoded, so I'm using Martijn Pieters' [solution](https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded): read the file as binary, replace every `\u00hh` sequence with the byte its last two hex digits represent, decode the result as UTF-8, and then parse it as JSON.

In [6]:
def _escape_to_byte(match):
    """Return the single raw byte named by the two hex digits captured in ``match``."""
    return bytes.fromhex(match.group(1).decode())


# Callable taking a bytes object and returning a copy in which every literal
# \u00hh escape (lower-case hex digits only) is replaced by the byte 0xhh.
fix_mojibake_escapes = partial(re.compile(rb'\\u00([\da-f]{2})').sub, _escape_to_byte)

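I need a reliable way of finding my own name in the data that also works for other people's exports. This is the only thing I came up with: collect the participants of every conversation and take the name that appears most often, since the owner of the export takes part in all of them.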

In [7]:
# Gather the participant names of every conversation into one flat list
# (a name appears once for each conversation it takes part in).
people = []
for file in all_files:
    with open(file, 'rb') as f:
        raw_bytes = f.read()
    data = json.loads(fix_mojibake_escapes(raw_bytes).decode('utf8'), strict=False)
    people.extend(participant["name"] for participant in data['participants'])

In [8]:
# The name listed in the largest number of conversations is taken to be the
# owner of the export; the occurrence count itself is not needed.
me, _occurrences = Counter(people).most_common(1)[0]

I could replace all the names with numbers but this way is more fun and it's closer to the original data.

In [9]:
# will change to English when publishing, it's just hilarious in BG
fake = Faker('bg_BG')

# Seed the generator so every run hands out the same fake names.
# seed_instance() is used instead of fake.seed(): current Faker releases
# reject seed() on an instance, while seed_instance() gives this generator its
# own seeded random source.
fake.seed_instance(42)

# Replacement for the export owner's real name.
fake_me = fake.name()

In [10]:
# One fake name per conversation file, in the same order as all_files.
# The names must be distinct from each other and from fake_me: a repeated name
# would merge two different people into one identity, and the folder rename
# that uses these names would hit an already existing directory.
fake_names = []
taken_names = {fake_me}
names_needed = len(all_files)
# Upper bound on draws so an exhausted name pool fails loudly instead of hanging.
draws_left = 1000 + 100 * names_needed
while len(fake_names) < names_needed:
    if draws_left == 0:
        raise RuntimeError("Could not generate enough distinct fake names")
    draws_left -= 1
    candidate = fake.name()
    if candidate in taken_names:
        continue
    taken_names.add(candidate)
    fake_names.append(candidate)

Here comes the nested JSONs part.

In [11]:
def fake_thy_name(data, first_level, second_level):
    """Overwrite the names stored in one list of records, in place.

    ``data[first_level]`` is iterated; in each record the value under
    ``second_level`` is replaced. A value equal to the module-level ``me``
    becomes ``fake_me``; any other value becomes the module-level
    ``fake_name`` (which the caller must have set beforehand).

    Args:
        data: Mapping that holds the list of records.
        first_level: Key in ``data`` under which the records are stored.
        second_level: Key in each record whose value is a name.

    Returns:
        None. ``data`` is modified in place.
    """
    for record in data[first_level]:
        record[second_level] = fake_me if record[second_level] == me else fake_name

In [12]:
def try_except(first_level, first_level_inside, second_level_inside=None, to_redact=None):
    """Anonymise one optional field of a message, doing nothing if it is absent.

    Args:
        first_level: The record (e.g. one message) to modify in place.
        first_level_inside: Key that must be present in ``first_level`` for
            anything to happen.
        second_level_inside: If given, ``first_level[first_level_inside]`` is
            treated as a list of records and the value under this key in each
            of them is replaced via ``fake_thy_name``.
        to_redact: If given, ``first_level[to_redact]`` is overwritten with
            the string ``'REDACTED'``. Used for fields that contain names but
            are not names themselves.

    Returns:
        None.

    Only failures caused by the shape of the data (missing key, value of an
    unexpected type) are ignored. Anything else - for example a NameError
    from a helper - propagates, so a broken anonymisation step cannot
    silently leave real names in the output.
    """
    try:
        # Probe for the key: most messages lack most of the optional fields.
        first_level[first_level_inside]
        if second_level_inside:
            # Yo dawg, I heard you like levels
            # so I put levels inside your levels
            fake_thy_name(data=first_level, first_level=first_level_inside, second_level=second_level_inside)

        # For parts which contain names but are not names themselves
        if to_redact:
            first_level[to_redact] = 'REDACTED'
    except (LookupError, TypeError):
        # Field absent or not shaped as expected: leave the record untouched.
        pass

In [13]:
# Anonymise every conversation file in place and rename its folder.
# `fake_name` is intentionally the module-level loop variable: fake_thy_name
# reads it as a global to know which replacement name belongs to the
# conversation currently being processed.
for file, fake_name in zip(all_files, fake_names):

    # Folder that holds this conversation; dirname works with "\" and "/" alike.
    directory = os.path.dirname(file)

    # Read in binary to fix mojibakes; the file is closed as soon as the
    # bytes are in memory so it can be rewritten or deleted afterwards.
    with open(file, 'rb') as f:
        repaired = fix_mojibake_escapes(f.read())
    data = json.loads(repaired.decode('utf8'), strict=False)

    # No group chats! Anything that does not have exactly two participants
    # is removed together with its whole folder.
    if len(data['participants']) != 2:
        shutil.rmtree(directory)
        continue

    # How can you not love nested JSONs
    fake_thy_name(data=data, first_level='participants', second_level="name")
    fake_thy_name(data=data, first_level='messages', second_level="sender_name")

    # Each of these fields is optional, so every one goes through try_except,
    # which does nothing when the field is missing from the message.
    for message in data['messages']:
        try_except(message, 'reactions', 'actor')
        try_except(message, 'users', 'name', to_redact='content')
        try_except(message, 'audio_files', to_redact='audio_files')
        try_except(message, 'photos', to_redact='photos')
        try_except(message, 'gifs', to_redact='gifs')
        try_except(message, 'videos', to_redact='videos')
        try_except(message, 'files', to_redact='files')
        try_except(message, 'call_duration', to_redact='content')
        try_except(message, 'missed', to_redact='content')

    # Overwrite the original with the anonymised data. Mode 'w' truncates the
    # existing file, so it does not have to be deleted first.
    with open(file, 'w', encoding='utf8') as f_new:
        json.dump(data, f_new, ensure_ascii=False)

    # Renaming directory with fake name
    if os.path.isdir(directory):
        fake_directory = f"messages/inbox/{'_'.join(fake_name.split())}"
        os.rename(directory, fake_directory)


In [14]:
# Measure the same folder again now that it has been cleaned and anonymised.
after = get_size_and_files('messages')

In [15]:
# Show the size (converted from bytes to MB) and the file count of the export
# before and after processing.
bytes_per_mb = 1024 * 1024
report_lines = [
    "Before processing:",
    f"  Size: {before[0] / bytes_per_mb:.2f} MB ",
    f"  Number of files: {before[1]}",
    "",
    "After processing:",
    f"  Size: {after[0] / bytes_per_mb:.2f} MB ",
    f"  Number of files: {after[1]}",
]
print("\n".join(report_lines))

Before processing:
  Size: 4553.41 MB 
  Number of files: 19598

After processing:
  Size: 101.84 MB 
  Number of files: 643
