# Dependencies

In [9]:
import sys
import os
import json
import pandas as pd
import hashlib
import spacy
import re
from importlib import reload
import xlsxwriter
from datetime import datetime
from collections import defaultdict

# local file `futils` for utility functions
import futils

# Load Raw Data

In [90]:
"""
Collect the path of every file found under the `data/` directory (walked recursively).
Each file is expected to be a `.json` file with annotations for multiple
informed consent documents, and each `.json` file belongs to a single annotator.
Note that no extension filter is applied: every file found is listed.
"""

data_dir = "../data/"
json_file_list = []

# os.walk yields (directory, sub-directories, file names) for each directory visited
for folder, _subfolders, filenames in os.walk(data_dir):
    json_file_list.extend(os.path.join(folder, filename) for filename in filenames)

futils.printl(json_file_list)

../data/2020-01-21_Random_46-60_KATHLEEN.json
../data/2020-01-20_Random_1-15_Kathleen.json
../data/2020-01-21_Random_46-60_LIZ.json
../data/2020-01-21_Random_16-30_LIZ.json
../data/2020-01-20_Random_16-30_KATHLEEN.json
../data/2020-01-27_Random_76-90_KATHLEEN.json
../data/2020-01-21_Random_31-45_KATHLEEN.json
../data/2020-01-27_Random_76-90_LIZ.json
../data/2020-01-27_Random_61-75_KATHLEEN.json
../data/2020-01-21_Random_31-45_KAYCEE.json
../data/2020-01-28_Random_91-105_KAYCEE.json
../data/2020-01-27_Random_76-90_KAYCEE.json
../data/2020-01-28_Random_91-105_LIZ.json
../data/2020-01-28_Random_106-120_KATHLEEN.json
../data/2020-01-21_Random_1-15_LIZ.json
../data/2020-01-21_Random_16-30_KAYCEE.json
../data/2020-01-27_Random_61-75_KAYCEE.json
../data/2020-01-21_Random_31-45_LIZ.json
../data/2020-01-28_Random_106-120_LIZ.json
../data/2020-01-21_Random_46-60_KAYCEE.json
../data/2020-01-28_Random_91-105_KATHLEEN.json
../data/2020-01-28_Random_121-134_KAYCEE.json
../data/2020-01-21_Random_1-15

In [91]:
"""
Build the annotator lookup.

Maps each formatted annotator name (as returned by
`futils.format_annotator_name`) to a dictionary whose `json_files` entry lists
the `.json` files annotated by that person.
"""
reload(futils)
# no default factory is passed, so this behaves like a plain dict
annotators = defaultdict()

for json_path in json_file_list:
    annotator_name = futils.format_annotator_name(json_path)
    # create the entry on first sight of a name, then record the file under it
    entry = annotators.setdefault(annotator_name, {'json_files': []})
    entry['json_files'].append(json_path)

futils.printl(annotators, True)

KATHLEEN {'json_files': ['../data/2020-01-21_Random_46-60_KATHLEEN.json', '../data/2020-01-20_Random_1-15_Kathleen.json', '../data/2020-01-20_Random_16-30_KATHLEEN.json', '../data/2020-01-27_Random_76-90_KATHLEEN.json', '../data/2020-01-21_Random_31-45_KATHLEEN.json', '../data/2020-01-27_Random_61-75_KATHLEEN.json', '../data/2020-01-28_Random_106-120_KATHLEEN.json', '../data/2020-01-28_Random_91-105_KATHLEEN.json', '../data/2020-01-28_Random_121-134_KATHLEEN.json']} 

LIZ {'json_files': ['../data/2020-01-21_Random_46-60_LIZ.json', '../data/2020-01-21_Random_16-30_LIZ.json', '../data/2020-01-27_Random_76-90_LIZ.json', '../data/2020-01-28_Random_91-105_LIZ.json', '../data/2020-01-21_Random_1-15_LIZ.json', '../data/2020-01-21_Random_31-45_LIZ.json', '../data/2020-01-28_Random_106-120_LIZ.json', '../data/2020-01-28_Random_121-134_LIZ.json', '../data/2020-01-27_Random_61-75_LIZ.json']} 

KAYCEE {'json_files': ['../data/2020-01-21_Random_31-45_KAYCEE.json', '../data/2020-01-28_Random_91-105_

In [92]:
# Sanity check: show the keys of `annotators` (the formatted annotator names).
print(annotators.keys())

dict_keys(['KATHLEEN', 'LIZ', 'KAYCEE'])


In [98]:
%time
"""
Load json objects into a 'raw' format for processing
"""

reload(futils)

for annotator in annotators:
    df_list = []
    annotators[annotator]['document_map'] = {}
    for file in annotators[annotator]['json_files']:
        
        tmp_df, doc_map = futils.load_annotations(annotator, file)
        df_list.append(tmp_df)
        
        for doc_id, content in doc_map.items():
            if doc_id not in annotators[annotator]['document_map']:
                annotators[annotator]['document_map'][doc_id] = content
        
    annotations = pd.concat(df_list, ignore_index=True)
     # add prepared annotation tables 
    annotators[annotator]['annotations'] = annotations
    
print(annotators.keys())

print(annotators['KATHLEEN'].keys())
print(annotators['LIZ'].keys())
print(annotators['KAYCEE'].keys())

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.39 µs
dict_keys(['KATHLEEN', 'LIZ', 'KAYCEE'])
dict_keys(['json_files', 'document_map', 'annotations'])
dict_keys(['json_files', 'document_map', 'annotations'])
dict_keys(['json_files', 'document_map', 'annotations'])
