We first need to transform requested_documents as a consolidated hashmap. We consolidate by the last name. We also store the key just with the Name of the person.

In [141]:
import json

REQUESTED_DOCUMENTS_PATH = '../requested_documents.json'

with open(REQUESTED_DOCUMENTS_PATH, 'r') as f:
    requested_documents = json.load(f)

full_name_from_reqsdocs = list(requested_documents.keys())

consolidated_requested_documents = {}

for k,v in requested_documents.items():
    last_name = k.split(',')[0]
    if last_name not in consolidated_requested_documents:
        consolidated_requested_documents[last_name] = 0
    for doc in v:
        if "annual" in doc or "transaction" in doc:
            consolidated_requested_documents[last_name] += 1

print(sum(consolidated_requested_documents.values()))
print(len(consolidated_requested_documents))

6474
627


Now we make the entire dir as a nested hashmap. For instance, every Dir is a hashmap. If that dir in turn has other dir, then they are nested hashmap. If the dir only has pdf files, then we store those file names as a list (value) for that hashmap key

Ultimately, the hashmap has 54 keys. These 54 keys represents the 54 pages of the USOGE website

In [97]:
import os

def build_directory_tree(root_path):
    tree = {}
    
    for item in os.listdir(root_path):
        item_path = os.path.join(root_path, item)
        
        if os.path.isdir(item_path):
            tree[item] = build_directory_tree(item_path)
        elif item.endswith('.pdf'):
            if 'files' not in tree:
                tree['files'] = []
            tree['files'].append(item)
    
    if 'files' in tree and len(tree) == 1:
        return tree['files']
    elif 'files' in tree:
        files_list = tree.pop('files')
        tree['files'] = files_list
    
    return tree

RA_COLLECTION_PATH = '../RA_Collection'
ra_collection_tree = {os.path.basename(RA_COLLECTION_PATH): build_directory_tree(RA_COLLECTION_PATH)}
ra_collection = ra_collection_tree[os.path.basename(RA_COLLECTION_PATH)]

#storing this as a json file with proper formatting
RA_COLLECTION_TREE_PATH = './JSON_FILES/ra_collection_tree.json'

ra_files = {}

for k,v in ra_collection.items():
    if isinstance(v, dict):
        ra_files = ra_files | v

with open(RA_COLLECTION_TREE_PATH, 'w') as f:
    json.dump(ra_files, f, indent=4)

How many total people (USOGE) did the students capture. How many of these are repeated across different students.

In [98]:
cleaned_ra_files = {}

def _clean_ra_files(ra_files):
    for page,dirs in ra_files.items():
        for k,v in dirs.items():
            if k[0] != "R":
                k = page + " - " + k

            cleaned_ra_files[page] = cleaned_ra_files.get(page, {})
            cleaned_ra_files[page][k] = v

    return cleaned_ra_files

cleaned_ra_files = _clean_ra_files(ra_files)
ra_files = cleaned_ra_files

In [99]:
def _get_total_people_and_dups(ra_files):
    total_people = 0
    total_people_list = []
    seen_people = set()
    dups = 0

    for page,dirs in ra_files.items():
        for k,v in dirs.items():
            person = k.split(' - ')[-1]
            person = person.split(',')[0].lower()
            person = person.strip()

            if person not in seen_people:
                seen_people.add(person)
                total_people_list.append((person, k))
            else:
                dups += 1

            total_people += 1

    return total_people, dups, total_people_list

total_people, dups, total_people_list = _get_total_people_and_dups(ra_files)
print(total_people, dups)
print(len(total_people_list))

ppl_in_ra = set()
for ppl in total_people_list:
    ppl_in_ra.add(ppl[0])

print(len(ppl_in_ra))

771 85
686
686


How many of the people from requested_documents are not in RA collection. All based on last name [.split(",")[0]]

In [100]:
#gives the name of USOGE people who are not in requested_documents.json
def _get_missing_from_reqsdocs(ra_files, consolidated_requested_documents):

    missing_from_reqsdocs = []

    for page,dirs in ra_files.items():
        for k,v in dirs.items():
            person = k.split('-')[-1]
            person = person.split(',')[0].lower()
            person = person.strip()
            
            if person not in consolidated_requested_documents:
                missing_from_reqsdocs.append((person, k))

    return missing_from_reqsdocs

#gives the name of the USOGE people who are not in the RA collection
def _get_missing_from_ra(ppl_in_ra, consolidated_requested_documents):

    missing_from_ra = []

    for k,v in consolidated_requested_documents.items():
        if k not in ppl_in_ra:
            missing_from_ra.append(k)

    return missing_from_ra

missing_from_reqsdocs = _get_missing_from_reqsdocs(ra_files, consolidated_requested_documents)
print("People missing from reqsdocs: ", len(missing_from_reqsdocs))

missing_from_ra = _get_missing_from_ra(ppl_in_ra, consolidated_requested_documents)
print("People missing from ra: ", len(missing_from_ra))


People missing from reqsdocs:  118
People missing from ra:  69


In [167]:
def _get_missing_files_for_every_ra(ra_files, consolidated_requested_documents):

    missing_pages = {}

    for page,dirs in ra_files.items():
        for k,v in dirs.items():
            person = k.split(' - ')[-1]
            person = person.split(',')[0].lower()

            #clean up the values. Only count the ones that have 278 in their name
            in_ra = 0
            seen = set()
            for docs in v:
                if docs not in seen:
                    if "278" in docs:
                        in_ra += 1
                    seen.add(docs)

            if person in consolidated_requested_documents:
                in_reqs = consolidated_requested_documents[person]
                missing_pages[k] = in_reqs - in_ra

    missing_files_from_ra = {}

    for k,v in missing_pages.items():
        #clean k
        ra = k.split(' - ')[1]
        ra = ra.strip()

        if ra not in missing_files_from_ra:
            missing_files_from_ra[ra] = {}

        missing_files_from_ra[ra][k] = v
    
    return missing_files_from_ra

def _check_again_for_those_in_ra_but_not_in_reqsdocs(ra_files, missing_from_reqsdocs, full_name_from_reqsdocs):

    match, not_match = [], []

    print(len(missing_from_reqsdocs))

    for last_name, dir in missing_from_reqsdocs:

        #get the whole name
        whole_name = dir.split("-")[-1]
        whole_name = whole_name.strip().lower()
        whole_name = whole_name.replace("_", " ")

        not_match.append(whole_name)
        for full_name in full_name_from_reqsdocs:
            if whole_name in full_name:
                match.append((whole_name, full_name, dir))
                not_match.pop()
                break

    for name, reqs_name, dir in match:

        ra_name = dir.split(" - ")[1]
        ra_name = ra_name.strip()

        #first find this guy in the ra_files
        for page, dirs in ra_files.items():
            for k,v in dirs.items():
                if dir == k:
                    missing_files = len(requested_documents[reqs_name]) - len(v)
                    print(dir, " --- " , missing_files)
                    break
        
        #since we got the missing files
        missing_files_from_ra[ra_name][dir] = missing_files

    return missing_files_from_ra

def _add_count_positive_vals(missing_files_from_ra):

    for k,v in missing_files_from_ra.items():
        total_missing_files_count = 0

        for k2,v2 in v.items():
            if v2 > 0:
                total_missing_files_count += v2

        missing_files_from_ra[k]['Total Number of Files Missing (considering only positive values)'] = total_missing_files_count

    return missing_files_from_ra


missing_files_from_ra = _get_missing_files_for_every_ra(ra_files, consolidated_requested_documents)
missing_files_from_ra = _check_again_for_those_in_ra_but_not_in_reqsdocs(ra_files, missing_from_reqsdocs, full_name_from_reqsdocs)
missing_files_from_ra = _add_count_positive_vals(missing_files_from_ra)

MISSING_FILES_SUMMARY_PATH = './JSON_FILES/missing_files_for_every_ra.json'
with open(MISSING_FILES_SUMMARY_PATH, 'w') as f:
    json.dump(missing_files_from_ra, f, indent=4)

118
RA Collection - Lai Chun Kiu - Page 20 - Hennessey-Niland, John T  ---  0
RA Collection - Lai Chun Kiu - Page 21 - Hoehn-Saric, Alex  ---  0
RA collection - LI Junhui - Page 13 - Dowling, Maria-Kate  ---  -8
RA collection - LI Junhui - Page 53 - Whyche-Shaw, Oren  ---  -2
RA collection - Wong Ho Ching - Page 49 - Thomas-Greenfield, Linda J  ---  5
RA collection - Wong Ho Ching - Page 2 - Babcock-Lumish, Terry  ---  0
RA Collection - ZHANG Xuanrui - Page 7 - Calvaresi-Barr, Ann  ---  -1
RA collection - Leong Deng Hung - Page 18 - Goldsmith-Romero, Christy L  ---  -11
RA collection - Leong Deng Hung - Page 17 - Goldsmith-Romero, Christina L  ---  6
RA Collection - Khushi Chandani - Page 47 - Stone-Manning, Tracy  ---  -2
RA Collection - Khushi Chandani - Page 48 - Syptak-Ramnath, Stephanie F  ---  -2
RA Collection - Khushi Chandani - Page 46 - Slater-Chandler, Neysa M  ---  0
RA Collection - Khushi Chandani - Page 46 - Sherwood-Randall, Elizabeth  ---  0
RA collection - SUNG, Wen-chi