In [16]:
import json
import glob
from os.path import join
import dateparser
import re
import pandas as pd

In [2]:
# URLs
with open('list.csv') as f:
    URLs = [link.strip() for link in list(f.readlines())[1:]]

In [3]:
# get all the json files in the folder
json_files = list(sorted(glob.glob(join("data", "*.json"))))

# create a df to store data
main_df = pd.DataFrame(columns=["CaseRef","CaseDesc","CaseType","CaseURL","PersonsData"])

# loop through and convert to csv
for file in json_files:
    with open(file) as f:
        data = json.load(f)
        temp_df = pd.DataFrame(data.values())
    main_df = pd.concat([main_df, temp_df])
    
main_df.to_csv(join("data", "json_converted.csv"))

In [4]:
unique_attriutes = set()

for index, row in main_df.iterrows():
    for person in row['PersonsData']:
        unique_attriutes.update(list(person['InfoSection'].keys()))
        
print(unique_attriutes)

{'Socks', 'Other', 'Suit', 'Coveralls', 'Sweater', 'Year of birth', 'Weight', 'Pierce', 'Luggage', 'Nightgown', 'Build', 'Cane', 'Purse', 'Hat', 'Shoes', 'Jeans', 'Hair', 'Vest', 'Belt', 'Missing since', 'Shorts', 'Leggings', 'Tattoo', 'Scarf', 'Teeth', 'Glasses', 'Blouse', 'Waders', 'Bathing Suit', 'Ring', 'Medication', 'Aliases', 'Underwear', 'Jacket', 'Eye colour', 'Jewelry', 'Tie', 'Est. age', 'Age at disappearance', 'Coat', 'Wallet', 'Fractured/Broken Bones', 'Amputation', 'Discovered on', 'Overalls', 'Money', 'Height', 'Skirt', 'Gloves', 'Pyjama', 'Bio group', 'Scar', 'Pants', 'Backpack', 'Chain', 'Mark', 'Watch', 'Shirt', 'Dress', 'Boots', 'Gender', 'Deformity', 'Foreign Object', 'Complexion'}


# Unique Attributes Useful for Filtering
- **Discovered on**
- **Missing since**
- Year of birth
- Est. Age
- Age at disappearance
- Complexion
- Eye colour
- Height

In [24]:
def format_age_range(raw_age_range_string):
    result = re.search(r'(\d+).*(\d+)', raw_age_range_string)
    
    if result is None or len(result.groups()) != 3:
        return None
    
    return [int(result.group(1)), int(result.group(2))]

In [5]:
missing_df = main_df[main_df['CaseType'] == 'Missing']
unidentified_df = main_df[main_df['CaseType'] == 'Unidentified']

In [8]:
%%time

def get_all_attribute_from_all_persons(attribute_name, attribute_function, df):
    people_with_attribute = []
    
    for index, row in df.iterrows():
        for person in row['PersonsData']:
            attribute = [row['CaseURL'], attribute_function(person['InfoSection'][attribute_name][0])]
            people_with_attribute.append(attribute)
            
    return people_with_attribute
            
unidentified_disovered_at = get_all_attribute_from_all_persons('Discovered on', dateparser.parse, unidentified_df)
missing_missing_since = get_all_attribute_from_all_persons('Missing since', dateparser.parse, missing_df)

CPU times: user 8.36 s, sys: 200 ms, total: 8.56 s
Wall time: 10.8 s


In [10]:
def person_data_data_frame(df, attributes):
    person_data_frame = {'Person URL': []}
    for attribute in attributes:
        person_data_frame[attribute] = []

    for _, row in df.iterrows():
        for person in row['PersonsData']:
            person_data_frame['Person URL'].append(row['CaseURL'])

            for attribute in attributes:
                if attribute in person['InfoSection']:
                    person_data_frame[attribute].append(person['InfoSection'][attribute][0])
                else:
                    person_data_frame[attribute].append(None)

    return pd.DataFrame(data=person_data_frame)

In [25]:
my_df = person_data_data_frame(unidentified_df, ['Discovered on', 'Est. age'])
print(my_df['Est. age'].apply(format_age_range))

0      None
1      None
2      None
3      None
4      None
       ... 
369    None
370    None
371    None
372    None
373    None
Name: Est. age, Length: 374, dtype: object


In [9]:
potential_matches = []

for missing_person in missing_missing_since:
    for unidentified in unidentified_disovered_at:
        if missing_person[1] < unidentified[1]:
            potential_matches.append([missing_person[0], unidentified[0]])
            
print(potential_matches[0])

['https://www.services.rcmp-grc.gc.ca/missing-disparus/case-dossier.jsf?case=2020008698&id=18', 'https://www.services.rcmp-grc.gc.ca/missing-disparus/case-dossier.jsf?case=2014001136&id=27']
