In [2]:
import json
import glob
import re
from functools import cache
from itertools import product
from os.path import join
from typing import Optional

import dateparser

setattr(dateparser, "parse", cache(dateparser.parse))

In [3]:
# URLs
with open('list.csv') as f:
    URLs = [link.strip() for link in list(f.readlines())[1:]]

In [4]:
# get all the json files in the folder
json_files = list(sorted(glob.glob(join("data", "*.json"))))

# create a dict to store data
main_cases = []

# loop through and convert to csv
for file in json_files:
    with open(file) as f:
        data = json.load(f)
        main_cases.extend(data.values())

In [5]:
unique_attributes = set()

for row in main_cases:
    for person in row['PersonsData']:
        unique_attributes.update(list(person['InfoSection'].keys()))
        
print(sorted(unique_attributes))

['Age at disappearance', 'Aliases', 'Amputation', 'Backpack', 'Bathing Suit', 'Belt', 'Bio group', 'Blouse', 'Boots', 'Build', 'Cane', 'Chain', 'Coat', 'Complexion', 'Coveralls', 'Deformity', 'Discovered on', 'Dress', 'Est. age', 'Eye colour', 'Foreign Object', 'Fractured/Broken Bones', 'Gender', 'Glasses', 'Gloves', 'Hair', 'Hat', 'Height', 'Jacket', 'Jeans', 'Jewelry', 'Leggings', 'Luggage', 'Mark', 'Medication', 'Missing since', 'Money', 'Nightgown', 'Other', 'Overalls', 'Pants', 'Pierce', 'Purse', 'Pyjama', 'Ring', 'Scar', 'Scarf', 'Shirt', 'Shoes', 'Shorts', 'Skirt', 'Socks', 'Suit', 'Sweater', 'Tattoo', 'Teeth', 'Tie', 'Underwear', 'Vest', 'Waders', 'Wallet', 'Watch', 'Weight', 'Year of birth']


In [6]:
# The data comes in as a list of cases with multiple people per case
# This method will convert the cases to a list of people with a unique key for each person as the index
# Use a list of people to do logic with, and then convert back to cases for the front end
def flatten_cases_to_people(cases):
    people = {}
    
    for case in cases:
        case_without_persons = case.copy()
        case_without_persons.pop('PersonsData')
        
        for person in case['PersonsData']:
            key = case['CaseRef'] + '%%' + person['Name']
            
            people[key] = person['InfoSection'] | case_without_persons
            people[key]['Name'] = person['Name']
            
    return people


persons_data = flatten_cases_to_people(main_cases)

missing_persons = [persons_data[p] for p in persons_data if persons_data[p]["CaseType"] == "Missing"]
unidentified_persons = [persons_data[p] for p in persons_data if persons_data[p]["CaseType"] == "Unidentified"]

# Unique Attributes Useful for Filtering
- [x] **Discovered on**
- [x] **Missing since**
- [x] Year of birth
- [x] Est. Age
- [x] Age at disappearance
- [x] Hair
- [x] Tattoo
- [x] Marks
- [ ] Complexion
- [ ] Eye colour
- [x] Height
- [x] Weight

In [7]:
def format_age_range(raw_age_range_string):
    result = re.search(r'(\d+)[^\d]+?(\d+)', raw_age_range_string, re.M)
    
    if result is None or len(result.groups()) != 2:
        return None
    
    g = list(result.groups())
    return [int(g[0]), int(g[1])]

In [8]:
def format_weight(raw_weight_string) -> int:
    result = re.search(r'(\d+)kg', raw_weight_string, re.M)
    
    if result is None or len(result.groups()) != 1:
        return None 
    
    return int(result.groups()[0])

In [9]:
# Missing people have a percise height, unidentified remains have a range or a percise height

def format_height(raw_height_string) -> int:
    result = re.search(r'(\d+)cm', raw_height_string, re.M)
    
    if result is None or len(result.groups()) != 1:
        return None 
    
    return int(result.groups()[0])

def format_height_range(raw_height_string):
    result = re.search(r'(\d+)cm.* (\d+)cm', raw_height_string, re.M)
    
    if result is None or len(result.groups()) != 2:
        return None
    
    g = list(result.groups())
    return [int(g[0]), int(g[1])]

In [10]:
def was_found_after_reported(unidentified, missing) -> bool:
    unidentified_found = dateparser.parse(unidentified["Discovered on"][0])
    missing_missing = dateparser.parse(missing["Missing since"][0])

    return  unidentified_found > missing_missing

In [11]:
def potential_gender_match(unidentified, missing) -> bool:
    try:
        unidentified_gender = unidentified["Gender"][0]
        missing_gender = missing["Gender"][0]
    
        if unidentified_gender == missing_gender:
            return True
        elif unidentified_gender.lower() not in ["male", "female"] or missing_gender.lower() not in ["male", "female"]:
            return True
    except:
        return True
        
    return False

In [12]:
def age_approximately_as_expected(u, m) -> bool:
    unidentified_age_est = format_age_range(u["Est. age"][0])
            
    if unidentified_age_est is None:
        return False
    
    # tweak the range to be a bit more generous
    unidentified_age_est = [unidentified_age_est[0] * 0.8, unidentified_age_est[1] * 1.2]


    # this range goes from the youngest possible age, the age at disappearance, to the
    # oldest, which is how old they'd be when the unidentified remains were found    
    missing_age_range = (int(m["Age at disappearance"][0]),
                         (dateparser.parse(u["Discovered on"][0]) - dateparser.parse(m["Year of birth"][0])).days / 365)
        
    # return if there's overlap between the two ranges
    return unidentified_age_est[1] > missing_age_range[0] and unidentified_age_est[0] < missing_age_range[1]

In [13]:
MAX_WEIGHT_DIFFERENCE = 8 #kg

def weight_approximately_as_expected(u, m) -> bool:
    
    # ignore if no weight value
    if not('Weight' in u and 'Weight' in m):
        return True
    
    u_weight = format_weight(u['Weight'])
    m_weight = format_weight(m['Weight'])
    
    # ignore if somehow improperly formatted
    if u_weight is None or m_weight is None:
        return True
    
    # check if weight in range
    return u_weight > (m_weight - MAX_WEIGHT_DIFFERENCE) and u_weight < (m_weight + MAX_WEIGHT_DIFFERENCE)

# TODO: change weight range as a function of time. e.g. if they have been missing for 2 days, use tight range
#  if missing for years have a more generous range

In [14]:
MAX_HEIGHT_DIFFERENCE = 15 # cm

def height_approximately_as_expected(u, m) -> Optional[float]:
    # ignore if no weight value
    if not('Height' in u and 'Height' in m):
        return 0.0
    
    m_height = format_height(m['Weight'])
    u_height = format_height_range(u['Weight'])

    if u_height is None:
        u_height = format_height(u['Weight'])
    else:
        # make sure height is not a range
        u_height = (u_height[0] + u_height[1]) / 2
        
    # ignore if can't parse
    if m_height is None or u_height is None:
        return 0.0

    # number of cm difference
    height_closeness = abs(u_height - m_height)

    if height_closeness > MAX_HEIGHT_DIFFERENCE:
        return None

    return 1 - (height_closeness / MAX_HEIGHT_DIFFERENCE)

In [15]:
# naive check to see if either both or neither have a "Tattoo" key
def tattoo_matching(u, m) -> bool:
    return ("Tattoo" in u) == ("Tattoo" in m)

In [16]:
# naive check to see if either both or neither have a "Mark" key
def mark_matching(u, m) -> bool:
    return ("Mark" in u) == ("Mark" in m)

In [17]:
def height_matching(u, m) -> bool:
    return True

In [18]:
def potential_hair_match(unidentified, missing) -> bool:
    try:
        unidentified_hair = unidentified["Hair"][0].lower()
        missing_hair = missing["Hair"][0].lower()
    
        if unidentified_hair == missing_hair:
            return True
    except:
        # if we can't pull any hair data from either, it stays a potential match
        return True

    return False

In [19]:
def potential_bio_group_match(unidentified, missing) -> bool:
    try:
        unidentified_bio_group = unidentified["Bio group"][0].lower()
        missing_bio_group = missing["Bio group"][0].lower()
    
        if unidentified_bio_group == missing_bio_group:
            return True
    except:
        return True

    return False

In [20]:
%%time

potential_matches_complete_objects = []

comparison_functions = [
    was_found_after_reported,
    potential_gender_match,
    age_approximately_as_expected,
    potential_hair_match,
    tattoo_matching,
    potential_bio_group_match,
    height_matching,
    weight_approximately_as_expected,
    height_approximately_as_expected,
    # mark_matching,
]

def try_all_match_functions(unidentified, missing) -> Optional[float]:
    match_score = 0.0
    for f in comparison_functions:
        try:
            comparison_score = f(unidentified, missing)
            if comparison_score is None or comparison_score is False:
                return None
            else:
                match_score += comparison_score
        except Exception as e:
            return None
    
    return match_score


for unidentified, missing in product(unidentified_persons, missing_persons):
    # do comparisons of missing and unidentified using all known comparison functions
    # If it's a possible match, append to the list of potential matches
    if try_all_match_functions(unidentified, missing):
        potential_matches_complete_objects.append((unidentified, missing))
        
num_matches = len(potential_matches_complete_objects)
possible_total_matches = len(unidentified_persons)*len(missing_persons)

print(f"Reduced to {num_matches} out of a possible {possible_total_matches} matches ({int(100*num_matches/possible_total_matches)}%)")
print(json.dumps(potential_matches_complete_objects[0], indent=2))

  date_obj = stz.localize(date_obj)


Reduced to 23453 out of a possible 756136 matches (3%)
[
  {
    "Discovered on": [
      "August 7, 1985"
    ],
    "Est. age": [
      "50\n             to 60"
    ],
    "Gender": [
      "Male"
    ],
    "Bio group": [
      "White"
    ],
    "Hair": [
      "Grey, White, It is unknown whether he had any scalp hair but a few white hairs were noted on the back of his neck"
    ],
    "Teeth": [
      "Several of his teeth were missing prior to his death and he has some fillings. He also has a cast chrome partial upper denture."
    ],
    "Mark": [
      "Left Side Of Head, Raised \"age spots\" on his head, left cheek and behind his left ear."
    ],
    "Deformity": [
      "Upper Left Back, A papilloma (benign non-cancerous tumor) over his left shoulder blade"
    ],
    "Scar": [
      "Right Pelvis, 2cm right inguinal (groin area) scar"
    ],
    "CaseRef": "Case reference: 2005007499",
    "CaseDesc": "On August 7, 1985, an unidentified man, believed to be between the ages 