In [None]:
import json
import glob
import re
from functools import cache
from itertools import product
from os.path import join

import dateparser

# Memoise dateparser.parse so that a date string seen more than once is only parsed once
dateparser.parse = cache(dateparser.parse)

In [None]:
# URLs: one stripped entry per line of list.csv, skipping the first line
# (presumably a header row -- confirm against the file)
with open('list.csv') as f:
    URLs = [link.strip() for link in f.readlines()[1:]]

In [None]:
# Gather the JSON files from the data folder in a stable, sorted order
json_files = sorted(glob.glob(join("data", "*.json")))

# Flat list accumulating the case records found in every file
main_cases = []

# Each file is loaded as a mapping; only its values (the case records) are kept
for file in json_files:
    with open(file) as f:
        data = json.load(f)
    main_cases.extend(data.values())

In [None]:
# Every attribute name that appears in any person's InfoSection, across all cases
unique_attributes = {
    attribute
    for row in main_cases
    for person in row['PersonsData']
    for attribute in person['InfoSection'].keys()
}

print(sorted(unique_attributes))

In [None]:
# The data arrives as a list of cases, each holding several people.
# Matching logic works on individual people, so cases are flattened here;
# the result can be regrouped into cases later for the front end.
def flatten_cases_to_people(cases):
    """Flatten a list of cases into a dict of person records.

    Each person record is the person's ``InfoSection`` merged with every
    case-level field except ``PersonsData`` (case-level values win when a key
    exists in both), plus the person's ``Name``.

    Args:
        cases: iterable of case dicts providing ``CaseRef`` and ``PersonsData``;
            every entry of ``PersonsData`` provides ``Name`` and ``InfoSection``.

    Returns:
        dict keyed by ``"<CaseRef>%%<Name>"``. Two people with the same name in
        the same case share a key, so the later one replaces the earlier one.
    """
    people = {}

    for case in cases:
        # case-level fields shared by everyone in this case; the input is not mutated
        shared_fields = dict(case)
        del shared_fields['PersonsData']

        for person in case['PersonsData']:
            key = '%%'.join((case['CaseRef'], person['Name']))
            people[key] = {**person['InfoSection'], **shared_fields, 'Name': person['Name']}

    return people


persons_data = flatten_cases_to_people(main_cases)

# Split the flattened people by the CaseType of the case they came from
missing_persons = [person for person in persons_data.values() if person["CaseType"] == "Missing"]
unidentified_persons = [person for person in persons_data.values() if person["CaseType"] == "Unidentified"]

# Unique Attributes Useful for Filtering
- [x] **Discovered on**
- [x] **Missing since**
- [x] Year of birth
- [x] Est. age
- [x] Age at disappearance
- [x] Hair
- [x] Tattoo
- [x] Marks
- [ ] Complexion
- [ ] Eye colour
- [ ] Height
- [ ] Weight 

In [None]:
def format_age_range(raw_age_range_string):
    """Extract an age range from free text.

    Finds the first two integers that are separated by at least one
    non-digit character (e.g. ``"20 - 40"``).

    Returns:
        ``[low, high]`` as ints, or ``None`` when the text does not contain
        two such integers (a single number such as ``"35"`` yields ``None``).
    """
    match = re.search(r'(\d+)[^\d]+?(\d+)', raw_age_range_string, re.M)

    if match is None:
        return None

    # the pattern has exactly two capture groups, so this unpacking cannot fail
    low, high = match.groups()
    return [int(low), int(high)]

In [None]:
def format_weight(raw_weight_string) -> "int | None":
    """Extract a whole-number weight from free text.

    The original pattern only recognised a ``cm`` suffix, which is a length
    unit, so weights written in kilograms never parsed. ``kg`` is now accepted;
    ``cm`` is still accepted so previously parsable values keep parsing.
    NOTE(review): confirm which unit the source data actually uses.

    Args:
        raw_weight_string: text such as ``"70kg"`` or ``"70.5 kg"``.

    Returns:
        The number in front of the unit, truncated to an int, or ``None`` when
        no number followed by a recognised unit is found.
    """
    # optional decimals are captured so "70.5kg" reads as 70 rather than 5
    match = re.search(r'(\d+(?:\.\d+)?)\s*(?:kg|cm)', raw_weight_string, re.IGNORECASE)

    if match is None:
        return None

    return int(float(match.group(1)))

In [None]:
def was_found_after_reported(unidentified, missing) -> bool:
    """Return True when the unidentified person was discovered after the missing person disappeared.

    Reads the first entry of ``unidentified["Discovered on"]`` and of
    ``missing["Missing since"]`` and compares the parsed dates. Raises if a key
    is absent or if the two parsed values cannot be compared.
    """
    discovered_on = dateparser.parse(unidentified["Discovered on"][0])
    missing_since = dateparser.parse(missing["Missing since"][0])

    found_later = discovered_on > missing_since
    return found_later

In [None]:
def potential_gender_match(unidentified, missing) -> bool:
    """Return False only when both records state a definite and different gender.

    The first entry of each record's ``"Gender"`` attribute is compared
    case-insensitively, so ``"Male"`` and ``"male"`` count as equal (the
    membership test below was already case-insensitive; the equality test was not).

    Returns:
        True when the genders are equal, when either value is something other
        than male/female, or when the gender cannot be read from either record;
        False when one is male and the other female.
    """
    definite_genders = ("male", "female")

    try:
        unidentified_gender = unidentified["Gender"][0].lower()
        missing_gender = missing["Gender"][0].lower()
    except (KeyError, IndexError, TypeError, AttributeError):
        # absent or malformed gender data cannot rule the pair out;
        # only lookup errors are swallowed so KeyboardInterrupt etc. still propagate
        return True

    if unidentified_gender == missing_gender:
        return True

    # an indeterminate value on either side keeps the pair as a potential match
    return unidentified_gender not in definite_genders or missing_gender not in definite_genders

In [None]:
def age_approximately_as_expected(u, m) -> bool:
    """Check whether the unidentified person's estimated age could fit the missing person.

    Args:
        u: unidentified person record; reads ``"Est. age"`` and ``"Discovered on"``.
        m: missing person record; reads ``"Age at disappearance"`` and ``"Year of birth"``.

    Returns:
        False when the estimated age cannot be parsed into a range; otherwise
        True when the widened estimate overlaps the missing person's possible
        age range.
    """
    estimate = format_age_range(u["Est. age"][0])

    if estimate is None:
        return False

    # be a bit more generous than the stated estimate: 20% slack on each end
    estimate_low = estimate[0] * 0.8
    estimate_high = estimate[1] * 1.2

    # youngest possible age: the age at disappearance
    youngest = int(m["Age at disappearance"][0])

    # oldest possible age: how old they would have been when the remains were found
    lifespan = dateparser.parse(u["Discovered on"][0]) - dateparser.parse(m["Year of birth"][0])
    oldest = lifespan.days / 365

    # the two ranges overlap when each one starts before the other ends
    return estimate_high > youngest and estimate_low < oldest

In [None]:
# Largest allowed gap between the two parsed weights, in whatever unit format_weight returns.
# NOTE(review): this was labelled "cm"; a weight is presumably meant to be in kg -- confirm.
MAX_WEIGHT_DIFFERENCE = 6


def _first_weight_text(person):
    """Return the first 'Weight' entry of a person record as a string, or None if unavailable."""
    raw = person.get('Weight')

    # the other comparison functions in this notebook read attributes as record[key][0],
    # i.e. values are lists; unwrap the first entry (a bare string is accepted too)
    if isinstance(raw, (list, tuple)):
        raw = raw[0] if raw else None

    return raw if isinstance(raw, str) else None


def weight_approximately_as_expected(u, m) -> bool:
    """Check whether two recorded weights are within MAX_WEIGHT_DIFFERENCE of each other.

    Previously the raw attribute value was handed straight to ``format_weight``.
    When that value is a list, the regex search raises ``TypeError``, so the
    comparison failed instead of being evaluated. The first entry is now unwrapped.

    Args:
        u: unidentified person record.
        m: missing person record.

    Returns:
        True when either record has no usable weight (the check is skipped) or
        when the parsed weights differ by less than ``MAX_WEIGHT_DIFFERENCE``;
        False otherwise.
    """
    u_text = _first_weight_text(u)
    m_text = _first_weight_text(m)

    # ignore if no weight value on either side
    if u_text is None or m_text is None:
        return True

    u_weight = format_weight(u_text)
    m_weight = format_weight(m_text)

    # ignore if somehow improperly formatted
    if u_weight is None or m_weight is None:
        return True

    # strictly inside (m_weight - MAX, m_weight + MAX)
    return abs(u_weight - m_weight) < MAX_WEIGHT_DIFFERENCE

# TODO: change weight range as a function of time. e.g. if they have been missing for 2 days, use tight range
#  if missing for years have a more generous range

In [None]:
def tattoo_matching(u, m) -> bool:
    """Naive tattoo check: True when both records have a "Tattoo" key or neither does.

    Only the presence of the key is compared, never its content.
    """
    u_has_tattoo = "Tattoo" in u
    m_has_tattoo = "Tattoo" in m
    return u_has_tattoo == m_has_tattoo

In [None]:
def mark_matching(u, m) -> bool:
    """Naive marks check: True when both records have a "Mark" key or neither does.

    Only the presence of the key is compared, never its content.
    NOTE(review): the attribute checklist in this notebook lists "Marks";
    confirm that "Mark" is the key actually present in the data.
    """
    u_has_mark = "Mark" in u
    m_has_mark = "Mark" in m
    return u_has_mark == m_has_mark

In [None]:
def height_matching(u, m) -> bool:
    """Placeholder height check: accepts every pair.

    Neither argument is inspected, so this never filters anything out.
    """
    # TODO: compare heights once a height parser exists
    return True

In [None]:
def potential_hair_match(unidentified, missing) -> bool:
    """Return True unless both records state a hair value and the values differ.

    The first entry of each record's ``"Hair"`` attribute is compared
    case-insensitively.

    Returns:
        True when the values are equal or when hair data cannot be read from
        either record; False when both are readable and different.
    """
    try:
        unidentified_hair = unidentified["Hair"][0].lower()
        missing_hair = missing["Hair"][0].lower()
    except (KeyError, IndexError, TypeError, AttributeError):
        # if we can't pull any hair data from either, it stays a potential match.
        # Only lookup errors are swallowed (the original bare except also hid
        # KeyboardInterrupt and SystemExit).
        return True

    return unidentified_hair == missing_hair

In [None]:
def potential_bio_group_match(unidentified, missing) -> bool:
    """Return True unless both records state a "Bio group" and the values differ.

    The first entry of each record's ``"Bio group"`` attribute is compared
    case-insensitively.

    Returns:
        True when the values are equal or when the attribute cannot be read
        from either record; False when both are readable and different.
    """
    try:
        unidentified_bio_group = unidentified["Bio group"][0].lower()
        missing_bio_group = missing["Bio group"][0].lower()
    except (KeyError, IndexError, TypeError, AttributeError):
        # unreadable data cannot rule the pair out. Only lookup errors are
        # swallowed (the original bare except also hid KeyboardInterrupt and SystemExit).
        return True

    return unidentified_bio_group == missing_bio_group

In [None]:
%%time

# (unidentified, missing) pairs that pass every comparison function; filled by the loop below
potential_matches_complete_objects = []

# Checks applied to every (unidentified, missing) pair. Each takes the two person
# records and returns a bool. try_all_match_functions runs them in this order and
# stops at the first one that returns a falsy value or raises.
comparison_functions = [
    was_found_after_reported,
    potential_gender_match,
    age_approximately_as_expected,
    potential_hair_match,
    tattoo_matching,
    # mark_matching,  (currently disabled)
    potential_bio_group_match,
    height_matching,
    weight_approximately_as_expected
]

def try_all_match_functions(unidentified, missing) -> bool:
    """Run every function in ``comparison_functions`` against one pair of records.

    Returns:
        True only when every check returns a truthy value. Evaluation stops at
        the first check that returns a falsy value or raises an ``Exception``;
        both cases yield False.
    """
    checks = iter(comparison_functions)

    try:
        # all() short-circuits on the first failing check, in list order
        return all(check(unidentified, missing) for check in checks)
    except Exception:
        # a check that cannot be evaluated counts as "not a match"
        return False


# Compare every unidentified person with every missing person using all known
# comparison functions; pairs that pass every check are kept as potential matches
for unidentified, missing in product(unidentified_persons, missing_persons):
    if try_all_match_functions(unidentified, missing):
        potential_matches_complete_objects.append((unidentified, missing))

num_matches = len(potential_matches_complete_objects)
possible_total_matches = len(unidentified_persons) * len(missing_persons)

# With no unidentified or no missing persons there are zero candidate pairs;
# report 0% instead of dividing by zero
match_percentage = int(100 * num_matches / possible_total_matches) if possible_total_matches else 0

print(f"Reduced to {num_matches} out of a possible {possible_total_matches} matches ({match_percentage}%)")

# Show one example pair, but only if at least one pair survived the filters
if potential_matches_complete_objects:
    print(json.dumps(potential_matches_complete_objects[0], indent=2))