In [None]:
import json
import glob
import re
from functools import cache
from itertools import product
from os.path import join

import dateparser
import pandas as pd

In [None]:
# Case URLs: one per line in list.csv, with the header row skipped
with open('list.csv') as url_file:
    url_lines = url_file.readlines()
    URLs = [line.strip() for line in url_lines[1:]]

In [None]:
# Find every per-case JSON dump in the data folder (sorted for a stable row order)
json_files = sorted(glob.glob(join("data", "*.json")))

# Columns every case record is expected to provide; they always come first
case_columns = ["CaseRef", "CaseDesc", "CaseType", "CaseURL", "PersonsData"]

# Read each file into its own frame and combine them once afterwards.
# Calling pd.concat inside the loop re-copies all previous rows on every
# iteration (quadratic) and concatenates onto an empty frame.
case_frames = []
for file in json_files:
    with open(file) as f:
        data = json.load(f)
    case_frames.append(pd.DataFrame(list(data.values())))

if case_frames:
    # No ignore_index: each file keeps its own 0..n-1 row labels, so the
    # combined frame and the exported CSV look exactly as before.
    main_df = pd.concat(case_frames)
    extra_columns = [column for column in main_df.columns if column not in case_columns]
    main_df = main_df.reindex(columns=case_columns + extra_columns)
else:
    # pd.concat([]) raises ValueError; fall back to an empty, correctly shaped frame
    main_df = pd.DataFrame(columns=case_columns)

# Export the combined cases as CSV
main_df.to_csv(join("data", "json_converted.csv"))

In [None]:
# Split the combined cases by their CaseType value
missing_df = pd.DataFrame(main_df.loc[main_df['CaseType'].eq('Missing')])
unidentified_df = pd.DataFrame(main_df.loc[main_df['CaseType'].eq('Unidentified')])



In [None]:
def get_attributes(dataframe):
    """Return the set of all ``InfoSection`` keys used by any person in ``dataframe``.

    Each row's ``PersonsData`` entry is iterated as a collection of person
    mappings, and the keys of every person's ``InfoSection`` are gathered
    into one set.
    """
    return {
        attribute_name
        for _, case in dataframe.iterrows()
        for person in case['PersonsData']
        for attribute_name in person['InfoSection'].keys()
    }

# Print one markdown-style bullet list of attribute names per case type.
# Plain loops are used instead of list comprehensions so the cell does not
# echo a list of None values, and the names are sorted because set iteration
# order is not stable between kernel restarts.
for heading, cases in (
    ('## All Missing Persons Attributes', missing_df),
    ('## All Unidentified Remains Attributes', unidentified_df),
):
    print(heading)
    for attribute in sorted(get_attributes(cases)):
        print('- ' + attribute)


# Unique Attributes Useful for Filtering
- **Discovered on**
- **Missing since**
- Year of birth
- Est. age
- Age at disappearance
- Complexion
- Eye colour
- Height

## All Missing Persons Attributes
- Pants
- Socks
- Suit
- Missing since
- Eye colour
- Tie
- Chain
- Waders
- Scar
- Jewelry
- Hair
- Bathing Suit
- Foreign Object
- Aliases
- Sweater
- Gloves
- Complexion
- Backpack
- Boots
- Vest
- Skirt
- Jeans
- Amputation
- Blouse
- Scarf
- Money
- Gender
- Jacket
- Age at disappearance
- Dress
- Overalls
- Height
- Coveralls
- Luggage
- Ring
- Leggings
- Underwear
- Cane
- Coat
- Deformity
- Shirt
- Shoes
- Purse
- Watch
- Pyjama
- Tattoo
- Glasses
- Belt
- Weight
- Build
- Shorts
- Wallet
- Hat
- Other
- Pierce
- Bio group
- Nightgown
- Teeth
- Fractured/Broken Bones
- Year of birth
- Mark

## All Unidentified Remains Attributes
- Pants
- Socks
- Suit
- Eye colour
- Chain
- Tie
- Scar
- Jewelry
- Hair
- Bathing Suit
- Foreign Object
- Aliases
- Sweater
- Gloves
- Backpack
- Complexion
- Medication
- Boots
- Vest
- Skirt
- Jeans
- Amputation
- Blouse
- Scarf
- Money
- Gender
- Jacket
- Coveralls
- Dress
- Height
- Leggings
- Underwear
- Ring
- Luggage
- Coat
- Deformity
- Shirt
- Shoes
- Purse
- Watch
- Pyjama
- Tattoo
- Glasses
- Belt
- Discovered on
- Weight
- Build
- Shorts
- Wallet
- Hat
- Other
- Pierce
- Bio group
- Teeth
- Fractured/Broken Bones
- Est. age
- Mark


In [None]:
def format_age_range(raw_age_range_string):
    """Parse a textual age range such as ``"20 - 40 years"`` into ``[20, 40]``.

    Parameters
    ----------
    raw_age_range_string : str or None
        Free-text age range. Non-string values (``None``, NaN) are accepted
        because the attribute is absent for some persons.

    Returns
    -------
    list[int] or None
        ``[first_number, second_number]`` when two whole numbers separated by
        at least one non-digit character are found, otherwise ``None``.
    """
    if not isinstance(raw_age_range_string, str):
        return None

    # \D+ between the groups keeps each number intact; a greedy ``.*`` would
    # swallow all but the last digit of the second number.
    result = re.search(r'(\d+)\D+(\d+)', raw_age_range_string)
    if result is None:
        return None

    return [int(result.group(1)), int(result.group(2))]

In [None]:
# Re-derive the per-type frames from main_df (same split as the earlier cell)
case_type = main_df['CaseType']
missing_df = main_df[case_type == 'Missing']
unidentified_df = main_df[case_type == 'Unidentified']

In [None]:
%%time

def get_all_attribute_from_all_persons(attribute_name, attribute_function, df):
    """Collect one transformed attribute for every person in ``df``.

    Parameters
    ----------
    attribute_name : str
        Key looked up in each person's ``InfoSection`` mapping.
    attribute_function : callable
        Applied to the first value stored under ``attribute_name``.
    df : pandas.DataFrame
        Cases with a ``CaseURL`` column and a ``PersonsData`` column that
        holds a collection of person mappings per row.

    Returns
    -------
    list[list]
        ``[case_url, attribute_function(first_value)]`` pairs in row/person
        order. Persons that lack the attribute, or store no value for it,
        are skipped instead of raising ``KeyError``/``IndexError``.
    """
    people_with_attribute = []

    for _, row in df.iterrows():
        for person in row['PersonsData']:
            values = person['InfoSection'].get(attribute_name)
            if not values:
                # Attribute missing or empty for this person: nothing to record
                continue
            people_with_attribute.append([row['CaseURL'], attribute_function(values[0])])

    return people_with_attribute
            
# One memoised parser shared by both passes, so a date string that occurs in
# both data sets is parsed only once (previously each call built its own cache).
parse_date = cache(dateparser.parse)

# The misspelled name is kept as-is because the matching cell below reads it.
unidentified_disovered_at = get_all_attribute_from_all_persons('Discovered on', parse_date, unidentified_df)
missing_missing_since = get_all_attribute_from_all_persons('Missing since', parse_date, missing_df)

In [None]:
def person_data_data_frame(df, attributes):
    """Build a one-row-per-person frame holding the requested attributes.

    The result has a ``Person URL`` column (the ``CaseURL`` of the case the
    person belongs to) followed by one column per entry in ``attributes``.
    Each attribute cell holds the first value stored under that key in the
    person's ``InfoSection``, or ``None`` when the key is absent.
    """
    columns = {'Person URL': [], **{name: [] for name in attributes}}

    for _, case in df.iterrows():
        for person in case['PersonsData']:
            columns['Person URL'].append(case['CaseURL'])

            for name in attributes:
                info = person['InfoSection']
                columns[name].append(info[name][0] if name in info else None)

    return pd.DataFrame(data=columns)

In [None]:
# Per-person view of the unidentified cases, then the estimated ages run
# through format_age_range
my_df = person_data_data_frame(unidentified_df, attributes=['Discovered on', 'Est. age'])
parsed_age_ranges = my_df['Est. age'].apply(format_age_range)
print(parsed_age_ranges)

In [None]:

# Pair every missing person with every unidentified person that was
# discovered after the missing person disappeared.
potential_matches = []

for missing_person in missing_missing_since:
    # dateparser.parse yields None for text it cannot interpret; such entries
    # cannot be ordered, so they are left out instead of raising TypeError.
    if missing_person[1] is None:
        continue
    for unidentified in unidentified_disovered_at:
        if unidentified[1] is None:
            continue
        if missing_person[1] < unidentified[1]:
            potential_matches.append([missing_person[0], unidentified[0]])

# Show one example pair; guard against an empty result
if potential_matches:
    print(potential_matches[0])
else:
    print('No potential matches found')

In [None]:
@cache
def cached_datetime_parser(*args, **kwargs):
    """Memoised pass-through to ``dateparser.parse``.

    All positional and keyword arguments are forwarded unchanged. Because
    results are stored by ``functools.cache``, every argument must be
    hashable, and repeated calls with the same arguments return the stored
    result without parsing again.
    """
    parsed = dateparser.parse(*args, **kwargs)
    return parsed

In [None]:
def was_found_after_reported(unidentified, missing) -> bool:
    """Placeholder comparison predicate; not implemented yet.

    The body is only ``pass``, so every call currently returns ``None``
    (falsy) rather than the annotated ``bool``.

    NOTE(review): judging by the name and signature, this is presumably meant
    to report whether the unidentified case was discovered after the missing
    case was reported. TODO: confirm the intended rule and implement it.
    """
    pass

In [None]:
%%time

potential_matches_complete_objects = []

# Predicates called as f(unidentified_row, missing_row). A pair is kept only
# if every predicate returns a truthy value. While this list is empty, all()
# is vacuously True and every pair is kept.
comparison_functions = []

for (_, unidentified), (_, missing) in product(unidentified_df.iterrows(), missing_df.iterrows()):
    # do comparisons of missing and unidentified using all known comparison functions
    # If it's a possible match, append to the list of potential matches
    if all(f(unidentified, missing) for f in comparison_functions):
        potential_matches_complete_objects.append((unidentified, missing))

# Show the first candidate pair; guard against an empty result so the cell
# does not fail with IndexError when nothing matches.
if potential_matches_complete_objects:
    first_unidentified, first_missing = potential_matches_complete_objects[0]
    print(first_unidentified)
    print()
    print(first_missing)
else:
    print('No potential matches found')