In [None]:
import json
import glob
from os.path import join
import dateparser
import pandas as pd

In [None]:
# URLs: every line of list.csv except the first, with surrounding whitespace removed
with open('list.csv') as f:
    next(f, None)  # discard the first line (presumably a header row)
    URLs = [line.strip() for line in f]

In [None]:
# Columns that are placed first, in this order, in the combined frame and the CSV.
CASE_COLUMNS = ["CaseRef", "CaseDesc", "CaseType", "CaseURL", "PersonsData"]

# get all the json files in the folder (sorted so the row order is stable)
json_files = sorted(glob.glob(join("data", "*.json")))

# Read every file into its own frame first and concatenate once at the end:
# calling pd.concat inside the loop re-copies all accumulated rows each time,
# and seeding it with an empty frame is deprecated in recent pandas releases.
case_frames = []
for file in json_files:
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    # Each value of the top-level mapping becomes one row.
    temp_df = pd.DataFrame(list(data.values()))
    if not temp_df.empty:
        case_frames.append(temp_df)

if case_frames:
    # Row labels restart at 0 for each file, exactly as before.
    main_df = pd.concat(case_frames)
    # Keep the expected columns first, followed by any extra ones in the
    # order they were first seen.
    extra_columns = [col for col in main_df.columns if col not in CASE_COLUMNS]
    main_df = main_df.reindex(columns=CASE_COLUMNS + extra_columns)
else:
    # No usable input: still produce an empty frame with the expected columns.
    main_df = pd.DataFrame(columns=CASE_COLUMNS)

# write the combined data out as csv
main_df.to_csv(join("data", "json_converted.csv"))

In [None]:
# Gather every distinct InfoSection key found on any person of any case.
unique_attriutes = {
    attribute
    for persons in main_df['PersonsData']
    for person in persons
    for attribute in person['InfoSection'].keys()
}

print(unique_attriutes)

# Unique Attributes Useful for Filtering
- **Discovered on**
- **Missing since**
- Year of birth
- Est. Age
- Age at disappearance
- Complexion
- Eye colour
- Height

In [None]:
# Split the cases into two frames according to their CaseType value.
missing_df = main_df.loc[main_df['CaseType'].eq('Missing')]
unidentified_df = main_df.loc[main_df['CaseType'].eq('Unidentified')]

In [None]:
%%time

def get_all_attribute_from_all_persons(attribute_name, attribute_function, df):
    """Collect one converted InfoSection attribute for every person in ``df``.

    Args:
        attribute_name: Key to look up in each person's ``InfoSection`` mapping.
        attribute_function: Callable applied to the first value stored under
            that key (for example a date parser).
        df: DataFrame with a ``CaseURL`` column and a ``PersonsData`` column
            whose cells are iterables of person dicts.

    Returns:
        A list of ``[case_url, converted_value]`` pairs, in row order and then
        person order. A person whose ``InfoSection`` has no entry, or an empty
        entry, for ``attribute_name`` is skipped, so one incomplete record
        does not abort the whole scan.
    """
    people_with_attribute = []

    # Only two columns are needed, so walk them directly instead of
    # materialising a Series per row with iterrows().
    for case_url, persons in zip(df['CaseURL'], df['PersonsData']):
        for person in persons:
            values = person['InfoSection'].get(attribute_name)
            if not values:
                # Attribute absent or empty for this person: nothing to convert.
                continue
            people_with_attribute.append([case_url, attribute_function(values[0])])

    return people_with_attribute
            
# Parse the 'Discovered on' date of every person in the unidentified cases.
unidentified_disovered_at = get_all_attribute_from_all_persons(
    attribute_name='Discovered on',
    attribute_function=dateparser.parse,
    df=unidentified_df,
)
# Parse the 'Missing since' date of every person in the missing cases.
missing_missing_since = get_all_attribute_from_all_persons(
    attribute_name='Missing since',
    attribute_function=dateparser.parse,
    df=missing_df,
)

In [None]:
# Each entry is a [case_url, parsed_date] pair. The date is None when the
# parser could not interpret the source text; None cannot be ordered against
# a datetime (TypeError), so undated entries are dropped before comparing.
dated_missing = [entry for entry in missing_missing_since if entry[1] is not None]
dated_unidentified = [entry for entry in unidentified_disovered_at if entry[1] is not None]

# A missing person is a candidate for an unidentified person only if they
# went missing strictly before the unidentified person was discovered.
potential_matches = [
    [missing_person[0], unidentified[0]]
    for missing_person in dated_missing
    for unidentified in dated_unidentified
    if missing_person[1] < unidentified[1]
]

print(potential_matches)