## Missing Persons DB Webscrape

https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf

Search with no criteria will bring up all results.

The links to the missing persons pages can be appended to https://www.services.rcmp-grc.gc.ca/ to build the full page URLs.

### ========== Code

#### Get all the links to the more detailed case pages from the navigation pages

In [None]:
import time
import pandas as pd
import re
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup as bs
import requests
import json
import csv
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [None]:
# Search page of the missing persons database
link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'
# Get a chrome driver if there isn't one locally
service = Service(ChromeDriverManager().install())

In [None]:
# Collect the URL of every detailed case page by paging through the search
# results, then save them to list.csv.
from urllib.parse import urljoin
from selenium.common.exceptions import WebDriverException

# to store all the URLs (a set, so repeated links are only kept once)
URLs = set()

browser = webdriver.Chrome(service=service)
try:
    browser.get(link)
    time.sleep(2)

    # find search and click it to reach data
    try:
        search = browser.find_element(By.NAME, 'searchForm:j_idt158')
        search.click()
    except WebDriverException:
        print('could not find Search')

    time.sleep(3)

    not_last = True

    # go through each page and get all URLs
    while not_last:
        time.sleep(2) # wait more just in case
        page = browser.page_source
        pageSoup = bs(page, 'html.parser')

        # get all the case links on the page
        pageURLs = pageSoup.find_all('a', class_="wet-newwindow")

        print("Collecting URLs...")

        # The loop variable is deliberately not called `link`: that name holds
        # the search page URL and must survive a re-run of this cell.
        for anchor in pageURLs:
            href = anchor.get('href')
            if not href:
                # anchor without a target; nothing to store
                continue
            # urljoin avoids a doubled slash when the href starts with "/"
            URLs.add(urljoin("https://www.services.rcmp-grc.gc.ca/", href))

        print("Page Complete!")
        time.sleep(2) # wait a little

        # are we on the last page
        try:
            # click the next button at the bottom of the page
            # NOTE(review): absolute XPath tied to the current page layout;
            # any failure to find/click it is treated as "last page".
            next_page = browser.find_element(By.XPATH, '/html/body/main/form/div[33]/ul/li[83]/a')
            print('Found next button to press.')
            next_page.click()
            time.sleep(2) # wait for next page to load
        except WebDriverException:
            # should not have a next button on the last page
            print('last page or no next button found!')
            not_last = False
finally:
    # always release the browser, even when the scrape fails part-way
    browser.quit()

# the final list
print("================================== END ==================================")

# A sorted list makes the CSV reproducible and supports slicing/indexing.
URLs = sorted(URLs)

# write progress to csv
df = pd.DataFrame(URLs, columns=["URL"])
df.to_csv('list.csv', index=False)

#### Collect all the data from all the detailed case pages

This part of the code is adapted from the CBC script.

In [None]:
base_url = r'https://www.services.rcmp-grc.gc.ca'

#CLEANING FUNCTION
def cleaning_function(item):
    """Return ``str(item)`` with the known wrapper markup removed.

    The fragments are removed one after another in the listed order, so the
    full "Missing from" label is taken out before the bare <strong> tags.
    """
    text = str(item)
    fragments = (
        "<dd>",
        "</dd>",
        "<p>",
        "</p>",
        "<strong>Missing from </strong>",
        "<strong>",
        "</strong>",
    )
    for fragment in fragments:
        text = text.replace(fragment, "")
    return text

In [None]:
# Visit every case URL and append one flat row per case to person_info.

#this is the list where all the URLs from the sheet will go
person_url_list = []
#this is where all the person info will go
person_info = []

#a list for the sections later
section_list = []

# running count of the records visited, used for the progress print
count_working = 0

for page_url in URLs:
    print("Record Number: " + str(count_working))
    print("Case URL: " + page_url)
    count_working += 1
    # The collected URLs are already absolute; only bare paths get the host
    # prefixed. Prefixing unconditionally produced a doubled, unusable URL.
    url = page_url if page_url.startswith('http') else base_url + page_url

    # request the html
    try:
        page = requests.get(url, timeout = 10)
    except requests.exceptions.Timeout:
        print("Timeout occurred")
        # nothing was downloaded; skip instead of re-parsing the previous page
        continue
    # structure the page content for parsing
    soup = bs(page.content, 'html.parser')

    # First we have to pull out the content area
    content_area = soup.find('main' , {"property" : "mainContentOfPage"})
    if content_area is None:
        print('No content area')
        continue

    # Reset every field so a parsing failure can never leak the values of the
    # previous record into this one.
    city = province = 'Unknown'
    date_missing_discovered = year_born = age_at_disappearance = 'Unknown'
    gender = bio_group = 'Unknown'

    # LOCATION
    try:
        location_scrape = content_area.find('div')
        location_isolate = location_scrape.find_all('p')
        location_string = str(location_isolate[2])
        location_split = location_string.split(",")
        # city first, so it is kept even when no province follows the comma
        city = cleaning_function(location_split[0])
        province = cleaning_function(location_split[1])
    except (AttributeError, IndexError):
        print('No Location')

    # STATUS: text between the tags of the first <h2>
    status = str(content_area.find_all('h2')[:1])
    try:
        front_of_status = status.index('<h2>') + 4
        back_of_status = status.index('</h2>')
    except ValueError:
        print('No Status')
        continue
    status_cleaned = status[front_of_status : back_of_status]

    # FOR THE MISSING ENTRIES
    if 'Missing' in status_cleaned:
        #NAME(MISSING): first <h3>, formatted "Last, First"
        person_name = str(content_area.find_all('h3')[:1])
        try:
            front_of_name = person_name.index('<h3>') + 4
            back_of_name = person_name.index('</h3>')
        except ValueError:
            print('No Name')
            continue
        name_cleaned = person_name[front_of_name : back_of_name]
        name_split = name_cleaned.split(',')
        last_name = name_split[0]
        # Work on the string itself rather than str(list): the list repr
        # dropped apostrophes and left a literal "\n" in the name.
        if len(name_split) > 1:
            first_name_string = name_split[1].replace("\n", "").strip()
        else:
            first_name_string = ''

        #PERSON DETAILS(MISSING)
        person_details = content_area.find_all('dd')
        try:
            date_missing_discovered = person_details[0]
            year_born = person_details[1]
            age_at_disappearance = person_details[2]
            gender = person_details[3]
            bio_group = person_details[4]
        except IndexError:
            print('Data error')

    #FOR THE UNIDENTIFIED ENTRIES
    else:
        first_name_string = 'Unidentified'
        last_name = 'Unidentified'
        person_details = content_area.find_all('dd')
        try:
            date_missing_discovered = person_details[0]
            age_at_disappearance = person_details[1]
            gender = person_details[2]
            bio_group = person_details[3]
        except IndexError:
            print('Data error2')

    #PUT IT ALL TOGETHER
    person_info.append([
        first_name_string,
        last_name,
        status_cleaned,
        cleaning_function(date_missing_discovered),
        cleaning_function(year_born),
        cleaning_function(age_at_disappearance),
        cleaning_function(gender),
        cleaning_function(bio_group),
        city,
        province,
        url,
    ])
       

### Save the file

In [None]:
# Write the collected rows to output_rcmp.csv

full_file = pd.DataFrame(data=person_info)
full_file.to_csv(path_or_buf="output_rcmp.csv")
print('Done')

### To Avoid Running the URL Collector Again - Run Code Below

In [None]:
# Reload the case URLs from list.csv (a single "URL" column with a header row).
# The csv module undoes any quoting, and blank rows are skipped so no empty
# URL reaches the scraping loops.
with open('list.csv', newline='') as f:
    reader = csv.reader(f)
    # remove the column header
    next(reader, None)
    URLs = [row[0].strip() for row in reader if row and row[0].strip()]

print(URLs[:10])

#### Function to Turn DL sections into dict - No Longer Used!

In [None]:
def create_dl_dict(soup):
    """Pair up the <dt> and <dd> texts of every ``dl.dl-horizontal`` in *soup*.

    All term texts and all definition texts are gathered in document order
    and then zipped positionally, so the n-th term maps to the n-th definition.
    """
    terms = []
    definitions = []
    for dl_tag in soup.find_all("dl", {"class": "dl-horizontal"}):
        terms.extend(dt_tag.text.strip() for dt_tag in dl_tag.find_all("dt"))
        definitions.extend(dd_tag.text.strip() for dd_tag in dl_tag.find_all("dd"))

    return dict(zip(terms, definitions))

### Second Method - For More Structured Data

In [None]:
# Build one structured record per case page, keyed by its case reference,
# and dump everything to Complete_DB.json.

# complete file
complete_db = {}

# used to test (list() so this also works when URLs is a set)
test_URLs = list(URLs)[:2]

# loop through all the URLs
for count, page_url in enumerate(URLs):

    # page dict
    page_dict = {}
    #this is where all the person info will go
    page_sections = []
    # the stored URLs are used as-is
    url = page_url

    print('==============================================')
    print("Record Number: " + str(count))
    print("Case URL: " + url)

    # request the html
    try:
        page = requests.get(url, timeout = 10)
    except requests.exceptions.Timeout:
        print("Timeout occurred")
        # nothing was downloaded; skip instead of re-parsing the previous page
        continue

    # structure the page content for parsing
    soup = bs(page.content, 'html.parser')

    # First we have to pull out the content area
    content_area = soup.find('main' , {"property" : "mainContentOfPage"})

    # Reset per page so a failure below cannot reuse the previous page's section
    sections = None
    try:
        # the case reference number
        _case_ref = content_area.find('h1')
        page_dict['CaseRef'] = " ".join(_case_ref.text.split())

        # the main section
        sections = content_area.section

        # the description
        desc = sections.div.p
        page_dict['CaseDesc'] = desc.text.strip()

        # the category
        case_type = sections.h2
        page_dict['CaseType'] = " ".join(case_type.text.split())
    except AttributeError:
        # a missing element shows up as attribute access on None
        print('page base info collection error')

    if sections is None or 'CaseRef' not in page_dict:
        # without a case reference there is no key to store the record under,
        # and without the main section there is nothing left to parse
        continue

    page_dict["CaseURL"] = url

    # find all the images in the persons section
    imgs_list = []
    for image in sections.find_all('img'):
        image_src = image.get('src')
        # skip tags without a source and the "no photo" placeholder
        if not image_src or re.search(r"noPhoto\.png", image_src):
            continue
        # find the image ID; the whole match ("id=..." to the end) is kept
        img_id = re.search(r"id=(\d+).*", image_src)
        if img_id is None:
            # one unparsable source must not discard the other images
            continue
        imgs_list.append("https://www.services.rcmp-grc.gc.ca/missing-disparus/showImage?"+img_id.group())
    if not imgs_list:
        print("no images found")
    # add to the main dict
    page_dict['PageImages'] = imgs_list

    # NOTE: 'Missing' and unidentified case types are currently handled the same way.

    # get the first section with all the persons
    persons_section = sections.section

    if persons_section is None:
        print('no persons section found')
    else:
        # how many people are we looking through
        persons_names = persons_section.find_all('h3')
        # all the blocks within the section
        persons_blocks = persons_section.find_all('div',{"class":"row"})

        # loop through all the person sections to collect their data
        # assigned to their names
        for i, name_tag in enumerate(persons_names):
            print("Person(s) in Case: "+str(i+1))
            block = {} # stores the individuals info, some pages have 1+
            block['Name'] = " ".join(name_tag.text.split())

            # select the current person; a name without a matching row keeps
            # an empty info section instead of raising IndexError
            current_person = persons_blocks[i] if i < len(persons_blocks) else None

            # takes all the DL sections out and saves them as HTML strings
            if current_person is None:
                dl_sections = []
            else:
                dl_sections = [str(dl) for dl in current_person.find_all("dl")]
            block["InfoSection"] = dl_sections
            # add the block to the page sections
            page_sections.append(block)
            print(block['Name'])

    # write the section to the dict
    page_dict['PersonsData'] = page_sections
    # write it all to the main DB
    complete_db[page_dict['CaseRef']] = page_dict

# write JSON to a file
with open("Complete_DB.json", "w") as outfile:
    json.dump(complete_db, outfile)

print('======================= Done =======================')


In [None]:
# This function takes in a string of <dl> ... </dl> and converts it into a dictionary
def dl_to_dict(dl_str):
    """Convert the HTML of a <dl> element into a plain dictionary.

    Each <dt> child starts a new key, and every <dd> child that follows is
    appended to that key's list, so a term with several definitions keeps
    all of them. <dd> children seen before any <dt> are stored under ''.

    Args:
        dl_str: HTML markup containing a <dl> element.

    Returns:
        dict mapping each <dt> text to the list of its <dd> texts. Empty
        when the markup contains no <dl>.
    """
    soup = bs(dl_str, 'html.parser')

    data = defaultdict(list)
    dl = soup.find('dl')
    if dl is None:
        return {}

    k = ''
    # iterate the children of the <dl> (the original looped over an undefined name)
    for c in dl.contents:
        # text nodes between the tags are neither 'dt' nor 'dd' and are skipped
        tag_name = getattr(c, 'name', None)
        if tag_name == 'dt':
            # get_text also copes with nested markup such as <dt><strong>..</strong></dt>
            k = c.get_text(" ", strip=True)
        elif tag_name == 'dd':
            data[k].append(c.get_text(" ", strip=True))

    return dict(data)
            
    
# Try the converter on the first <dl> of the first person in the first stored case
first_case_key = list(complete_db.keys())[0]
first_person = complete_db[first_case_key]['PersonsData'][0]
data = dl_to_dict(first_person['InfoSection'][0])
print(json.dumps(data, indent=2))
