## Missing Persons DB Webscrape

https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf

Search with no criteria will bring up all results.

The links to the missing persons pages, can be appended to https://www.services.rcmp-grc.gc.ca/

### ========== Code

#### Import packages

In [1]:
import time
import pandas as pd
import re
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup as bs
import requests
import json
import csv
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from os import walk

#### link to search results + install selenium if it is not already installed

In [None]:
link = 'https://www.services.rcmp-grc.gc.ca/missing-disparus/search-recherche.jsf'
# Get a chrome driver if there isn't one locally
service = Service(ChromeDriverManager().install())

#### loop through all the search result pages and collect the URLs

In [None]:
browser = webdriver.Chrome(service=service) 
browser.get(link)
time.sleep(2)

# find search and click it to reach data
try:
    search = browser.find_element(By.NAME, 'searchForm:j_idt158')
    search.click()
except:
    print('could not find Search')

time.sleep(3)

not_last = True
# to store all the URLs
URLs = set()

# go through each page and get all URLs    
while not_last:
    time.sleep(2) # wait more just in case
    page = browser.page_source
    pageSoup = bs(page, 'html.parser')
    
    # get all the links on the page and add them to array
    pageURLs = pageSoup.find_all('a',class_="wet-newwindow")
    
    print("Collecting URLs...")
    
    # take each link on the page and add if not a dupe
    for link in pageURLs:
        href = link.get('href')
        if href not in URLs:
            URLs.add("https://www.services.rcmp-grc.gc.ca" + href)

    print("Page Complete!")
    time.sleep(2) # wait a little
    
    # are we on the last page
    try:
        # click the next button at the bottom of the page
        next_page = browser.find_element(By.XPATH, '/html/body/main/form/div[33]/ul/li[83]/a')
        print('Found next button to press.')
        next_page.click()
        time.sleep(2) # wait for next page to load
    except:
        # should not have a next button on the last page
        print('last page or no next button found!')
        not_last = False
    
# the final list
print("================================== END ==================================")

# write progress to csv
df = pd.DataFrame(URLs, columns=["URL"])
df.to_csv('list.csv', index=False)

browser.quit()

## Collect all the data from all the detailed case pages

In [None]:
base_url = r'https://www.services.rcmp-grc.gc.ca'

#### To Avoid Running the URL Collector Again - Run Code Below

In [2]:
with open('list.csv') as f:
    allLines = f.readlines()
    TempURLs = list(allLines)
    # remove the column header
    TempURLs = TempURLs[1:]
    f.close()

# clean the elements  
URLs = []
for link in TempURLs:
    URLs.append(link.strip())
    
print(URLs[:10])

['https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2012020064&id=25', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2014002472&id=4', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2018041450&id=10', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2016061097&id=22', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2020021899&id=29', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2014002350&id=3', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2006021899&id=3', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2013000126&id=22', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2014004448&id=24', 'https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2019054192&id=0']


#### Function to Turn DL sections into dict

In [7]:
# This function takes in a string of <dl> ... </dl> and converts it into a dictionary
def dl_to_dict(dl_str):
    soup = bs(dl_str, 'html.parser')
    dl = soup.find('dl')
    if not dl:
        return {}
    
    data = defaultdict(list)
    k = ''
    for c in dl.contents:
        is_real = bool(str(c).strip())  # real element, not whitespace
        if not is_real:
            continue

        if c.name == 'dt':
            k = c.contents[0].strip()
        elif c.name == 'dd':
            data[k].append(c.contents[0].strip())
            
    return dict(data)
            
# For Testing the Function

#data = dl_to_dict(complete_db[list(complete_db.keys())[0]]['PersonsData'][0]['InfoSection'][0])
#print(json.dumps(data, indent=2))


#### Looping through and scraping the data into a json file

In [8]:
def scrape_database(urls_list: list, iteration: int):

    # complete file
    complete_db = {}

    # loop through all the URLs
    for count, page_url in enumerate(urls_list):
        
        # page dict
        page_dict = {}
        #this is where all the person info will go
        page_sections = []
        # make the full URL
        url = page_url
        
        print('==============================================')
        print("Record Number: " + str(count))
        print("Case URL: " + url)
            
        # request the html
        try:
            page = requests.get(url, timeout = 10)
        except requests.exceptions.Timeout:
            print("Timeout occurred")
        
        # structure the page content for parsing
        soup = bs(page.content, 'html.parser')
        
        #print(soup)
        
        # First we have to pull out the content area
        content_area = soup.find('main' , {"property" : "mainContentOfPage"})
        
        try:
            # the case reference number
            _case_ref = content_area.find('h1')
            page_dict['CaseRef'] = " ".join(_case_ref.text.split())
            
            # the main section
            sections = content_area.section
            
            # the description
            desc = sections.div.p
            page_dict['CaseDesc'] = desc.text.strip()
            
            # the category
            case_type = sections.h2
            page_dict['CaseType'] = " ".join(case_type.text.split())
        except:
            print('page base info collection error')
        
        page_dict["CaseURL"] = url
        
        
        # find all the images in the persons section
        try:
            # the image link
            images = sections.find_all('img')
            imgs_list = []
            for image in images:
                image_src = image['src']
                # check if this matches the no photo image
                no_photo = re.search("noPhoto\.png", image_src)
                if not no_photo:
                    # find the iamge ID
                    img_id = re.search("id=(\d+).*", image_src)
                    imgs_list.append("https://www.services.rcmp-grc.gc.ca/missing-disparus/showImage?"+img_id.group())
                    # add the images section    
            # add to the main dict
            page_dict['PageImages'] = imgs_list
        except:
            print("no images found")
        
        """
        # if we need to treat the page types differently
        if page_dict['CaseType'] == 'Missing':
        """
        
        # get the first section with all the persons
        persons_section = sections.section
        
        # how many people are we looking through
        persons_names = persons_section.find_all('h3')
        num_persons = len(persons_names)
        # all the blocks within the section
        persons_blocks = persons_section.find_all('div',{"class":"row"})
        
        # loop through all the person sections to collect their data
        # assigned to their names
        for i in range(num_persons):
            print("Person(s) in Case: "+str(i+1))
            block = {} # stores the individuals info, some pages have 1+
            block['Name'] = " ".join(persons_names[i].text.split())
            
            # select the current persion
            current_person = persons_blocks[i]
            
            # array to save all the individual dl sections
            dl_sections = []
            
            # takes all the DL sections out and saves them
            for dl in current_person.find_all("dl"):
                # call the dl formatting function
                dl_section_dict = dl_to_dict(str(dl))            
                dl_sections.append(dl_section_dict)
                
            # append the formatted sectins array to the block
            block["InfoSection"] = dl_sections          
            
            # add the block to the page sections
            page_sections.append(block)
            print(block['Name'])
        
        # write the section to the dict
        page_dict['PersonsData'] = page_sections
        # write it all to the main DB
        complete_db[page_dict['CaseRef']] = page_dict
        
    # fine name to write to
    filename = "data\RCMP_Data_Part_" + str(iteration) + ".json"
    
    # write JSON to a file        
    with open(filename, "w") as outfile:        
        outfile.write(json.dumps(complete_db, indent = 2))
            
    print('======================= Done Part '+str(iteration)+' =======================')


#### Pageination of the data conversion

In [9]:
# number of cases in each subfile
page_size = 100

# for testing
#test_urls = URLs[:250]
#paginated_list = [test_urls[i:i+page_size] for i in range(0, len(test_urls), page_size)]

# split the list into smaller parts
paginated_list = [URLs[i:i+page_size] for i in range(0, len(URLs), page_size)]

# loop through the divided list and output files
for count, list_section in enumerate(paginated_list):
    scrape_database(list_section, count)

Record Number: 0
Case URL: https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2012020064&id=25
Person(s) in Case: 1
MURDOCK, Irma Lynn
Record Number: 1
Case URL: https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2014002472&id=4
Person(s) in Case: 1
ROGERS, Stacey Lynn
Record Number: 2
Case URL: https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2018041450&id=10
Person(s) in Case: 1
DAOUD, Fakhri
Record Number: 3
Case URL: https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2016061097&id=22
Person(s) in Case: 1
Male, 15 - 30
Record Number: 4
Case URL: https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2020021899&id=29
Person(s) in Case: 1
HALFE, Nowell Leonard
Record Number: 5
Case URL: https://www.services.rcmp-grc.gc.ca//missing-disparus/case-dossier.jsf?case=2014002350&id=3
Person(s) in Case: 1
BEAUDOIN, Joseph Paul Pierre
Record Number: 6
Case URL: https://www

IndexError: list index out of range

#### Convert Page Sections into CSV Database

In [11]:
# get all the json files in the folder
all_files = next(walk("data\\"), (None, None, []))[2]
json_files = []

# get only the json files
for file in all_files:
    if file[-4:] == "json":
        json_files.append(file)

# create a df to store data
main_df = pd.DataFrame(columns=["CaseRef","CaseDesc","CaseType","CaseURL","PageImages","PersonsData"])

# loop through and convert to csv
for file in json_files:
    path = 'data/' + file
    temp_df = pd.read_json(path)
    temp_df_transposed = temp_df.transpose()    
    main_df = pd.concat([main_df, temp_df_transposed])
    
main_df.to_csv("data\json_converted.csv")