In [None]:
# Imports
import glob
import os
import string
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [None]:
# Initialize the webdriver and specify browser
options = Options()

# Use 'eager' so driver.get() blocks until the DOM has been parsed
# (DOMContentLoaded) without waiting for images and other sub-resources.
# The previous 'none' strategy let driver.get() return before the new
# document existed, so element lookups issued right after a navigation
# could run against an empty or previous page.
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)

In [None]:
# SEDAR separates company pages by NC and lowercase alphabet
# urls is a list that contains these characters
urls = ["nc"] + list(string.ascii_lowercase)

# Parallel lists: company[k] is the link text and coUrls[k] the profile URL
company = []
coUrls = []

# Visit each index page and grab all the urls to company profiles
for page_key in urls:

    # Visit the page of companies in alphabetical order
    url = "https://www.sedar.com/issuers/company_issuers_{}_en.htm".format(page_key)
    driver.get(url)

    # Every anchor on the page that carries an href attribute.
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
    # was removed from Selenium 4.3 onwards.
    coUrl = driver.find_elements(By.XPATH, "//a[@href]")

    for anchor in coUrl:

        # Read the attribute once: each get_attribute call is a round trip
        # to the browser, and it may return None.
        href = anchor.get_attribute("href")

        # Only links to a company profile are kept; name and url are
        # appended together so the two lists stay aligned.
        if href and "DisplayProfile" in href:
            company.append(anchor.text)
            coUrls.append(href)

    # Wait 3 seconds before requesting the next index page
    time.sleep(3)

In [None]:
# Pair every company name with its profile url (one tuple per company)
# and turn the pairs into a two-column data frame
company_url_pairs = list(zip(company, coUrls))
df = pd.DataFrame(company_url_pairs, columns=["Company", "SEDAR_url"])

# Write the frame to disk without the index column
df.to_csv("sedar_companies.csv", index=False)

In [None]:
# Load the full company list written by the url-collection step
companies_all = pd.read_csv("sedar_companies.csv")

# Split the data into 4 and keep the second part (position 1).
# The row positions are split instead of the frame itself: the selected rows
# are the same, but this avoids the deprecation warning recent pandas
# versions emit when np.array_split is handed a DataFrame.
quarter_rows = np.array_split(np.arange(len(companies_all)), 4)[1]
df = companies_all.iloc[quarter_rows]

# Make sure the "company" output directory exists
os.makedirs('company', exist_ok=True)

# Count number of files in /company (used later to number new output files)
numFiles = len(os.listdir('./company'))

# Read every CSV already written to ./company. All files are read first and
# combined once, rather than re-concatenating and re-filtering per file.
scraped_frames = [pd.read_csv(path, low_memory=False)
                  for path in glob.glob('./company/*.csv')]

scrapedCompanies = pd.DataFrame()
if scraped_frames:
    scrapedCompanies = pd.concat(scraped_frames, axis=0)

    # Remove already scraped companies (matched on the Company name)
    df = df[~df['Company'].isin(scrapedCompanies['Company'])]

# Reset index
# The other quarters do not start at index 0, and filtering leaves gaps
df = df.reset_index(drop=True)

# Check
df.head()

In [None]:
"""Scrape SEDAR Company Profiles"""

# XPath of the form input whose presence is treated as "a captcha is showing"
CAPTCHA_XPATH = '/html/body/div[1]/div[2]/div[2]/form/center/input'

# XPath template of one cell of the profile table: tr[row] / td[cell]
PROFILE_CELL_XPATH = '/html/body/div[3]/center/table/tbody/tr[{}]/td[{}]'

# Output column -> (table row, table cell) holding its value.
# td[2] is read for rows 2-11 first, then td[4] for rows 2-11; the order
# here is also the column order of the CSV files.
PROFILE_CELLS = {
    'mailAddress': (2, 2),
    'contactName': (3, 2),
    'bizEmail': (4, 2),
    'telNum': (5, 2),
    'faxNum': (6, 2),
    'dateForm': (7, 2),
    'jurisdiction': (8, 2),
    'industryClass': (9, 2),
    'cusipNum': (10, 2),
    'finYearEnd': (11, 2),
    'hqAddress': (2, 4),
    'principalRegulator': (3, 4),
    'shortFormProspectusIssuer': (4, 4),
    'reportJurisdiction': (5, 4),
    'stockExchange': (6, 4),
    'stockSym': (7, 4),
    'auditor': (8, 4),
    'generalPartner': (9, 4),
    'transferAgent': (10, 4),
    'assets': (11, 4),
}


def _cell_text(driver, xpath):
    """Return the text of the first element matching `xpath`, or "" if none.

    Uses find_elements (plural) so a missing cell yields an empty list
    instead of raising.
    """
    matches = driver.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else ""


# Running per-field lists of everything scraped in this session. They are kept
# under their original names, but are no longer used to build the CSV rows.
scraped = {name: [] for name in PROFILE_CELLS}
(mailAddress, contactName, bizEmail, telNum, faxNum, dateForm, jurisdiction,
 industryClass, cusipNum, finYearEnd, hqAddress, principalRegulator,
 shortFormProspectusIssuer, reportJurisdiction, stockExchange, stockSym,
 auditor, generalPartner, transferAgent, assets) = scraped.values()

# Visit each company profile page
for i in range(len(df.index)):

    # Go to page (positional lookup, so it does not depend on index labels)
    driver.get(df['SEDAR_url'].iloc[i])

    # If there's a captcha
    if driver.find_elements(By.XPATH, CAPTCHA_XPATH):

        # Print a warning
        print("Captcha test needs to be completed")

        # Wait 15 seconds
        # NOTE(review): scraping continues after this single wait even if the
        # captcha is still showing, in which case the fields are saved blank.
        time.sleep(15)

    # Read every profile field for THIS company only
    record = {}
    for name, (row, cell) in PROFILE_CELLS.items():
        record[name] = _cell_text(driver, PROFILE_CELL_XPATH.format(row, cell))
        scraped[name].append(record[name])

    # One-row frame for this company. Previously the frame was built from the
    # accumulated lists, so every file contained all rows scraped so far with
    # Company/SEDAR_url empty on all but the last one. Giving the row the same
    # index label as df.iloc[[i]] makes the horizontal concat line up.
    r = pd.DataFrame([record], columns=list(PROFILE_CELLS), index=df.index[[i]])

    # Concat horizontally with current row
    r = pd.concat([df.iloc[[i]], r], axis=1)

    # Save to CSV
    r.to_csv("./company/{}_2.csv".format(i+1+numFiles), index=False)

    # Wait 15 seconds
    time.sleep(15)