In [16]:
# Imports
import string
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [17]:
# Initialize the webdriver and specify browser (Chrome).
# `driver` is the single browser session used by the scraping cells below.
options = Options()
# NOTE(review): per the Selenium docs, the 'none' page load strategy makes
# driver.get() return without waiting for the page to finish loading, so
# code that reads elements right after a get() may see an incomplete page.
# Confirm this is intentional.
options.page_load_strategy = 'none'
driver = webdriver.Chrome(options=options)

In [2]:
# SEDAR separates company pages by NC and lowercase alphabet
# urls is a list that contains these characters
urls = ["nc"] + list(string.ascii_lowercase)

# Parallel lists: company[k] is the link text and coUrls[k] the profile URL
# of the k-th profile link found.
company = []
coUrls = []

# Visit each listing page and grab all the urls to company profiles
for page_key in urls:

    # Visit the page of companies in alphabetical order
    url = "https://www.sedar.com/issuers/company_issuers_{}_en.htm".format(page_key)
    driver.get(url)

    # Wait 3 seconds BEFORE reading the page. The driver is configured with
    # page load strategy 'none', so get() does not wait for the document to
    # finish loading; the pause also spaces out successive requests.
    time.sleep(3)

    # Find all the links on the page.
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which is
    # deprecated in Selenium 4 and removed in later 4.x releases.
    coUrl = driver.find_elements(By.XPATH, "//a[@href]")

    for link in coUrl:

        # Read the href once (each call is a round trip to the browser).
        href = link.get_attribute("href")

        # Keep only links that point at a company profile.
        if href and "DisplayProfile" in href:
            # Read the name before appending anything so that a failed
            # lookup cannot leave the two lists with different lengths.
            name = link.text
            company.append(name)
            coUrls.append(href)

In [None]:
# Pair every company name with its profile link, one (name, link) record per row
company_records = list(zip(company, coUrls))
column_names = ["Company", "SEDAR_url"]
df = pd.DataFrame(company_records, columns=column_names)

# Write the table to disk without the row index
df.to_csv("sedar_companies.csv", index=False)

In [18]:
# Load the full company list saved earlier
companies_all = pd.read_csv("sedar_companies.csv")

# The list is processed in N_CHUNKS pieces; CHUNK_INDEX selects the piece
# handled by this run (0 = first).
N_CHUNKS = 4
CHUNK_INDEX = 0

# Split the row positions instead of the frame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which newer pandas deprecates.
# The selected rows are the same as np.array_split(companies_all, N_CHUNKS).
chunk_positions = np.array_split(np.arange(len(companies_all)), N_CHUNKS)[CHUNK_INDEX]

# .copy() makes df an independent frame, so adding columns to it later does
# not write into a slice of companies_all.
df = companies_all.iloc[chunk_positions].copy()

# Check
df.head()

Unnamed: 0,Company,SEDAR_url
0,01 Communique Laboratory Inc.,https://www.sedar.com/DisplayProfile.do?lang=E...
1,0373849 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...
2,0694758 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...
3,0757138 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...
4,0799714 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...


In [19]:
"""Scrape SEDAR Company Profiles"""

# Empty lists to store info: one entry per company, in the row order of df
mailAddress = []
contactName = []
bizEmail = []
telNum = []
faxNum = []
dateForm = []
jurisdiction = []
industryClass = []
cusipNum = []
finYearEnd = []
hqAddress = []
principalRegulator = []
shortFormProspectusIssuer = []
reportJurisdiction = []
stockExchange = []
stockSym = []
auditor = []
generalPartner = []
transferAgent = []
assets = []

# XPath of a single cell in the profile table, addressed by row and column.
PROFILE_CELL_XPATH = '/html/body/div[3]/center/table/tbody/tr[{row}]/td[{col}]'

# Where each field is read from: (destination list, table row, table column).
PROFILE_CELLS = [
    (mailAddress, 2, 2),
    (contactName, 3, 2),
    (bizEmail, 4, 2),
    (telNum, 5, 2),
    (faxNum, 6, 2),
    (dateForm, 7, 2),
    (jurisdiction, 8, 2),
    (industryClass, 9, 2),
    (cusipNum, 10, 2),
    (finYearEnd, 11, 2),
    (hqAddress, 2, 4),
    (principalRegulator, 3, 4),
    (shortFormProspectusIssuer, 4, 4),
    (reportJurisdiction, 5, 4),
    (stockExchange, 6, 4),
    (stockSym, 7, 4),
    (auditor, 8, 4),
    (generalPartner, 9, 4),
    (transferAgent, 10, 4),
    (assets, 11, 4),
]


def profile_cell_text(driver, row, col):
    """Return the text of one profile-table cell on the current page.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session that currently shows a company profile page.
    row, col : int
        1-based ``tr`` / ``td`` positions substituted into PROFILE_CELL_XPATH.

    Returns
    -------
    str
        Text of the first matching element, or "" when the cell is absent.
    """
    # find_elements (plural) returns an empty list instead of raising when
    # nothing matches. The By.XPATH form replaces find_elements_by_xpath,
    # which is deprecated in Selenium 4 and removed in later 4.x releases.
    matches = driver.find_elements(By.XPATH, PROFILE_CELL_XPATH.format(row=row, col=col))
    return matches[0].text if matches else ""


# Visit each company profile page.
# Iterate over the column values rather than looking rows up with a 0-based
# counter: df['SEDAR_url'][i] is a label lookup and raises KeyError as soon
# as the index of df does not start at 0.
for page_number, profile_url in enumerate(df['SEDAR_url']):

    # Go to page
    driver.get(profile_url)

    # For the first site, wait 15s in case there's a captcha
    if page_number == 0:
        time.sleep(15)

    # Wait 10 seconds before reading the page. The driver uses page load
    # strategy 'none', so get() does not wait for the document to finish
    # loading; the pause also spaces out successive requests.
    time.sleep(10)

    # Read every field first and only then append, so that an exception in
    # the middle of a page cannot leave the lists with different lengths.
    page_values = [profile_cell_text(driver, row, col) for _, row, col in PROFILE_CELLS]
    for (destination, _, _), value in zip(PROFILE_CELLS, page_values):
        destination.append(value)

WebDriverException: Message: chrome not reachable
  (Session info: chrome=95.0.4638.54)


In [22]:
# Map each new column name to the list of scraped values that fills it.
# The order here is the order in which the columns are added to df.
scraped_columns = {
    'mailAddress': mailAddress,
    'contactName': contactName,
    'bizEmail': bizEmail,
    'telNum': telNum,
    'faxNum': faxNum,
    'dateForm': dateForm,
    'jurisdiction': jurisdiction,
    'industryClass': industryClass,
    'cusipNum': cusipNum,
    'finYearEnd': finYearEnd,
    'hqAddress': hqAddress,
    'principalRegulator': principalRegulator,
    'shortFormProspectusIssuer': shortFormProspectusIssuer,
    'reportJurisdiction': reportJurisdiction,
    'stockExchange': stockExchange,
    'stockSym': stockSym,
    'auditor': auditor,
    'generalPartner': generalPartner,
    'transferAgent': transferAgent,
    'assets': assets,
}

# Add the columns to df one at a time
for column_name, column_values in scraped_columns.items():
    df[column_name] = column_values

# Write the enriched table to disk without the row index
df.to_csv("sedar_companies_1.csv", index=False)

# Show the first rows
df.head()

Unnamed: 0,Company,SEDAR_url,mailAddress,contactName,bizEmail,telNum,faxNum,dateForm,jurisdiction,industryClass,...,hqAddress,principalRegulator,shortFormProspectusIssuer,reportJurisdiction,stockExchange,stockSym,auditor,generalPartner,transferAgent,assets
0,01 Communique Laboratory Inc.,https://www.sedar.com/DisplayProfile.do?lang=E...,"789 DON MILLS ROAD\nSUITE 700\nTORONTO, ONTARI...",BRIAN STRINGER,investorrelations@01com.com,905 795-2888,905 795-0101,Oct 7 1992,ONTARIO,industrial products - technology - software,...,"789 DON MILLS ROAD\nSUITE 700\nTORONTO, ONTARI...",Ontario,No,All provinces and territories of Canada,"TSX Venture, Over-the-counter markets",ONE,McGovern Hurley,,EQUITY TRANSFER SERVICES INC.,"$5,000,001 to $25,000,000"
1,0373849 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...,"Suite 108 - 4664 Lougheed Highway\nBurnaby, Br...",Jimmy Mah,JMah@procongroup.net,604 291-8292,604 291-8082,Oct 4 1989,British Columbia,metals and minerals - mining,...,"Suite108 - 4664 Lougheed Highway\nBurnaby, Bri...",British Columbia,No,Ceased Reporting,,,PriceWaterhouseCoopers LLP,,Computershare Investor Services Inc.,"$5,000,001 to $25,000,000"
2,0694758 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...,"Suite 312 - 837 W. Hastings Street\nVancouver,...",Suzanne Gradl,sgradl@grossogroup.com,604 687-1828,604 687-1858,May 11 2004,BC,metals and minerals - mining,...,"Suite 312 - 837 W. Hastings Street\nVancouver,...",British Columbia,No,Not Applicable,,,PricewaterhouseCoopers LLP,,Computershare Investor Services Inc.,"$5,000,001 to $25,000,000"
3,0757138 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...,"#102 - 17957 - 55th Avenue\nSurrey, British Co...",Laura Williams,,604 574-7510,604 574-7520,May 9 2006,British Columbia,other,...,"#102 - 17957 - 55th Avenue\nSurrey, British Co...",No filing is or has previously been made under...,No,Ceased Reporting,,,KPMG,,,"$5,000,001 to $25,000,000"
4,0799714 B.C. Ltd.,https://www.sedar.com/DisplayProfile.do?lang=E...,"1500 - 625 Howe Street\nVancouver, British Col...","Cori Compton, Manager, Corporate Governance",ccompton@panamericansilver.com,604 684-1175,604 684-0147,Feb 22 2019,British Columbia,metals and minerals - mining,...,"1500 - 625 Howe Street\nVancouver, British Col...",British Columbia,No,Ceased Reporting,,,Deloitte LLP,,Computershare Investor Services Inc.,"Over $1,000,000,000"
