# Crawler

A company's annual report is one of the most important data sources for market analysis.






Therefore, we usually need to design programs to crawl the annual reports of multiple companies. 




Automatic crawlers will make our analysis easier. 





Here, taking China Evergrande (HK 3333) as an example, use Python to design a crawler program that automatically finds and downloads the PDFs of the company's annual reports from https://www.hkexnews.hk/index.htm

In [4]:
!pip install requests
!pip install parsel

import requests
from parsel import Selector
import os
import json



In [5]:
# STEP 1:
# Download and parse the table of active SEHK stocks, then index it by
# ticker code so that later steps can look a stock up directly.

r = requests.get("https://www.hkexnews.hk/ncms/script/eds/activestock_sehk_e.json", timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
stock_table = r.json()
# Each entry is a dict; its 'c' field (the ticker code, e.g. '00001') becomes the key.
new_dict = {entry.get('c'): entry for entry in stock_table}
print(new_dict)







{'00001': {'i': 1, 'c': '00001', 'n': 'CKH HOLDINGS', 's': 2363}, '00002': {'i': 2, 'c': '00002', 'n': 'CLP HOLDINGS', 's': 2373}, '00003': {'i': 3, 'c': '00003', 'n': 'HK & CHINA GAS', 's': 6177}, '00004': {'i': 4, 'c': '00004', 'n': 'WHARF HOLDINGS', 's': 16963}, '00005': {'i': 5, 'c': '00005', 'n': 'HSBC HOLDINGS', 's': 7175}, '00006': {'i': 6, 'c': '00006', 'n': 'POWER ASSETS', 's': 12036}, '00007': {'i': 7, 'c': '00007', 'n': 'HK FINANCE INV', 's': 6185}, '00008': {'i': 10, 'c': '00008', 'n': 'PCCW', 's': 11967}, '00009': {'i': 13, 'c': '00009', 'n': 'KEYNE LTD', 's': 9852}, '00010': {'i': 17, 'c': '00010', 'n': 'HANG LUNG GROUP', 's': 6108}, '00011': {'i': 18, 'c': '00011', 'n': 'HANG SENG BANK', 's': 6110}, '00012': {'i': 19, 'c': '00012', 'n': 'HENDERSON LAND', 's': 6144}, '00014': {'i': 21, 'c': '00014', 'n': 'HYSAN DEV', 's': 8304}, '00015': {'i': 22, 'c': '00015', 'n': "VANTAGE INT'L", 's': 15963}, '00016': {'i': 25, 'c': '00016', 'n': 'SHK PPT', 's': 14001}, '00017': {'i': 

In [6]:
# STEP 2
# Then we find the stockId in HKSE's System so that we can crawl directly to that stock
# we can set other params for the crawl request
# xpath to download_report



def get_stock_id(t):
  """Return the HKEX internal stock id for ticker *t*.

  The id is the 'i' field of the ticker's entry in the module-level
  ``new_dict`` table, whose keys are zero-padded ticker codes such as
  '03333'.

  Args:
    t: Ticker code. A padded string ('03333') is used as is; an int or an
      unpadded string (3333, '3333') is left-padded with zeros to five
      characters so that it matches the table's keys.

  Returns:
    The value stored under 'i' for that ticker (None if the entry has no
    'i' field).

  Raises:
    KeyError: If the ticker is not present in ``new_dict``.
  """
  # zfill is a no-op for codes that already have five or more characters.
  ticker = str(t).strip().zfill(5)
  return new_dict[ticker].get('i')



# Sanity check: look up the internal stock id of ticker 03333 (China Evergrande).
print(get_stock_id("03333"))

39454


In [8]:
# STEP 3: crawl the documents and save them to a directory



'''
POST /search/titlesearch.xhtml?lang=en HTTP/1.1
Host: www1.hkexnews.hk
Content-Type: application/x-www-form-urlencoded
Content-Length: 156

lang=EN&
category=0&
market=SEHK&
searchType=1&
documentType=-1&
t1code=40000&
t2Gcode=-2&
t2code=40100&
stockId=39454&
from=20070625&
to=20200401&
MB-Daterange=0&
title=


'''


# to get annual report, the code in post request is: t1=40000 t2Gcode =-2 t2code=40100



def _report_year_from_href(href):
  """Return the four-digit year found in a report link's file name.

  The file name is the last path segment of *href* without its extension.
  The first run of four digits starting with 19 or 20 is taken as the year,
  so the result does not depend on how long any prefix before the date is
  (a fixed slice such as [3:7] only works for a three-character prefix).
  If no such run exists, the whole file-name stem is returned so the
  caller still gets a usable, distinguishing name.
  """
  import re  # imported here because the notebook's import cell does not load it

  stem = href.split("/").pop().split(".")[0]
  found = re.search(r"(?:19|20)\d{2}", stem)
  return found.group(0) if found else stem


def crawl_annual_report(stock_ticker, date_from, date_to, directory_to_write):
  """Search HKEXnews for a stock's annual reports and download the PDFs.

  Sends the title-search POST request with the annual-report category
  codes (t1code=40000, t2Gcode=-2, t2code=40100), extracts every report
  link from the result table, and saves each linked file as
  '<stock_ticker>_<year>_Annual_Report.pdf'. When several reports map to
  the same year, later ones get a '_2', '_3', ... suffix instead of
  overwriting the earlier file.

  Args:
    stock_ticker: Ticker code, e.g. '03333'. Passed to get_stock_id and
      used as the file-name prefix.
    date_from: Start of the search range as a string such as '20070625'.
    date_to: End of the search range as a string such as '20200401'.
    directory_to_write: Existing directory to save into. If it is None or
      is not an existing directory, a folder named
      '<stock_code>_<stock_name>' (taken from the first result row) under
      the current working directory is used and created if necessary.

  Returns:
    None. Progress and problems are reported with print().

  Raises:
    KeyError: From get_stock_id when the ticker is unknown.
    requests.exceptions.RequestException: On network failures or timeouts.
  """
  if date_from is None:
    print("Please Input a Date From Parameter")
    return
  if date_to is None:
    print("Please Input a Date To Parameter")
    return

  xpath= '//*[@id="titleSearchResultPanel"]/div/div[1]/table/tbody/tr/td[4]/div[2]/a'  # DO NOT MODIFY THIS
  hkex_uri = 'https://www1.hkexnews.hk' #host uri DO NOT MODIFY THIS
  search_url = hkex_uri+'/search/titlesearch.xhtml?lang=en' #path for search
  timeout_seconds = 60  # never hang forever on an unresponsive server

  stock_id = get_stock_id(stock_ticker)

  # 1. send a https post request
  # A dict lets requests do the form encoding/escaping; the field order and
  # values match the hand-built query string this replaces.
  query_data = {
    'lang': 'EN',
    'category': '0',
    'market': 'SEHK',
    'searchType': '1',
    'documentType': '-1',
    't1code': '40000',
    't2Gcode': '-2',
    't2code': '40100',
    'stockId': str(stock_id),
    'from': str(date_from),
    'to': str(date_to),
    'MB-Daterange': '0',
    'title': '',
  }
  response = requests.post(
    search_url,
    headers={'Content-Type': 'application/x-www-form-urlencoded'},
    data=query_data,
    timeout=timeout_seconds,
  )
  if not response.ok:
    print("Search request failed with HTTP status", response.status_code)
    return

  # 2. get back content and find all pdf annual reports using XPath
  selector = Selector(text=response.text)
  links = selector.xpath(xpath)
  if not links:
    # Without this guard the table lookups below return None and crash.
    print("No annual reports found for", stock_ticker, "between", date_from, "and", date_to)
    return
  # Stock code and name as shown in the first row of the result table.
  stock_code = (selector.xpath('//*[@id="titleSearchResultPanel"]/div/div[1]/table/tbody/tr/td[2]/text()').get() or '').strip()
  stock_name = (selector.xpath('//*[@id="titleSearchResultPanel"]/div/div[1]/table/tbody/tr/td[3]/text()').get() or '').strip()

  # 3. send a get request for each annual report and download
  # Fall back to a per-stock folder when no usable directory was supplied.
  if directory_to_write is None or not os.path.isdir(directory_to_write):
    directory_to_write = os.path.join(os.getcwd(), stock_code + '_' + stock_name)
    try:
      # exist_ok: re-running the crawler reuses the folder silently.
      os.makedirs(directory_to_write, exist_ok=True)
    except OSError as error:
      print("Could not create directory", directory_to_write, error)
      return

  used_names = set()  # file names already produced during this run
  for link in links:
    path = link.xpath('@href').get()
    if not path:
      continue
    path = path.strip()

    base_name = str(stock_ticker) + "_" + _report_year_from_href(path) + "_Annual_Report"
    name_year = base_name
    duplicate_no = 2
    while name_year in used_names:
      # Second and later reports for the same year get a numeric suffix.
      name_year = base_name + "_" + str(duplicate_no)
      duplicate_no += 1
    used_names.add(name_year)
    print(path, name_year)

    cur_report_url = hkex_uri + path
    get_pdf = requests.get(cur_report_url, timeout=timeout_seconds)
    if not get_pdf.ok:
      # Do not save an HTML error page under a .pdf name.
      print("Download failed with HTTP status", get_pdf.status_code, "for", cur_report_url)
      continue

    write_path = os.path.join(directory_to_write, name_year + '.pdf')
    try:
      with open(write_path, 'wb') as pdf_file:
        pdf_file.write(get_pdf.content)
    except OSError as error:
      print("Could not write", write_path, error)


  



In [None]:
# Let's test
# Both dates are required by crawl_annual_report (it prints a message and
# returns if either is None) and are given as strings such as "20070625".
date_from = "20070625"
date_to = "20200401"


# NOTE THAT ALL PARAMS NEED TO BE STRING
# ticker, date_from, date_to, directory (None -> per-stock folder in the current working directory)
# Pass the variables defined above so that editing them actually changes the search range.
crawl_annual_report("03333", date_from, date_to, None)
