# Import packages

In [1]:
from collections import OrderedDict
from queue import Queue
import re
import threading
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests

# Crawl a page given its URL

In [2]:
class CrawlThread(threading.Thread):
    """Thread that scrapes one annual exchange-rate page and queues the result.

    The worker downloads ``url``, reads the table rows inside the element
    with class ``text12``, rewrites every date so that it starts with
    ``year`` and finally puts an ``OrderedDict`` mapping ``'YYYY/MM/DD'`` to
    the text of the rate cell on ``q``.  All of this happens while ``lock``
    is held, so workers that share a lock fetch their pages one at a time.
    """

    # Full dates look like '103/03/2' (<year>/<month>/<day>).
    _FULL_DATE = re.compile(r'(?P<year>\d+)/(?P<month>\d+)/(?P<day>\d+)')
    # Whitespace, plus the ideographic space U+3000, is stripped from dates.
    _BLANKS = re.compile(r'\s|\u3000')

    def __init__(self, url, year, q, lock):
        """Store the page ``url``, its ``year``, the result queue and the lock."""
        threading.Thread.__init__(self)
        self.url = url
        self.year = year
        self.q = q
        self.lock = lock

    def run(self):
        """Crawl the configured page, then report that this year is finished."""
        self.crawler_job(self.year, self.url, self.q, self.lock)
        print(self.year, 'finish')

    def crawler_job(self, year, url, q, lock, delay=0.5):
        """Scrape ``url`` and put the parsed rates on ``q``.

        Args:
            year: Year used as the first component of every output date.
            url: Address of the annual exchange-rate page.
            q: Queue that receives exactly one ``OrderedDict`` per call, even
                when scraping fails part-way (the dict is then partial or
                empty).
            lock: Lock held for the whole job; it is always released again.
            delay: Seconds to pause before releasing ``lock``.

        An ``AttributeError`` (for example when the expected table is missing
        from the page) is reported by printing the URL and ``'Error'``.  Any
        other exception propagates to the caller, but only after the result
        has been queued and the lock released, so sibling workers and the
        consumer of ``q`` are never left blocked.
        """
        US_exchange_rate_data = OrderedDict()
        # 'YYYY/MM' prefix taken from the most recent full date; rows that
        # only carry a day number inherit it.
        year_and_month = None
        with lock:
            try:
                annual_page_content = BeautifulSoup(requests.get(url).text, "html.parser")
                # Get the table of exchange rate (the first row is skipped)
                row_data = annual_page_content.find(class_='text12').find_all('tr')[1:]
                for row in row_data:
                    date, NTD_USD = [r.text for r in row.find_all('td')]
                    date = self._BLANKS.sub('', date)
                    if date == "":
                        continue
                    matched = self._FULL_DATE.match(date)
                    if matched is not None:
                        # Full date, e.g. '103/03/2' -> '2014/03/02'
                        year_and_month = '{}/{:02d}'.format(year, int(matched.group('month')))
                        day = int(matched.group('day'))
                    elif year_and_month is None:
                        # A day-only row before any full date has no month
                        # to attach to; report it and skip the row.
                        print(url)
                        print('Error: no month known for row {!r}'.format(date))
                        continue
                    else:
                        # Day only, e.g. '2' -> '2014/03/02'
                        day = int(date)
                    US_exchange_rate_data['{}/{:02d}'.format(year_and_month, day)] = NTD_USD
            except AttributeError:
                print(url)
                print('Error')
            finally:
                # Always deliver a result so the consumer's q.get() count
                # matches the number of workers, then pause while still
                # holding the lock.
                q.put(US_exchange_rate_data)
                time.sleep(delay)

# Start crawling

In [3]:
def get_page(url):
    """Download ``url`` and return its body parsed with ``html.parser``."""
    response = requests.get(url)
    return BeautifulSoup(response.text, "html.parser")

In [4]:
BASE_URL = 'https://www.cbc.gov.tw/'
US_exchange_rate_data = OrderedDict()

# Index page; its element with class 'list' holds one link per year.
url = 'https://www.cbc.gov.tw/lp.asp?ctNode=382&CtUnit=128&BaseDSD=7'
page_content = BeautifulSoup(requests.get(url).text, "html.parser")
annual_page_link = page_content.find(class_='list').find_all('a')

# For every link, keep its absolute URL and the year taken from the first
# four characters of its title.
annual_data_url = []
annual_data_year = []
for annual_link in annual_page_link:
    annual_data_url.append('{}{}'.format(BASE_URL, annual_link.get('href')))
    annual_data_year.append(int(annual_link.get('title')[:4]))

# One worker per annual page; all of them share the same lock and queue.
lock = threading.Lock()
q = Queue()
threads = []
for year, url in zip(annual_data_year, annual_data_url):
    t = CrawlThread(url, year, q, lock)
    threads.append(t)
    t.start()

# Wait until every worker has finished.
for thread in threads:
    thread.join()

print('Done')
# Take one result per annual link off the queue and merge it.
for _ in annual_page_link:
    US_exchange_rate_data.update(q.get())

2018 finish
2017 finish
2016 finish
2015 finish
2014 finish
2013 finish
2012 finish
2011 finish
2010 finish
2009 finish
2008 finish
https://www.cbc.gov.tw/ct.asp?xItem=2284&ctNode=382&mp=1
Error
2007 finish
2006 finish
2005 finish
2004 finish
Done


# Output CSV

In [5]:
# One row per date (the dict keys become the index), written to data.csv.
rate_frame = pd.DataFrame.from_dict(US_exchange_rate_data, orient='index')
rate_frame.to_csv('data.csv')