# Import Packages

In [8]:
import time
import random
import os
import requests
from bs4 import BeautifulSoup

# THSRC Image Downloader

In [9]:
class THSRCImageDownloader:
    """Locate the ``captcha-img`` image on a page and save it to disk.

    The page at ``url`` is fetched with browser-like request headers, the
    ``<img class="captcha-img">`` tag is located, and the image it points to
    is written to ``download_path`` as ``<index>.jpg``.
    """

    def __init__(self, url, download_path):
        """Store the settings and make sure the output directory exists.

        Args:
            url: Address of the page that embeds the captcha image. It is also
                used as the prefix when building the image link.
            download_path: Directory that receives the downloaded images.
                Missing parent directories are created as well.
        """
        self.url = url
        self.download_path = download_path
        # Headers sent with every request (page and image alike).
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "irs.thsrc.com.tw",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
        }

        # makedirs(exist_ok=True) handles nested paths and avoids the
        # check-then-create race of os.path.exists() followed by os.mkdir().
        os.makedirs(download_path, exist_ok=True)

    def get_validation_code_link(self):
        """Fetch the page and build the link of its captcha image.

        Every ``captcha-img`` link found is printed; the last one is returned.

        Returns:
            The image link as a string, or an empty list (the falsy sentinel
            callers already receive for a non-OK response) when the page
            could not be loaded or contains no usable captcha image.

        Raises:
            requests.RequestException: On connection errors or when the page
                does not answer within 5 seconds.
        """
        r = requests.get(self.url, timeout=5, headers=self.headers)
        if r.status_code != requests.codes.ok:
            print("CAN NOT LOADING")
            return []

        soup = BeautifulSoup(r.text, 'html.parser')
        img_tags = soup.find_all('img', attrs={"class": "captcha-img"})

        # Start from the failure sentinel so a page without a captcha image
        # yields [] instead of an UnboundLocalError.
        link_str = []
        for tag in img_tags:
            src = tag.get("src")
            if not src:
                # An <img> without a src attribute cannot be downloaded.
                continue
            # NOTE(review): the slice drops a fixed 6-character prefix from
            # src before appending it to the page URL; confirm the prefix
            # still has that length if the site layout changes.
            link_str = self.url + src[6:]
            print(link_str)

        if not link_str:
            print("CAPTCHA IMAGE NOT FOUND")
        return link_str

    def download_img(self, img_index, link):
        """Download ``link`` and save it as ``<download_path>/<img_index>.jpg``.

        Args:
            img_index: Value used as the file name (without extension).
            link: Address of the image. A falsy value (such as the ``[]``
                returned by ``get_validation_code_link`` on failure) is
                skipped instead of being requested.

        Returns:
            True when the file was written, False when nothing was saved.

        Raises:
            requests.RequestException: On connection errors or when the
                server does not answer within 5 seconds.
        """
        if not link:
            print('img_index:', img_index, 'SKIPPED (no link)')
            return False

        # Same timeout as the page request so a stalled server cannot hang
        # the caller indefinitely.
        response = requests.get(link, headers=self.headers, timeout=5)
        if response.status_code != requests.codes.ok:
            # Do not store an error page under a .jpg name.
            print('img_index:', img_index, 'FAILED, status', response.status_code)
            return False

        file_path = os.path.join(self.download_path, str(img_index) + '.jpg')
        with open(file_path, 'wb') as pic_out:
            pic_out.write(response.content)

        print('img_index:', img_index, 'OK!')
        return True

# Batch Downloader

In [10]:
class THSRCImageBatchDownloader(THSRCImageDownloader):
    """Download a numbered series of captcha images, one page fetch per image."""

    def __init__(self, url, download_path, img_num):
        """Initialise the downloader.

        Args:
            url: Address of the page that embeds the captcha image.
            download_path: Directory that receives the downloaded images.
            img_num: How many images to attempt; files are numbered from 1
                to ``img_num``.
        """
        super().__init__(url, download_path)
        self.img_num = img_num

    def download_images(self):
        """Fetch ``img_num`` captcha images, pausing 1-2 seconds before each.

        An iteration that yields no link or hits a network error is reported
        and skipped, so one failure does not abort the whole batch. The index
        of a skipped iteration is not reused.
        """
        for i in range(1, self.img_num + 1):
            # Random pause between requests.
            time.sleep(random.randint(1, 2))
            try:
                link = self.get_validation_code_link()
                if not link:
                    # get_validation_code_link returns a falsy value when
                    # the page could not be loaded; it is not a valid URL.
                    print('img_index:', i, 'SKIPPED')
                    continue
                self.download_img(i, link)
            except requests.RequestException as exc:
                # Timeouts and connection errors: move on to the next image.
                print('img_index:', i, 'FAILED:', exc)

# Run

In [11]:
# Page that embeds the captcha image; also used as the prefix of each image link.
url = 'http://irs.thsrc.com.tw/IMINT/'
# Directory that receives the numbered .jpg files.
# NOTE(review): "raww_images" may be a typo for "raw_images" - confirm before renaming,
# since other code may read from this exact path.
download_path = "./raww_images"
# Number of images to request. Each one is preceded by a 1-2 second pause,
# so a full run takes a long time.
img_num = 5000

# Build the downloader (this creates the output directory) and start the batch.
batch_downloader = THSRCImageBatchDownloader(url, download_path, img_num)
batch_downloader.download_images()

http://irs.thsrc.com.tw/IMINT//captImg/xyy4QOLwQw.jpg
img_index: 1 OK!
http://irs.thsrc.com.tw/IMINT//captImg/Qy6UrbmvCy.jpg
img_index: 2 OK!
http://irs.thsrc.com.tw/IMINT//captImg/JUo5dU0x3Q.jpg
img_index: 3 OK!
http://irs.thsrc.com.tw/IMINT//captImg/XSb4n4CBw4.jpg
img_index: 4 OK!
http://irs.thsrc.com.tw/IMINT//captImg/suLSK4aJCz.jpg
img_index: 5 OK!
http://irs.thsrc.com.tw/IMINT//captImg/A5CgivyRmb.jpg
img_index: 6 OK!
http://irs.thsrc.com.tw/IMINT//captImg/7pTf1OfXvy.jpg
img_index: 7 OK!
http://irs.thsrc.com.tw/IMINT//captImg/LpyM0K7dGV.jpg
img_index: 8 OK!
http://irs.thsrc.com.tw/IMINT//captImg/o70HD39BAj.jpg
img_index: 9 OK!
http://irs.thsrc.com.tw/IMINT//captImg/XXmTnhw1gz.jpg
img_index: 10 OK!
http://irs.thsrc.com.tw/IMINT//captImg/kbjyOTXFx0.jpg
img_index: 11 OK!
http://irs.thsrc.com.tw/IMINT//captImg/PfRh0cKP2R.jpg
img_index: 12 OK!
http://irs.thsrc.com.tw/IMINT//captImg/cgWtcxQJ0i.jpg
img_index: 13 OK!
http://irs.thsrc.com.tw/IMINT//captImg/hm4J3cy0Vf.jpg
img_index: 14 OK!


KeyboardInterrupt: 