In [1]:
import matplotlib.pyplot as plt
import os
import PIL
import numpy
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

# Module-level classifier used by predictKaptcha() (it calls clf.predict);
# presumably a fitted scikit-learn estimator that recognises single digits — TODO confirm.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so only load a captcha.pkl that comes from a trusted source.
clf = joblib.load('captcha.pkl')

import cv2


def saveKaptcha(image, dest, digit_height=24):
    """
    Split the digits of a captcha image apart and save each one as an image file.

    The image is converted to grayscale and binarised, its contours are
    located, and every contour whose bounding box is exactly ``digit_height``
    pixels tall is treated as a digit. The digits are written left to right as
    ``0.jpg``, ``1.jpg``, ... inside ``dest``. Each crop is rendered with
    matplotlib (``imshow`` followed by ``savefig`` at 100 dpi) rather than
    written as raw pixels.

    Parameters
    ----------
    image : str
        Path of the captcha image to split.
    dest : str
        Directory that receives the digit images; it is created when missing.
    digit_height : int, optional
        Bounding-box height, in pixels, that identifies a digit contour.

    Returns
    -------
    list of str
        Paths of the files written, in left-to-right digit order.
    """
    # `import PIL` on its own does not guarantee the PIL.Image submodule is loaded.
    from PIL import Image

    if not os.path.isdir(dest):
        os.makedirs(dest)

    with Image.open(image) as source:
        pixels = numpy.array(source.convert('RGB'))

    # NOTE(review): `pixels` is RGB, so COLOR_BGR2GRAY applies the red and blue
    # weights swapped. Left unchanged because switching to COLOR_RGB2GRAY could
    # shift which pixels cross the threshold; confirm against the data the
    # classifier was trained on before changing it.
    gray = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)

    # findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in 2.x and 4.x; contours is always second to last.
    contours = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Order the bounding boxes by their left edge so digits come out left to right.
    boxes = sorted((cv2.boundingRect(c) for c in contours), key=lambda box: box[0])
    digit_boxes = [box for box in boxes if box[3] == digit_height]

    saved = []
    for index, (x, y, w, h) in enumerate(digit_boxes):
        # One fresh figure per digit, closed even when saving fails, so no
        # figure is left open or reused between digits.
        fig, ax = plt.subplots()
        try:
            ax.imshow(pixels[y:y + h, x:x + w])
            path = os.path.join(dest, '{}.jpg'.format(index))
            fig.savefig(path, dpi=100)
        finally:
            plt.close(fig)
        saved.append(path)

    return saved

        
def predictKaptcha(dest):
    """
    Predict the captcha digits from the digit images stored in a directory.

    Every regular file in ``dest`` is opened, converted to a 1-bit image,
    resized to a width of 50 pixels (height scaled proportionally) and
    flattened into one feature row. The rows are standardised with a
    ``StandardScaler`` fitted on those same rows and handed to the
    module-level classifier ``clf``.

    Files are processed in numeric order of their base name (``0.jpg``,
    ``1.jpg``, ..., ``10.jpg``) so the predictions line up with the digit
    order; files whose base name is not a number follow in alphabetical order.

    Parameters
    ----------
    dest : str
        Directory containing one image per digit.

    Returns
    -------
    The result of ``clf.predict``: one prediction per image file.

    Raises
    ------
    ValueError
        If ``dest`` contains no files.
    """
    def digit_order(name):
        # os.listdir gives no ordering guarantee; sort numbered files numerically.
        stem = os.path.splitext(name)[0]
        try:
            return (0, int(stem), name)
        except ValueError:
            return (1, 0, name)

    names = sorted(
        (name for name in os.listdir(dest)
         if os.path.isfile(os.path.join(dest, name))),
        key=digit_order,
    )
    if not names:
        raise ValueError('no digit images found in {!r}'.format(dest))

    # `import PIL` on its own does not guarantee the PIL.Image submodule is loaded.
    from PIL import Image

    base_width = 50
    rows = []
    for name in names:
        with Image.open(os.path.join(dest, name)) as source:
            bilevel = source.convert('1')
        scale = base_width / float(bilevel.size[0])
        height = int(float(bilevel.size[1]) * scale)
        # LANCZOS is the filter formerly exposed as ANTIALIAS (removed in Pillow 10).
        resized = bilevel.resize((base_width, height), Image.LANCZOS)
        rows.append(list(resized.getdata()))

    digits_array = numpy.array(rows)
    # NOTE(review): the scaler is fitted on the few rows being predicted, not on
    # the training data; confirm clf was trained on features scaled this way.
    scaler = StandardScaler()
    data_scaled = scaler.fit(digits_array).transform(digits_array)
    return clf.predict(data_scaled)


def getVerificationCode(driver):
    """
    Capture the captcha image shown on the current page with Selenium.

    A screenshot of the page is written to ``test.png``, cropped to the
    bounding box of the element whose id is ``kaptcha``, and the crop is saved
    as ``kaptcha.jpg`` (RGB JPEG). Both files are written to the current
    working directory.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session currently showing the page with the captcha.
    """
    from PIL import Image

    driver.save_screenshot('test.png')

    # find_element(by, value) is available in Selenium 2 through 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3. 'id' is the
    # string value of By.ID.
    element = driver.find_element('id', 'kaptcha')

    # Read each property once; every access is a round trip to the browser.
    location = element.location
    size = element.size
    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']

    # NOTE(review): assumes screenshot pixels match the element's page
    # coordinates (no device-pixel-ratio scaling, element within the captured
    # area); verify on high-DPI displays.
    with Image.open('test.png') as screenshot:
        cropped = screenshot.crop((left, top, right, bottom))
        cropped.convert("RGB").save('kaptcha.jpg', 'JPEG')

In [9]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
# driver = webdriver.Chrome('chromedriver.exe')
driver = webdriver.Firefox(executable_path=r'geckodriver.exe')

driver.get('http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do')

# Keep solving captchas until the page no longer shows one.
while True:
    getVerificationCode(driver)              # writes kaptcha.jpg
    saveKaptcha('kaptcha.jpg', 'imagedata')  # writes imagedata/0.jpg, 1.jpg, ...
    kaptcha = predictKaptcha('imagedata')
    print(kaptcha)

    # find_element(by, value) works in Selenium 2 through 4; the
    # find_element_by_* helpers were removed in Selenium 4.3.
    query_box = driver.find_element('name', 'queryStr')
    if query_box.get_attribute('value') != '':
        query_box.clear()
    query_box.send_keys('台灣積體電路')

    # Join the predicted digits into the string typed into the captcha field.
    kaptcha = ''.join(str(digit) for digit in kaptcha)
    driver.find_element('name', 'imageCode').send_keys(kaptcha)
    time.sleep(3)

    driver.find_element('name', 'submitData').click()
    time.sleep(3)

    # Stop once the captcha element is gone; presumably that means the
    # submission was accepted. Only "element not found" ends the loop, so a
    # crashed browser or a keyboard interrupt is no longer swallowed.
    try:
        driver.find_element('id', 'kaptcha')
    except NoSuchElementException:
        break




[1 9 8 3 1 1]
[9 0 2 2 3 5]
[7 4 2 2 3 7]
[2 9 4 3 2 7]
[1 6 3 0 3 7]
[4 0 7 8 9 7]
[4 7 8 3 3 2]
[1 5 2 1 8 3]
