In [7]:
import os.path
from pathlib import Path
from sys import platform
from typing import Optional, Type, Union

from bs4 import BeautifulSoup

import easyocr
import matplotlib.pyplot as plt
import cv2
import numpy as np
import requests
from PIL import Image
import logging
import argparse
from multiprocessing import cpu_count
from concurrent import futures as cf

import pandas as pd
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService

from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeService

from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as GeckoService

from selenium.webdriver.safari.options import Options as SafariOptions
from selenium.webdriver.safari.webdriver import WebDriver as SafariDriver

from selenium.webdriver.remote.webdriver import WebDriver

from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
from webdriver_manager.core.utils import ChromeType
from io import BytesIO
from tqdm import tqdm

# Type alias covering the option classes of the four browsers listed in
# open_selenium_browser's options table (Chrome, Edge, Firefox, Safari).
BrowserOptions = Union[ChromeOptions, EdgeOptions, FirefoxOptions, SafariOptions]

# Landing page of the verification site; passed to go_to_page_with_selenium
# by the full-process test cell. (Plain string: there is nothing to interpolate.)
url = 'https://verify.bmdc.org.bd/'

## Utility functions for scraping

In [8]:
def open_selenium_browser(browser_name: str, headless: bool):
    """Start a Selenium WebDriver for the requested browser.

    Args:
        browser_name (str): One of "chrome", "edge", "firefox" or "safari".
        headless (bool): When True, "--headless" and "--disable-gpu" are added
            to the browser options before the driver is created.

    Returns:
        The started WebDriver instance. The caller is responsible for
        calling ``quit()`` on it.

    Raises:
        ValueError: If ``browser_name`` is not one of the supported names.
    """
    options_available = {
        "chrome": ChromeOptions,
        "edge": EdgeOptions,
        "firefox": FirefoxOptions,
        "safari": SafariOptions,
    }
    if browser_name not in options_available:
        supported = ", ".join(sorted(options_available))
        raise ValueError(
            f"Unsupported browser {browser_name!r}; expected one of: {supported}"
        )
    options = options_available[browser_name]()

    if headless:
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")

    if browser_name == "edge":
        driver = webdriver.Edge(
            service=EdgeService(
                EdgeDriverManager().install()
            ),
            options=options
        )
    elif browser_name == "firefox":
        options.log.level = "fatal"
        driver = webdriver.Firefox(
            service=GeckoService(
                GeckoDriverManager().install()
            ),
            options=options
        )
    elif browser_name == "safari":
        # Safari previously fell through to the Chrome branch below, which
        # handed SafariOptions to webdriver.Chrome. No webdriver_manager
        # download is involved for Safari.
        driver = webdriver.Safari(options=options)
    else:
        # Silence ChromeDriver's console logging.
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        # NOTE(review): the driver is requested for ChromeType.CHROMIUM even
        # though the browser name is "chrome" -- confirm this is intended.
        driver = webdriver.Chrome(
            service=ChromeService(
                ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()
            ),
            options=options
        )

    return driver


def go_to_page_with_selenium(driver, url: str="https://verify.bmdc.org.bd/") -> tuple[WebDriver, str]:
    """Open ``url`` in ``driver`` and return the markup of the page body.

    Args:
        driver: The Selenium WebDriver used to load the page.
        url (str): The address to open.

    Returns:
        Tuple[WebDriver, str]: The same driver, and the ``outerHTML`` of
        ``document.body`` as read from the live DOM.
    """
    driver.get(url)

    # Wait (up to 10 seconds) until a <body> element is present in the DOM.
    body_locator = (By.TAG_NAME, "body")
    body_is_present = EC.presence_of_element_located(body_locator)
    WebDriverWait(driver, 10).until(body_is_present)

    # Read the markup from the browser rather than from the raw HTTP response.
    body_html = driver.execute_script("return document.body.outerHTML;")
    return driver, body_html


def get_captcha_image(page_source):
    """Download the captcha image referenced by the page and return it as an array.

    Args:
        page_source (str): HTML of the verification page.

    Returns:
        numpy.ndarray: RGB pixel array of the captcha, cropped to rows 1..28
        and columns 1..98 (the outermost pixel row/column is dropped).

    Raises:
        ValueError: If the page has no ``<img>`` with a ``src`` inside the
            ``<div id="captcha1">`` element.
        requests.RequestException: If the image download fails or times out.
    """
    # Debug dump of the most recent page. Written as UTF-8 explicitly so that
    # non-ASCII page text cannot fail under a narrower platform default encoding.
    with open("page_source.txt", "w", encoding="utf-8") as file:
        file.write(page_source)

    bs_html = BeautifulSoup(page_source, 'html.parser')
    captcha_div = bs_html.find('div', {"id": "captcha1"})
    captcha_tag = captcha_div.find("img") if captcha_div is not None else None
    if captcha_tag is None or not captcha_tag.get("src"):
        raise ValueError("No captcha image found in the page source (div#captcha1 img)")
    captcha_img_url = captcha_tag["src"]

    response = requests.get(captcha_img_url, timeout=30)
    response.raise_for_status()

    # Force three channels: process_image pastes the result into an
    # (H, W, 3) canvas, which fails for palette, greyscale or RGBA input.
    img = np.array(Image.open(BytesIO(response.content)).convert("RGB"))

    return img[1:29,1:99,:]


def process_image(img):
    """Prepare a captcha image for OCR.

    The image is colour-inverted, eroded once with a 2x2 kernel, and pasted
    into a black 3-channel uint8 canvas that is 20 pixels taller and 20 pixels
    wider than the input. The paste starts at offset (20, 20), so the padding
    sits above and to the left of the image only.

    Args:
        img: 3-channel image array, as returned by get_captcha_image.

    Returns:
        numpy.ndarray: The padded uint8 image.
    """
    pad_rows = 20
    pad_cols = 20

    inverted = cv2.bitwise_not(img)
    eroded = cv2.erode(inverted, np.ones((2, 2), np.uint8), iterations=1)

    height, width = eroded.shape[0], eroded.shape[1]
    canvas = np.zeros((height + pad_rows, width + pad_cols, 3), dtype=np.uint8)
    canvas[pad_rows:height + pad_rows, pad_cols:width + pad_cols] = eroded

    return canvas


def _get_ocr_reader():
    """Return the EasyOCR reader owned by the calling thread, creating it on first use."""
    import threading  # local import: only this helper needs it

    # One reader per thread: building a Reader is expensive, so it is cached
    # instead of being rebuilt for every captcha, but it is not shared between
    # the worker threads used by main_multithread.
    readers = _get_ocr_reader.__dict__.setdefault("_readers", {})
    thread_id = threading.get_ident()
    if thread_id not in readers:
        readers[thread_id] = easyocr.Reader(['en'],)
    return readers[thread_id]


def solve_captcha(img):
    """Read the captcha text from an image with EasyOCR.

    Args:
        img: Image array prepared by process_image.

    Returns:
        str: The text of the first detection, restricted to upper-case
        letters and digits. An empty string is returned when OCR detects
        nothing, so callers can submit it and retry with a fresh captcha
        instead of crashing on an IndexError.
    """
    reader = _get_ocr_reader()
    result = reader.readtext(img, allowlist='ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    if not result:
        return ""

    # Each detection is indexed as (bounding box, text, ...); take the text of the first.
    return result[0][1]


def submit_form_selenium(driver,  doc_id, captcha_solution,):
    """Fill in the verification form and submit it.

    Args:
        driver: WebDriver currently showing the verification form.
        doc_id: Registration id to look up; typed into the form as text.
        captcha_solution: Text typed into the captcha field.

    Returns:
        tuple: The driver and the URL the browser is on after the click.
    """
    # (xpath of the input, text to type), in the order the fields are filled.
    field_values = (
        ("//div[@class='form-group']/input", f"{doc_id}"),
        ("//input[@id='captcha_code']", captcha_solution),
    )
    for field_xpath, text in field_values:
        driver.find_element(By.XPATH, field_xpath).send_keys(text)

    driver.find_element(By.XPATH, "//button[@id='submit']").click()

    return driver, driver.current_url


def _pad_elements(elements, count):
    """Return the first ``count`` items of ``elements``, padded with None."""
    return (list(elements) + [None] * count)[:count]


def _element_text(element):
    """Return ``element.text``, or None when the element is missing."""
    return element.text if element is not None else None


def get_doctor_dict_selenium(driver):
    """Read the doctor's details from the result page currently shown.

    Args:
        driver: WebDriver positioned on the page reached after submitting the form.

    Returns:
        dict: Keys "name", "bmdc_code", "registration_year",
        "registration_validity", "dob", "bg", "father_name", "mother_name",
        "permanent_add" and "reg_status". A value is None when the
        corresponding element is not on the page.

    Raises:
        NoSuchElementException: If the name or BMDC code heading is missing.
            The scraping loops in this notebook catch this and retry with a
            new captcha.
    """
    name_xpath = "//div[@class='col-md-8']/h3"
    bmdc_code_xpath = "//div[@class='text-center']/h3"

    registration_year_xpath = "//h5[@class='font-weight-bold mb-0 d-block']"
    dob_bg_lxpath = "//div[@class='form-group row mb-0']/div/h6"
    other_lxpath = "//div[@class='col-md-12']/h6"

    # find_element raises when nothing matches (it never returns a falsy
    # value), so a single lookup per field is enough.
    name = driver.find_element(By.XPATH, name_xpath)
    bmdc_code = driver.find_element(By.XPATH, bmdc_code_xpath)

    # Take the leading elements and pad with None, rather than unpacking an
    # exact count, so an unexpected number of matches cannot raise ValueError.
    registration_details = driver.find_elements(By.XPATH, registration_year_xpath)
    registration_year, registration_validity = _pad_elements(registration_details, 2)

    dob, bg = _pad_elements(driver.find_elements(By.XPATH, dob_bg_lxpath), 2)

    other_details = driver.find_elements(By.XPATH, other_lxpath)

    reg_status = other_details[-1] if other_details else None
    if reg_status is not None:
        # Scroll to the last element; skipped when there is nothing to scroll to.
        driver.execute_script("arguments[0].scrollIntoView();", reg_status)

    if len(other_details) > 2:
        father_name = other_details[0]
        mother_name = other_details[1]
        permanent_add = other_details[-2]
    else:
        father_name = None
        mother_name = None
        permanent_add = None

    doc_entry_dict = {
        "name": _element_text(name),
        "bmdc_code": _element_text(bmdc_code),
        "registration_year": _element_text(registration_year),
        "registration_validity": _element_text(registration_validity),
        "dob": _element_text(dob),
        "bg": _element_text(bg),
        "father_name": _element_text(father_name),
        "mother_name": _element_text(mother_name),
        "permanent_add": _element_text(permanent_add),
        "reg_status": _element_text(reg_status),
    }

    return doc_entry_dict

## Functions for scraping doctor pages

In [9]:
### These functions need to be organized
def doc_entry_generator(driver, id_start, id_end):
    """Yield one scraped record per id from ``id_start`` to ``id_end`` inclusive.

    For each id the landing page is reloaded, the captcha is solved and the
    form is submitted. If the result page cannot be parsed
    (NoSuchElementException), the same id is retried after a 3 second pause.

    Args:
        driver: An open WebDriver; it is reused for every request.
        id_start: First id to scrape.
        id_end: Last id to scrape (inclusive).

    Yields:
        dict: The record returned by get_doctor_dict_selenium.
    """
    # NOTE(review): an id that never produces a result page is retried
    # indefinitely -- confirm whether a retry limit is needed.
    doc_id = id_start

    # The context manager closes the bar even if the consumer stops early
    # or an unexpected exception propagates.
    with tqdm(total=id_end-id_start+1) as pbar:
        while doc_id <= id_end:
            driver, page_source = go_to_page_with_selenium(driver)
            captcha_img = get_captcha_image(page_source)
            captcha_img = process_image(captcha_img)
            captcha_solution = solve_captcha(captcha_img)

            driver, _ = submit_form_selenium(driver, doc_id, captcha_solution)
            try:
                doc_entry_dict = get_doctor_dict_selenium(driver)
            except NoSuchElementException:
                doc_entry_dict = None

            if doc_entry_dict is not None:
                yield doc_entry_dict
                doc_id += 1
                # Count completed ids only, so the bar matches its total
                # instead of advancing on failed captcha attempts as well.
                pbar.update(1)
            time.sleep(3)


def single_doc_entry(id, browser_name, headless):
    """Scrape the record for one id using a browser opened just for this call.

    The captcha is retried (with a 3 second pause between attempts) until the
    result page can be parsed.

    Args:
        id: Registration id to look up.
        browser_name: Browser name passed to open_selenium_browser.
        headless: Whether to run the browser headless.

    Returns:
        dict: The record returned by get_doctor_dict_selenium.
    """
    driver = open_selenium_browser(browser_name, headless=headless)
    try:
        while True:
            driver, page_source = go_to_page_with_selenium(driver)
            captcha_img = get_captcha_image(page_source)
            captcha_img = process_image(captcha_img)
            captcha_solution = solve_captcha(captcha_img)
            driver, _ = submit_form_selenium(driver, id, captcha_solution)
            try:
                return get_doctor_dict_selenium(driver)
            except NoSuchElementException:
                # Result page not reached; try again with a new captcha.
                pass
            time.sleep(3)
    finally:
        # quit() (not close()) so the driver process is shut down as well,
        # and it runs on every exit path, including unexpected exceptions.
        driver.quit()


def mp_doc_entry(id_start, id_end, browser_name, headless):
    """Scrape every id from ``id_start`` to ``id_end`` inclusive in one browser.

    A browser is opened for this call and reused for all ids. An id whose
    result page cannot be parsed (NoSuchElementException) is retried after a
    3 second pause.

    Args:
        id_start: First id to scrape.
        id_end: Last id to scrape (inclusive). Nothing is scraped when it is
            smaller than ``id_start``.
        browser_name: Browser name passed to open_selenium_browser.
        headless: Whether to run the browser headless.

    Returns:
        list: One record dict per id, in id order.
    """
    driver = open_selenium_browser(browser_name, headless=headless)
    doc_list = []
    doc_id = id_start
    try:
        while doc_id <= id_end:
            driver, page_source = go_to_page_with_selenium(driver)
            captcha_img = get_captcha_image(page_source)
            captcha_img = process_image(captcha_img)
            captcha_solution = solve_captcha(captcha_img)

            driver, _ = submit_form_selenium(driver, doc_id, captcha_solution)
            try:
                doc_entry_dict = get_doctor_dict_selenium(driver)
                doc_list.append(doc_entry_dict)
                doc_id += 1
            except NoSuchElementException:
                # Result page not reached; retry the same id.
                pass
            time.sleep(3)
    finally:
        # quit() (not close()) so the driver process is shut down as well,
        # and it runs even when an unexpected exception propagates.
        driver.quit()
    return doc_list

## Helper Functions

In [10]:
## Helpers
def divide_doc_ids(doc_id_start, doc_id_end, n_workers):
    """Split the inclusive id range into ``n_workers`` consecutive chunks.

    Every chunk spans ``(doc_id_end - doc_id_start) // n_workers + 1`` ids,
    and chunk ends are capped at ``doc_id_end``. When there are more workers
    than ids, the surplus chunks come back with a start greater than their end.

    Args:
        doc_id_start: First id of the range.
        doc_id_end: Last id of the range (inclusive).
        n_workers: Number of chunks to produce.

    Returns:
        tuple: ``(starts, ends)``, two lists of length ``n_workers``.
    """
    stride = (doc_id_end - doc_id_start) // n_workers + 1
    starts = []
    ends = []
    for worker in range(n_workers):
        first = doc_id_start + worker * stride
        starts.append(first)
        ends.append(min(first + stride - 1, doc_id_end))
    return starts, ends

## Main Functions

In [11]:
## Whole processes
def main_normal(doc_id_start, doc_id_end, browser_name, headless):
    """Scrape the id range sequentially in a single browser.

    Args:
        doc_id_start: First id to scrape.
        doc_id_end: Last id to scrape (inclusive).
        browser_name: Browser name passed to open_selenium_browser.
        headless: Whether to run the browser headless.

    Returns:
        pandas.DataFrame: One row per scraped id.
    """
    # "bg" is scraped by get_doctor_dict_selenium and is kept by
    # main_multithread / main_multiprocess; it is listed here (last, matching
    # their column order) so this entry point no longer drops it.
    columns = ["name", "bmdc_code", "registration_year", "registration_validity", "dob",
               "father_name", "mother_name", "permanent_add", "reg_status", "bg"]

    driver = open_selenium_browser(browser_name, headless=headless)
    try:
        # The generator is consumed here, while the browser is still open.
        rows = list(doc_entry_generator(driver, doc_id_start, doc_id_end))
    finally:
        # Close the browser even when scraping fails part-way.
        driver.quit()
    df = pd.DataFrame(rows, columns=columns)
    return df


def main_multithread(doc_id_start, doc_id_end, browser_name, headless, workers=4):
    """Scrape the id range with a thread pool, one browser per worker.

    The range is split by divide_doc_ids and each chunk is handled by
    mp_doc_entry in its own thread.

    Args:
        doc_id_start: First id to scrape.
        doc_id_end: Last id to scrape (inclusive).
        browser_name: Browser name passed to open_selenium_browser.
        headless: Whether to run the browsers headless.
        workers: Number of threads, and of chunks the range is split into.

    Returns:
        pandas.DataFrame: One row per scraped id, in chunk-completion order.
    """
    # Divide to smaller sub-tasks: Divide the doc-ids into `n_workers` parts
    starts, ends = divide_doc_ids(doc_id_start, doc_id_end, n_workers=workers)
    print(f"starts: {starts}")
    print(f"ends: {ends}")

    # "bg" comes last: that is where the previous row-by-row append placed it.
    columns = ["name", "bmdc_code", "registration_year", "registration_validity", "dob",
               "father_name", "mother_name", "permanent_add", "reg_status", "bg"]
    records = []

    with cf.ThreadPoolExecutor(max_workers=workers) as pool:
        fs = [pool.submit(mp_doc_entry, start_id, end_id, browser_name, headless) for start_id, end_id in zip(starts, ends)]
        total_tasks = doc_id_end - doc_id_start + 1

        # The total counts ids while each future returns a whole chunk, so the
        # bar is advanced by the chunk size rather than once per future.
        with tqdm(total=total_tasks, bar_format='{l_bar}{bar:20}{r_bar}{bar:-20b} {percentage:3.0f}%') as pbar:
            for f in cf.as_completed(fs):
                chunk_rows = f.result()
                records.extend(chunk_rows)
                pbar.update(len(chunk_rows))

    # Build the frame once instead of growing it row by row through the
    # private DataFrame._append.
    df = pd.DataFrame(records, columns=columns)

    return df


def main_multiprocess(doc_id_start, doc_id_end, browser_name, headless, workers=4):
    """Scrape the id range with a process pool, one task (and browser) per id.

    Args:
        doc_id_start: First id to scrape.
        doc_id_end: Last id to scrape (inclusive).
        browser_name: Browser name passed to open_selenium_browser.
        headless: Whether to run the browsers headless.
        workers: Number of worker processes.

    Returns:
        pandas.DataFrame: One row per scraped id, in task-completion order.
    """
    # NOTE(review): single_doc_entry is defined in a notebook cell; worker
    # processes may be unable to import it on platforms that spawn new
    # interpreters -- confirm this runs in your environment.

    # "bg" comes last: that is where the previous row-by-row append placed it.
    columns = ["name", "bmdc_code", "registration_year", "registration_validity", "dob",
               "father_name", "mother_name", "permanent_add", "reg_status", "bg"]
    records = []

    with cf.ProcessPoolExecutor(max_workers=workers) as executor:
        total_tasks = doc_id_end - doc_id_start + 1
        fs = [executor.submit(single_doc_entry, i, browser_name, headless) for i in range(doc_id_start, doc_id_end+1)]
        for f in tqdm(cf.as_completed(fs), total=total_tasks, bar_format='{l_bar}{bar:20}{r_bar}{bar:-20b} {percentage:3.0f}%'):
            records.append(f.result())

    # Build the frame once instead of growing it row by row through the
    # private DataFrame._append.
    df = pd.DataFrame(records, columns=columns)

    return df

# TEST FULL PROCESS

In [13]:
# End-to-end check of a single lookup (id 35) in a visible Chrome window:
# load the page, solve the captcha, submit the form, read the result.
driver = open_selenium_browser(browser_name="chrome", headless=False)
try:
    driver, page_source = go_to_page_with_selenium(driver, url)
    captcha_img = get_captcha_image(page_source)
    captcha_img = process_image(captcha_img)
    captcha_solution = solve_captcha(captcha_img)

    driver, current_url = submit_form_selenium(driver, 35, captcha_solution)
    doc_entry_dict = get_doctor_dict_selenium(driver)
finally:
    # Close the browser even when a step above raises (e.g. a wrong captcha).
    driver.quit()

In [None]:
# Scrape the record for id 21 with single_doc_entry (Chrome, headless=False)
# and display the returned dict as the cell output.
doc_entry = single_doc_entry(21, "chrome", False)
doc_entry

## Rough Work

In [3]:
# Scratch loop: walk ids 11..77 in consecutive batches and print each batch's
# bounds. The first batch covers `delta + 1` ids, later ones `delta`, and the
# final one is cut off at 77.
delta = 20
id_start = 11
id_end = id_start + delta
while id_start <= 77:
    # df = main_multiprocess(id_start, id_end, browser_name, headless, workers=workers)
    # df.to_csv(f"./scraped_data/doctor_{id_start}_{id_end}.csv", index=False)
    print(id_start, id_end)
    # The next batch starts right after the current end; its end is capped at 77.
    id_start, id_end = id_end + 1, min(id_end + delta, 77)

11 31
32 51
52 71
72 77


In [8]:
# NOTE: this rebinds `platform` to the stdlib module, replacing the string
# brought in by `from sys import platform` in the first cell.
import platform

# Not displayed: a notebook cell only shows the value of its last expression.
platform.architecture()
import webdriver_manager.core.utils as utils

# OS identifier as reported by webdriver_manager; the recorded run shows 'win64'.
utils.os_type()

'win64'

In [5]:
# CPU count from multiprocessing.cpu_count(); the recorded run shows 16.
num_processes = cpu_count()
num_processes

16

In [None]:
# Scratch check of the chunking arithmetic (same formula as divide_doc_ids):
# split ids s..e into w chunks of D + 1 ids each, capping the last end at e.
s, e, w = 11, 28, 5

D = (e - s) // w
start = [s + k * (D + 1) for k in range(w)]
end = [min(first + D, e) for first in start]
start, end

In [51]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Generator used by the thread-pool experiment below
def countdown(n, low):
    """Yield n, n - 1, n - 2, ... for as long as the value stays above ``low``."""
    remaining = n
    while True:
        if not remaining > low:
            return
        yield remaining
        remaining -= 1

# Task submitted to the pool: drains one countdown generator
def consume_generator(upper_limit, lower_limit=0):
    """Consume ``countdown(upper_limit, lower_limit)`` and return its items as a list.

    The limits are printed once up front; each item then costs a one second
    sleep (standing in for I/O-bound work) and is printed as it is collected.
    """
    items = countdown(upper_limit, lower_limit)
    print(f"Upper_limit_ {upper_limit}, Lower_limit_ {lower_limit}")

    def _handle(value):
        # Simulated I/O wait, then report the item.
        time.sleep(1)
        print(f"Item_{value}\n")
        return value

    collected = []
    for value in items:
        collected.append(_handle(value))
    return collected
# Create a generator
# generator = countdown(25)

# Run the consume_generator tasks on a pool limited to two worker threads
with ThreadPoolExecutor(max_workers=2) as executor:
    # Submit 14 tasks (upper limits 1..14); at most two of them run at a time
    fs1 = [executor.submit(consume_generator, x) for x in range(1,15)]
    # `i` numbers the futures in completion order; it is not a thread id,
    # even though the printed label says "Thread number"
    for i, f in enumerate(as_completed(fs1)):
        print(f"Thread number {i}:")
        try:
            results = f.result()
        except ValueError:
            # Nothing in consume_generator or countdown raises ValueError explicitly
            continue
        for result in results:
            print(result)

        # print(result)
    # executor.submit(consume_generator, 15)

Upper_limit_ 1, Lower_limit_ 0
Upper_limit_ 2, Lower_limit_ 0
Item_1

Upper_limit_ 3, Lower_limit_ 0
Thread number 0:
1
Item_2

Item_3

Item_1

Upper_limit_ 4, Lower_limit_ 0
Thread number 1:
2
1
Item_2

Item_4

Item_1

Upper_limit_ 5, Lower_limit_ 0
Thread number 2:
3
2
1
Item_3

Item_5

Item_2

Item_4

Item_1

Upper_limit_ 6, Lower_limit_ 0
Thread number 3:
4
3
2
1
Item_3

Item_6

Item_2

Item_5

Item_1

Upper_limit_ 7, Lower_limit_ 0
Thread number 4:
5
4
3
2
1
Item_4

Item_7

Item_3

Item_6

Item_2

Item_5

Item_1

Upper_limit_ 8, Lower_limit_ 0
Thread number 5:
6
5
4
3
2
1
Item_4

Item_8

Item_3

Item_7

Item_2

Item_6

Item_1

Upper_limit_ 9, Lower_limit_ 0
Thread number 6:
7
6
5
4
3
2
1
Item_5

Item_9

Item_4

Item_8

Item_3

Item_7

Item_2

Item_6

Item_1

Upper_limit_ 10, Lower_limit_ 0
Thread number 7:
8
7
6
5
4
3
2
1
Item_5

Item_10

Item_4

Item_9

Item_3

Item_8

Item_2

Item_7

Item_1

Upper_limit_ 11, Lower_limit_ 0
Thread number 8:
9
8
7
6
5
4
3
2
1
Item_6

Item_11

Item

In [50]:
# Sequential baseline: call consume_generator for upper limits 1..14, one after another.
# NOTE(review): the recorded output lists None for every call, while
# consume_generator as defined in this notebook returns a list -- presumably
# the output predates the `return out` line; re-run to refresh.
res = [consume_generator(x) for x in range(1,15)]
res

Upper_limit_ 1, Lower_limit_ 0
Item_1

Upper_limit_ 2, Lower_limit_ 0
Item_2

Item_1

Upper_limit_ 3, Lower_limit_ 0
Item_3

Item_2

Item_1

Upper_limit_ 4, Lower_limit_ 0
Item_4

Item_3

Item_2

Item_1

Upper_limit_ 5, Lower_limit_ 0
Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 6, Lower_limit_ 0
Item_6

Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 7, Lower_limit_ 0
Item_7

Item_6

Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 8, Lower_limit_ 0
Item_8

Item_7

Item_6

Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 9, Lower_limit_ 0
Item_9

Item_8

Item_7

Item_6

Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 10, Lower_limit_ 0
Item_10

Item_9

Item_8

Item_7

Item_6

Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 11, Lower_limit_ 0
Item_11

Item_10

Item_9

Item_8

Item_7

Item_6

Item_5

Item_4

Item_3

Item_2

Item_1

Upper_limit_ 12, Lower_limit_ 0
Item_12

Item_11

Item_10

Item_9

Item_8

Item_7

Item_6

Item_5

Item_4

Item_3

Item_2

Ite

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [316]:
def my_func(n1, n2, random_param=0, random_param2=1):
    """Yield the square of every integer in ``range(n1, n2)``.

    ``random_param`` and ``random_param2`` are accepted but not used.
    """
    yield from (value**2 for value in range(n1, n2))



# Experiment: submit a generator function to a thread pool.
# my_func contains `yield`, so each submitted call returns a generator object
# and the future's result is that generator (see the printed reprs in the
# recorded output).
with ThreadPoolExecutor(max_workers=5) as pool:
    input_range = range(2,7)
    # results = list(pool.map(my_func, input_range, [0,]*len(input_range), [0,]*len(input_range)))
    fs = [pool.submit(my_func, x, x+3, 0) for x in range(2,7)]

# results
outs = []
for f in as_completed(fs):
    print(f.result())
    out = []
    # The squares are produced here, while this loop iterates the generator;
    # f.result() returns the same generator object on every call.
    for res in f.result():
        out.append(res)
    outs.append(out)
outs

<generator object my_func at 0x000001CD1FEF4190>
<generator object my_func at 0x000001CD1FEFC040>
<generator object my_func at 0x000001CD1FEFE5F0>
<generator object my_func at 0x000001CD1FEF45F0>
<generator object my_func at 0x000001CD1FEF49E0>


[[25, 36, 49], [16, 25, 36], [36, 49, 64], [9, 16, 25], [4, 9, 16]]

In [252]:
# `f` is whichever future a previous cell left bound; this cell relies on that leftover state.
x = f.result()

In [258]:
# The recorded run raised StopIteration -- presumably the generator had already been fully consumed.
x.__next__()

StopIteration: 