In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# --- Browser setup ------------------------------------------------------------
# Headless Chrome with a fixed viewport so the page lays out the same way on
# every run.
options = Options()
for chrome_flag in ("--headless", "--disable-gpu", "--no-sandbox", "--window-size=1920,1080"):
    options.add_argument(chrome_flag)

options.add_experimental_option("prefs", {
    # Chrome expects an absolute directory here; the drive colon was missing
    # ("D\\Python..."), which made this a relative path.
    "download.default_directory": "D:\\Python\\DataForge-automated\\MathTHPT2025",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # The misspelled duplicate key "safebrowsing.ebabled" was dropped; this is
    # the only safe-browsing pref that was spelled correctly.
    "safebrowsing.enabled": False,
})
driver = webdriver.Chrome(options=options)

# 1 -> first URL below, any other value -> second URL.
useOutsideOCR = 1
driver.get("https://dotsocr.xiaohongshu.com/" if useOutsideOCR == 1 else "https://dotsocr.trunghsgs.edu.vn/")

# Shared 60-second wait used for the initial page load checks.
wait = WebDriverWait(driver, 60)

try:
    wait.until(lambda d: d.execute_script("return document.readyState") == "complete")

    # The page is considered usable once either the parse button or the file
    # input is present; try the button first, then fall back to the input.
    try:
        wait.until(EC.presence_of_element_located((By.ID, "parse_button")))
    except Exception:
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='file']")))
        except Exception as e:
            # Best effort: report instead of silently ignoring, but keep going.
            print("Neither the parse button nor the file input appeared:", e)
    time.sleep(1)  # short settle time before the next cell interacts with the page
    print("Page loaded")
except Exception as e:
    print("Timeout waiting for page to load:", e)

In [None]:
from dotenv import load_dotenv, find_dotenv
import os, json, re

import ast  # used by the literal_eval fallback below; it was never imported before


def _parse_folder_list(raw):
    """Turn the LOADING_FOLDER setting into a list of folder names.

    Accepted forms, tried in order:
      1. a JSON array, e.g. '["a", "b"]'
      2. a Python literal list/tuple, e.g. "['a', 'b']"
      3. a plain string split on ',' or ';'

    A JSON/Python string literal is treated as a single folder name. Any other
    parsed value (number, dict, ...) falls through to the plain split.
    """
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(raw)
        except (ValueError, SyntaxError):
            # json.JSONDecodeError is a ValueError; literal_eval raises
            # ValueError or SyntaxError on malformed input.
            continue
        if isinstance(parsed, (list, tuple)):
            return [str(item) for item in parsed]
        if isinstance(parsed, str):
            return [parsed]
    return [s.strip() for s in re.split(r'[;,]', raw) if s.strip()]


# Load variables from a .env file when one can be located.
env_path = find_dotenv()
if env_path:
    load_dotenv(env_path)
    print(f"Loaded .env from: {env_path}")
else:
    load_dotenv()
    print("No .env file found; loaded environment from system variables")

# Root directory holding the input folders, and the list of folders to process.
data_dir = os.environ.get("DATA_DIR", "MathTHPT2025")
loading_folder_env = os.environ.get("LOADING_FOLDER", '["1.Chuyen de", "2.De chac diem 8", "3.De chac diem 9", "4.De luyen them", "5.De quan trong", "6.De so"]')
loading_folder = _parse_folder_list(loading_folder_env)

In [None]:
import os
from pathlib import Path
import logging
from logging.handlers import RotatingFileHandler
# --- Logging setup --------------------------------------------------------------
# Writes to <data_dir>/process.log (rotating, 5 MiB x 5 backups) and to the
# notebook output stream.
log_dir = Path(data_dir)
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / "process.log"

logger = logging.getLogger("pdf_processor")
logger.setLevel(logging.INFO)

# Avoid adding duplicate handlers when re-running the cell.
# FileHandler stores `baseFilename` as an absolute path, so the comparison must
# use the absolute form too; comparing against a relative `log_path` never
# matched and attached a fresh pair of handlers on every re-run.
log_path_abs = os.path.abspath(str(log_path))
already_configured = any(
    isinstance(h, RotatingFileHandler) and h.baseFilename == log_path_abs
    for h in logger.handlers
)
if not already_configured:
    fmt = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")

    fh = RotatingFileHandler(str(log_path), maxBytes=5 * 1024 * 1024, backupCount=5, encoding="utf-8")
    fh.setLevel(logging.INFO)
    fh.setFormatter(fmt)
    logger.addHandler(fh)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    logger.addHandler(ch)

# convenience alias (use logger.info(...) or log(...))
log = logger.info
import re
import zipfile

# --- Tunables for the conversion loop ---------------------------------------------
OUTPUT_ROOT = os.path.join(os.pardir, os.pardir, "output")  # results root, relative to the cwd
UPLOAD_INPUT_SELECTOR = "input[type='file'][data-testid='file-upload']"
LONG_TIMEOUT_S = 1800   # upload / parse / download can each take a long time
SHORT_TIMEOUT_S = 30    # ordinary UI interactions
# Suffixes browsers use for files that are still being written.
PARTIAL_DOWNLOAD_SUFFIXES = (".crdownload", ".tmp")


def _wait_for_download(directory, known_files, timeout_s, poll_s=0.5):
    """Poll `directory` until a finished file that is not in `known_files` appears.

    Returns the new file's name, or None when nothing completed within
    `timeout_s` seconds.
    """
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        time.sleep(poll_s)
        for name in sorted(set(os.listdir(directory)) - known_files):
            if not name.endswith(PARTIAL_DOWNLOAD_SUFFIXES):
                return name
    return None


def _merge_pages(pages_dir, suffix, merged_path):
    """Concatenate `demo_<suffix>_page_<n>.md` files from `pages_dir` into `merged_path`.

    Pages are ordered by their numeric index and separated by a blank line so
    markdown blocks from different pages stay distinct. Returns the number of
    pages merged (the output file is written even when that number is 0).
    """
    page_re = re.compile(r"demo_" + re.escape(suffix) + r"_page_(\d+)\.md")
    numbered = []
    for name in os.listdir(pages_dir):
        found = page_re.fullmatch(name)
        if found:
            numbered.append((int(found.group(1)), name))
    numbered.sort()

    chunks = []
    for _, name in numbered:
        with open(os.path.join(pages_dir, name), "r", encoding="utf-8") as part_f:
            chunks.append(part_f.read())
    with open(merged_path, "w", encoding="utf-8") as out_f:
        out_f.write("\n\n".join(chunks))
    return len(chunks)


for folder in loading_folder:
    pdf_files = []
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        print("Skipping missing folder:", folder_path)
        continue

    # Iterate over every PDF in the folder (sorted for a reproducible order).
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith('.pdf'):
            continue
        pdf_files.append(file)
        file_path = os.path.abspath(os.path.join(folder_path, file))

        # Upload the file through the page's file input.
        file_input = driver.find_element(By.CSS_SELECTOR, UPLOAD_INPUT_SELECTOR)
        file_input.send_keys(file_path)
        print("Sent file to upload input:", file_path)

        # Wait for the upload indicator to go away, then click the parse button.
        try:
            WebDriverWait(driver, LONG_TIMEOUT_S).until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, "span.uploading"))
            )
            print("Upload indicator disappeared")
        except Exception as e:
            print("Timeout waiting for upload indicator to disappear:", e)
        time.sleep(2)
        parse_btn = driver.find_element(By.ID, "parse_button")
        parse_btn.click()
        print("Clicked parse button")

        try:
            tab_btn = WebDriverWait(driver, SHORT_TIMEOUT_S).until(
                EC.element_to_be_clickable((By.ID, "component-38-button"))
            )
            tab_btn.click()
            print("Clicked Markdown Raw Text tab")
        except Exception as e:
            print("Failed to click Markdown Raw Text tab:", e)

        # Per-PDF result locations. Absolute, normalised paths are used so the
        # directory handed to the browser contains no ".." components.
        file_no_ext = os.path.splitext(file)[0]
        download_dir = os.path.abspath(os.path.join(OUTPUT_ROOT, folder, file_no_ext))
        extract_to = os.path.join(download_dir, "output", "all_pages")
        final_path = os.path.join(download_dir, "output", "final.md")
        os.makedirs(download_dir, exist_ok=True)
        before_files = set(os.listdir(download_dir))

        try:
            driver.execute_cdp_cmd("Page.setDownloadBehavior", {"behavior": "allow", "downloadPath": download_dir})
        except Exception as e:
            print("Could not set download directory via CDP:", e)

        wait = WebDriverWait(driver, LONG_TIMEOUT_S)
        btn = wait.until(EC.element_to_be_clickable((By.ID, "component-45")))
        print("Target download dir:", download_dir)
        print("Download button found, clicking...")
        btn.click()

        # Wait for the download to finish.
        downloaded = _wait_for_download(download_dir, before_files, LONG_TIMEOUT_S)

        if downloaded:
            print("Downloaded file:", downloaded)

            # Unpack the archive.
            zip_path = os.path.join(download_dir, downloaded)
            os.makedirs(extract_to, exist_ok=True)
            with zipfile.ZipFile(zip_path, 'r') as z:
                z.extractall(extract_to)
            print("Extracted:", zip_path, "->", extract_to)

            # Merge the per-page files into a single markdown file.
            # e.g. 'ef13eefa' from 'layout_results_ef13eefa.zip'
            suffix = os.path.splitext(downloaded)[0].split('_')[-1]
            page_count = _merge_pages(extract_to, suffix, final_path)
            if page_count:
                print("Merged pages into:", final_path)
            else:
                print("No page files matched suffix", repr(suffix), "in", extract_to)
        else:
            # Nothing to extract or merge; still reset the page below so the
            # next PDF can be processed.
            print("No completed download detected within timeout.")

        # Click the "clear all" button.
        try:
            clear_btn = WebDriverWait(driver, SHORT_TIMEOUT_S).until(
                EC.element_to_be_clickable((By.ID, "component-13"))
            )
            clear_btn.click()
            print("Clicked Clear button (component-13)")
            # wait for the upload input to become available again before processing next file
            WebDriverWait(driver, SHORT_TIMEOUT_S).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, UPLOAD_INPUT_SELECTOR))
            )
            time.sleep(1)
        except Exception as e:
            print("Failed to click Clear button or wait for reset:", e)

        # The clear has finished once the file drop area is visible again.
        try:
            drop_selector = "div.wrap.svelte-12ioyct"
            WebDriverWait(driver, LONG_TIMEOUT_S).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, drop_selector))
            )
            print("Drop-area div appeared")
        except Exception as e:
            print("Timeout waiting for drop-area div to appear:", e)
        print()
    print(pdf_files)

In [None]:
"""
import os
from pathlib import Path
import logging
from logging.handlers import RotatingFileHandler
log_dir = Path(data_dir)
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / "process.log"

logger = logging.getLogger("pdf_processor")
logger.setLevel(logging.INFO)

# avoid adding duplicate handlers when re-running the cell
if not any(isinstance(h, RotatingFileHandler) and h.baseFilename == str(log_path) for h in logger.handlers):
    fh = RotatingFileHandler(str(log_path), maxBytes=5 * 1024 * 1024, backupCount=5, encoding="utf-8")
    fh.setLevel(logging.INFO)
    fmt = logging.Formatter("%(asctime)s %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
    fh.setFormatter(fmt)
    logger.addHandler(fh)

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    logger.addHandler(ch)

# convenience alias (use logger.info(...) or log(...))
log = logger.info
for folder in loading_folder:
    pdf_files = []
    folder_path = os.path.join(data_dir, folder)
    for file in os.listdir(folder_path):
        if file.lower().endswith('.pdf'): #Duyet tung file pdf
            pdf_files.append(file)
            file_path = os.path.abspath(os.path.join(folder_path, file))
            file_no_ext = os.path.splitext(file)[0]
            download_dir = os.path.join(folder_path, file_no_ext, "zip")

            target_download_dir = os.path.join(os.getcwd(), download_dir)
            os.makedirs(target_download_dir, exist_ok=True)
            download_dir = target_download_dir
            before_files = set(os.listdir(download_dir))

            timeout = 1800
            end_time = time.time() + timeout
            downloaded = None
            while time.time() < end_time:
                time.sleep(0.5)
                new_files = before_files
                if new_files:
                    for f in new_files:
                        if not f.endswith('.crdownload'):
                            downloaded = f
                            break
                if downloaded:
                    break

            if downloaded:
                print("Downloaded file:", downloaded)
            else:
                print("No completed download detected within timeout.")
            
            #giai nen file
            import os, zipfile
            import logging
            from logging.handlers import RotatingFileHandler
            from pathlib import Path

            zip_path = os.path.join(download_dir, downloaded)  # uses existing vars
            extract_to = os.path.join(folder_path, file_no_ext, "output")

            with zipfile.ZipFile(zip_path, 'r') as z:
                z.extractall(extract_to)

            print("Extracted:", zip_path, "->", extract_to)

            #ghep lai thanh 1 file md
            suffix = os.path.splitext(f)[0].split('_')[-1]  # e.g. 'ef13eefa' from 'layout_results_ef13eefa.zip'
            final_path = os.path.join(extract_to, "final.md")

            with open(final_path, "w", encoding="utf-8") as out_f:
                for i in range(0, 100):
                    part_name = f"demo_{suffix}_page_{i}.md"
                    part_path = os.path.join(extract_to, part_name)
                    if not os.path.exists(part_path):
                        #print("Missing:", part_name)
                        continue
                    with open(part_path, "r", encoding="utf-8") as part_f:
                        content = part_f.read()
                    # separate pages with two newlines to keep markdown blocks distinct
                    out_f.write(content)
                    if i != 13:
                        out_f.write("\n\n")

            print("Merged pages into:", final_path)
    print(pdf_files)
"""

In [None]:
# Display the kernel's current working directory; the conversion cell joins its
# "..\\..\\output" download directory onto this value.
os.getcwd()

In [None]:
import os
import re
import base64

print("Starting post-processing of existing final.md files...")

# Markdown image whose target is an inline base64 payload; group 1 is the payload.
base64_pattern = re.compile(r"!\[.*?\]\(data:image;base64,(.*?)\)")


def extract_base64_images(content, image_dir, source_label=""):
    """Replace inline base64 images in `content` with links to files in `image_dir`.

    Each match is numbered in order of appearance (starting at 1) and written to
    `image_dir` as `image_<n>.png`; the markdown then points at
    `images/image_<n>.png`. A payload that cannot be decoded or saved is left
    unchanged in the text and reported, using `source_label` in the message.

    Returns `(new_content, match_count)`.
    """
    match_count = 0

    def _replace(match):
        nonlocal match_count
        match_count += 1

        base64_data = match.group(1)

        # Fix potential Base64 padding errors
        missing_padding = len(base64_data) % 4
        if missing_padding:
            base64_data += '=' * (4 - missing_padding)

        try:
            image_data = base64.b64decode(base64_data)
            image_filename = f"image_{match_count}.png"
            image_save_path = os.path.join(image_dir, image_filename)

            with open(image_save_path, "wb") as img_file:
                img_file.write(image_data)

            relative_image_path = os.path.join("images", image_filename).replace('\\', '/')
            return f"![]({relative_image_path})"
        except Exception as e:
            print(f"  - Error decoding/saving image {match_count} in {source_label}: {e}")
            return match.group(0)  # Return original match on error

    new_content = base64_pattern.sub(_replace, content)
    return new_content, match_count


# NOTE(review): this scans <data_dir>/<folder>/<pdf>/output/final.md, whereas the
# conversion cell writes final.md under the "output" tree two levels above the
# working directory - confirm which layout is intended.
for folder_name in loading_folder:
    folder_path = os.path.join(data_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # One sub-directory per processed PDF.
    for pdf_subfolder_name in os.listdir(folder_path):
        pdf_subfolder_path = os.path.join(folder_path, pdf_subfolder_name)
        if not os.path.isdir(pdf_subfolder_path):
            continue

        output_dir = os.path.join(pdf_subfolder_path, "output")
        # A stray print(final_path) used to sit above this assignment; it
        # raised NameError on a fresh kernel and printed a stale path otherwise.
        final_path = os.path.join(output_dir, "final.md")
        if not os.path.exists(final_path):
            continue

        image_dir = os.path.join(output_dir, "images")
        os.makedirs(image_dir, exist_ok=True)

        with open(final_path, "r", encoding="utf-8") as f:
            content = f.read()

        new_content, image_counter = extract_base64_images(content, image_dir, final_path)

        # Only touch the file when at least one image was actually replaced.
        if new_content != content:
            with open(final_path, "w", encoding="utf-8") as f:
                f.write(new_content)

        if image_counter > 0:
            print(f"Processed {final_path} and extracted {image_counter} images.")

print("\nPost-processing complete!")

In [None]:
# Installs the `markdown` package (no version pinned).
# NOTE(review): no cell visible in this notebook imports `markdown` - confirm it is still needed.
%pip install markdown

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Shut down any browser session created by an earlier cell before starting a new
# one; rebinding `driver` without quitting would leave that browser process
# running in the background.
_previous_driver = globals().get("driver")
if _previous_driver is not None:
    try:
        _previous_driver.quit()
    except Exception as e:
        print("Could not shut down the previous browser session:", e)

# Fresh headless Chrome session with the same configuration as the first setup cell.
options = Options()
for chrome_flag in ("--headless", "--disable-gpu", "--no-sandbox", "--window-size=1920,1080"):
    options.add_argument(chrome_flag)

options.add_experimental_option("prefs", {
    # Chrome expects an absolute directory here; the drive colon was missing
    # ("D\\Python..."), which made this a relative path.
    "download.default_directory": "D:\\Python\\DataForge-automated\\MathTHPT2025",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # The misspelled duplicate key "safebrowsing.ebabled" was dropped.
    "safebrowsing.enabled": False,
})
driver = webdriver.Chrome(options=options)

In [None]:
"""
import os
import re
from selenium.webdriver.common.by import By

# IMPORTANT: Ensure the Selenium 'driver' variable is initialized and running
# from your previous notebook cells before you run this one.

print("Starting HTML table-to-image conversion process...")

# UPDATED REGEX: Now looks for <table>...</table> blocks, spanning multiple lines.
table_pattern = re.compile(r'(<table.*?>.*?</table>)', re.DOTALL)

# Use the same 'data_dir' and 'loading_folder' variables from your previous cells.
for folder_name in loading_folder:
    folder_path = os.path.join(data_dir, folder_name)
    
    if not os.path.isdir(folder_path):
        continue

    # Iterate through the subdirectories created for each PDF
    for pdf_subfolder_name in os.listdir(folder_path):
        pdf_subfolder_path = os.path.join(folder_path, pdf_subfolder_name)
        
        if not os.path.isdir(pdf_subfolder_path):
            continue
            
        output_dir = os.path.join(pdf_subfolder_path, "output")
        final_md_path = os.path.join(output_dir, "final.md")

        if os.path.exists(final_md_path):
            with open(final_md_path, "r", encoding="utf-8") as f:
                content = f.read()

            image_dir = os.path.join(output_dir, "images")
            os.makedirs(image_dir, exist_ok=True)
            
            table_counter = 0

            # This function will be called for each table found by the regex
            def replace_table_with_image(match):
                global table_counter
                table_counter += 1
                
                # This variable now contains the full <table>...</table> string
                html_table_string = match.group(0)
                
                # Create a simple, self-contained HTML file to render the table
                # We no longer need to convert from Markdown, we can use the HTML directly.
                html_for_render = f
                <html>
                <head>
                    <style>
                        body {{ font-family: sans-serif; background-color: white; display: inline-block; padding: 10px; }}
                        table {{ border-collapse: collapse; margin: 1px; }}
                        th, td {{ border: 1px solid #ccc; padding: 8px; text-align: left; }}
                        th {{ background-color: #f2f2f2; }}
                    </style>
                </head>
                <body>{html_table_string}</body>
                </html>
                
                
                # Write to a temporary HTML file
                temp_html_path = os.path.abspath("temp_table.html")
                with open(temp_html_path, "w", encoding="utf-8") as temp_f:
                    temp_f.write(html_for_render)

                try:
                    # Use Selenium to open the local file and screenshot the table
                    driver.get(f"file://{temp_html_path}")
                    table_element = driver.find_element(By.TAG_NAME, 'table')
                    
                    image_filename = f"table_{table_counter}.png"
                    image_save_path = os.path.join(image_dir, image_filename)
                    
                    # Take a screenshot of only the table element
                    table_element.screenshot(image_save_path)
                    
                    # Create the relative path for the markdown link
                    relative_image_path = os.path.join("images", image_filename).replace('\\', '/')
                    print(f"  - Converted a table to '{relative_image_path}' in {output_dir}")
                    
                    # Return the Markdown image link to replace the table text
                    return f"![]({relative_image_path})"
                
                except Exception as e:
                    print(f"  - Error converting table to image: {e}")
                    return html_table_string # On error, return the original table text

            # Use re.subn to find all tables and replace them
            new_content, num_replacements = table_pattern.subn(replace_table_with_image, content)
            
            if num_replacements > 0:
                # Overwrite the final.md file with the new content
                with open(final_md_path, "w", encoding="utf-8") as f:
                    f.write(new_content)
                print(f"Processed {final_md_path}, replaced {num_replacements} table(s).\n")

# Clean up the temporary file after the loop is done
if os.path.exists("temp_table.html"):
    os.remove("temp_table.html")

print("All HTML table conversions are complete!")
"""

In [None]:
# quit() ends the whole WebDriver session (all windows plus the chromedriver
# process); close() only closes the current window and leaves the rest running.
driver.quit()