```
conda install conda-forge::poppler  
conda install conda-forge::tesseract  

On Mac:  
`brew install tesseract`  
`brew install poppler`
```

In [1]:
import os
import re
from collections import defaultdict
import difflib
import numpy as np
import pymupdf
from unstructured.partition.pdf import partition_pdf
# from unstructured_inference.models.tables import cells_to_html


# Record the directory the kernel is running in; the relative paths used by the
# later cells ("../data/...", "../output/...") are resolved against it.
current_working_directory = os.getcwd()
# Show it so a reader can confirm the notebook was started from the expected folder
print(current_working_directory)

  from .autonotebook import tqdm as notebook_tqdm


/Users/hudanyunsheng/Documents/GitHub/JP_FAQ/notebooks


In [2]:
def compare_strings_similarity(str1, str2):
    """Return the difflib similarity ratio of two sequences.

    Args:
        str1: first sequence (a string in this notebook).
        str2: second sequence.

    Returns:
        float in [0.0, 1.0]; 1.0 means the sequences are identical.
    """
    # No junk filter is installed (isjunk is left at None), as in a positional call.
    matcher = difflib.SequenceMatcher(a=str1, b=str2)
    return matcher.ratio()


# Function to convert list of lists to HTML table
def list_to_html_table(data):
    """Render a list of rows as an HTML ``<table border="1">`` string.

    Args:
        data: iterable of rows; each row is an iterable of cells. A cell may be
            a string, ``None`` (rendered as an empty cell) or any other value
            (rendered through ``str``).

    Returns:
        The table markup as one string. Newlines inside a cell become ``<br>``.
        ``&``, ``<`` and ``>`` in cell text are HTML-escaped so that extracted
        text cannot break the markup.
    """
    # Imported locally under a distinct name so this cell does not depend on (or
    # clash with) any variable called `html` elsewhere in the notebook.
    from html import escape

    lines = ['<table border="1">']
    for row in data:
        lines.append('  <tr>')
        for column in row:
            text = '' if column is None else escape(str(column), quote=False)
            # Escape first, then insert the real <br> tags.
            text = text.replace('\n', '<br>')
            lines.append(f'    <td>{text}</td>')
        lines.append('  </tr>')
    lines.append('</table>')
    return '\n'.join(lines)


def create_markdown_table(data):
    """Render a list of rows as a Markdown (pipe) table.

    The first row is used as the header row.

    Args:
        data: list of rows; each row is a list of cells. A cell may be a string,
            ``None`` (rendered as an empty cell) or any other value (rendered
            through ``str``).

    Returns:
        The Markdown table, one line per row, each line terminated by a newline.
        An empty ``data`` yields an empty string. Newlines inside a cell become
        ``<br>`` and literal ``|`` characters are backslash-escaped so they do
        not split the cell.
    """
    if not data:
        return ''

    def _format_cell(cell):
        # Normalise one cell to text that is safe inside a pipe-table row.
        text = '' if cell is None else str(cell)
        return text.replace('|', '\\|').replace('\n', '<br>')

    def _format_row(cells):
        # Join already-formatted cells into one table line.
        return '| ' + ' | '.join(cells) + ' |\n'

    header = [_format_cell(cell) for cell in data[0]]
    parts = [
        _format_row(header),
        # The separator row has one '---' per header column.
        _format_row(['---'] * len(header)),
    ]
    parts.extend(_format_row([_format_cell(cell) for cell in row]) for row in data[1:])
    return ''.join(parts)

In [52]:
# Input PDF (switch the commented line to process the IV label instead).
# path_pdf = "../data/JP Label/stelara iv Japanese PI.pdf"
path_pdf = "../data/JP Label/stelara sc Japanese PI.pdf"

# File name without its directory and extension; used to name the output files.
filename = os.path.splitext(os.path.basename(path_pdf))[0]

# Folder that receives the table/image crops written by the partitioning step.
dir_partitions = "../output/JP Label images partition/"
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(dir_partitions, exist_ok=True)

#### Parse texts
This method parses everything as text; it can also report the page number and physical location (bounding box) of special elements such as tables. However, the table parsing is not 100% reliable.

In [53]:
# Open the PDF once; `doc` stays open because the table-detection cell iterates it again.
doc = pymupdf.open(path_pdf)
# One plain-text string per page, in page order.
texts = [page.get_text() for page in doc]

In [54]:
# Detect ruled tables on every page with PyMuPDF.
# tbs:    "page N" -> list of tables, each as returned by tb.extract()
# bboxes: "page N" -> list of table bounding boxes (x0, y0, x1, y1)
tbs = {}
bboxes = {}
n = 1  # 1-based page counter used in the dictionary keys
for page in doc:
    page_key = f"page {n}"
    print(f"Page {n}:")
    page_tables = [
        tb
        for tb in page.find_tables(
            vertical_strategy="lines_strict",
            horizontal_strategy="lines_strict",
            snap_x_tolerance=1,
        )
    ]
    tbs[page_key] = [tb.extract() for tb in page_tables]
    bboxes[page_key] = [tb.bbox for tb in page_tables]
    n_tb = len(page_tables)
    n += 1
    print(f"{n_tb} tables")

Page 1:
4 tables
Page 2:
1 tables
Page 3:
2 tables
Page 4:
2 tables


In [108]:
# Split the document text into sections keyed by their numbered heading.
# A heading is a line that starts with digits, a dot, whitespace and some text;
# every following line (stripped) is collected under the most recent heading.
text = "\n".join(texts)
lines = text.splitlines()
section_pattern = re.compile(r'^(\d+\.\s+.+)')

result = {}
current_key = ''
for line in lines:
    section_match = section_pattern.match(line)
    if section_match is not None:
        # A new section starts: register it with an empty list of lines.
        current_key = section_match.group(1).strip()
        result[current_key] = []
        continue
    # Lines seen before the first heading have no section and are dropped.
    if current_key in result:
        result[current_key].append(line.strip())

In [109]:
# def match_table_to_paragraph(tables, paragraphs):
#     matched_tables = {}
    
#     # Flatten paragraphs to single string for easier comparison
#     paragraph_text = ' '.join(paragraphs)
    
#     for i, table in enumerate(tables):
#         table_text = ' '.join([' '.join([cell if cell is not None else '' for cell in row]) for row in table])
        
#         # Check for matching keywords or phrases
#         if any(keyword in paragraph_text for keyword in table_text.split()):
#             matched_tables[i] = table
    
#     return matched_tables

In [110]:
def match_table_to_paragraph(tables, paragraphs):
    """Find the tables whose text is fully contained in a section's paragraphs.

    A table matches when every non-empty cell (with newlines removed) occurs
    inside at least one single paragraph string. Cells whose text is split
    across several paragraph strings therefore do not match.

    Args:
        tables: list of tables; each table is a list of rows, each row a list of
            cells (``str``, ``''`` or ``None``).
        paragraphs: list of paragraph strings. A single string is accepted and
            treated as one paragraph.

    Returns:
        ``(matched_tables, remaining_paragraph_text)`` where ``matched_tables``
        is a list of ``(table_idx, table)`` tuples and
        ``remaining_paragraph_text`` is the paragraphs joined with single spaces
        in which every occurrence of a matched cell is replaced by
        ``{Table_<table_idx>}``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(paragraphs, str):
        paragraphs = [paragraphs]
    paragraphs = list(paragraphs)

    matched_tables = []
    placeholders = {}  # cell text -> placeholder of the first matched table containing it

    for table_idx, table in enumerate(tables):
        stripped_cells = (cell.replace("\n", "") for row in table for cell in row if cell)
        # Drop cells that consisted only of newlines: '' is contained in every string.
        table_texts = [cell for cell in stripped_cells if cell]

        # all() over an empty list is True, so a table without any text would
        # otherwise "match" every section.
        if not table_texts:
            continue

        if all(any(cell in paragraph for paragraph in paragraphs) for cell in table_texts):
            matched_tables.append((table_idx, table))
            for cell in table_texts:
                placeholders.setdefault(cell, f"{{Table_{table_idx}}}")

    remaining_paragraph_text = ' '.join(paragraphs)
    if placeholders:
        # One regex pass over the original text: chained str.replace calls could
        # rewrite characters of placeholders inserted earlier (e.g. a cell "0"
        # hitting "{Table_0}"). Longest cells first so a short cell cannot
        # pre-empt a longer one that starts at the same position.
        ordered_cells = sorted(placeholders, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(cell) for cell in ordered_cells))
        remaining_paragraph_text = pattern.sub(
            lambda match: placeholders[match.group(0)], remaining_paragraph_text
        )

    return matched_tables, remaining_paragraph_text


In [111]:
# For every page, try to attach that page's text-layer tables to each section in
# `result`. matched_tables: page key -> {section header -> match result}.
matched_tables = dict()

for pg, _tbs in tbs.items():
    matched_tables[pg] = dict()
    for header, paragraph in result.items():
        matched_table, remaining_paragraph_text = match_table_to_paragraph(_tbs, paragraph)
        if len(matched_table) > 0:
            # Stops the cell if more than one table of this page matched the same section.
            assert len(matched_table) == 1
            matched_tables[pg][header] = matched_table
            
            # update
            # NOTE(review): this overwrites the section's content with the second
            # return value of match_table_to_paragraph. That function is defined
            # twice in this notebook (one version returns a joined str, the other a
            # list), so the value type depends on which definition is live, and the
            # overwritten value is passed back in as `paragraph` for later pages.
            # Confirm this is intended.
            result[header] = remaining_paragraph_text

In [114]:
tbs['page 2'][0]

[['', '5%以上', '1～5%未満', '1%未満', '頻度不明'],
 ['感染症及び\n寄生虫症', '鼻咽頭炎', '上気道感染', '外陰腟真菌\n感染、副鼻\n腔炎、帯状\n疱疹、歯肉\n炎', ''],
 ['精神障害', '', '', 'うつ病', ''],
 ['神経系障害', '', '頭痛、浮動\n性めまい', '', ''],
 ['呼吸器、胸\n郭及び縦隔\n障害', '', '咽喉頭疼痛', '鼻閉', '好酸球性肺\n炎'],
 ['胃腸障害', '', '悪心、嘔吐', '下痢', ''],
 ['皮膚及び皮\n下組織障害', '', '発疹、そう\n痒症', 'ざ瘡、蕁麻\n疹、過敏性\n血管炎', '膿疱性乾\n癬、乾癬性\n紅皮症'],
 ['筋骨格系及\nび結合組織\n障害', '', '関節痛', '筋痛、背部\n痛', ''],
 ['全身障害及\nび投与局所\n様態', '', '注射部位反\n応、疲労', '無力症', '']]

In [121]:
# Debug check for one table: take the first table of page 2, strip the newlines
# inside its cells, and test whether every cell occurs inside a single line of
# the '11. 副作用' section.
table = tbs['page 2'][0]
table_texts = [cell.replace("\n", "") for row in table for cell in row if cell]
paragraphs = result['11. 副作用']

# True only if each cell text is a substring of at least one paragraph entry
all(any(cell in paragraph for paragraph in paragraphs) for cell in table_texts)

False

In [123]:
table_texts

['5%以上',
 '1～5%未満',
 '1%未満',
 '頻度不明',
 '感染症及び寄生虫症',
 '鼻咽頭炎',
 '上気道感染',
 '外陰腟真菌感染、副鼻腔炎、帯状疱疹、歯肉炎',
 '精神障害',
 'うつ病',
 '神経系障害',
 '頭痛、浮動性めまい',
 '呼吸器、胸郭及び縦隔障害',
 '咽喉頭疼痛',
 '鼻閉',
 '好酸球性肺炎',
 '胃腸障害',
 '悪心、嘔吐',
 '下痢',
 '皮膚及び皮下組織障害',
 '発疹、そう痒症',
 'ざ瘡、蕁麻疹、過敏性血管炎',
 '膿疱性乾癬、乾癬性紅皮症',
 '筋骨格系及び結合組織障害',
 '関節痛',
 '筋痛、背部痛',
 '全身障害及び投与局所様態',
 '注射部位反応、疲労',
 '無力症']

In [128]:
def flatten_table(table):
    """Return the non-empty cells of ``table`` in row-major order.

    ``None`` and ``''`` cells are dropped; cell text is returned unchanged
    (newlines inside a cell are kept).
    """
    return [cell for row in table for cell in row if cell]


def match_table_to_paragraph(tables, paragraphs):
    """Find the tables whose cells all occur in a section's paragraphs.

    A table matches when every non-empty cell occurs inside at least one single
    paragraph string. Cells whose text is split across several paragraph strings
    therefore do not match.

    Args:
        tables: list of tables; each table is a list of rows, each row a list of
            cells (``str``, ``''`` or ``None``).
        paragraphs: list of paragraph strings. A single string is accepted and
            treated as one paragraph. The input list is not modified.

    Returns:
        ``(matched_tables, remaining_paragraphs)`` where ``matched_tables`` maps
        ``table_idx`` to the matched table and ``remaining_paragraphs`` is a new
        list of the paragraphs in which every occurrence of a matched cell is
        replaced by ``{Table_<table_idx>}``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(paragraphs, str):
        paragraphs = [paragraphs]
    paragraphs = list(paragraphs)

    matched_tables = {}
    placeholders = {}  # cell text -> placeholder of the first matched table containing it

    for table_idx, table in enumerate(tables):
        table_texts = flatten_table(table)

        # A table without any text would otherwise match every section, because
        # there is no cell that could fail the containment check.
        if not table_texts:
            continue

        if all(any(cell in para for para in paragraphs) for cell in table_texts):
            matched_tables[table_idx] = table
            for cell in table_texts:
                placeholders.setdefault(cell, f"{{Table_{table_idx}}}")

    if not placeholders:
        return matched_tables, paragraphs

    # One regex pass per paragraph: chained str.replace calls could rewrite
    # characters of placeholders inserted earlier (e.g. a cell "0" hitting
    # "{Table_0}"). Longest cells first so a short cell cannot pre-empt a longer
    # one that starts at the same position.
    ordered_cells = sorted(placeholders, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(cell) for cell in ordered_cells))
    remaining_paragraphs = [
        pattern.sub(lambda match: placeholders[match.group(0)], para)
        for para in paragraphs
    ]
    return matched_tables, remaining_paragraphs

# Sample data
# One hand-written table whose rows hold cell texts that contain no newline
# characters; used to exercise match_table_to_paragraph below.
tables = [
    [['5%以上', '1～5%未満', '1%未満', '頻度不明', '感染症及び寄生虫症', '鼻咽頭炎', '上気道感染', '外陰腟真菌感染、副鼻腔炎、帯状疱疹、歯肉炎'],
     ['精神障害', 'うつ病', '神経系障害', '頭痛、浮動性めまい', '呼吸器、胸郭及び縦隔障害', '咽喉頭疼痛', '鼻閉', '好酸球性肺炎'],
     ['胃腸障害', '悪心、嘔吐', '下痢', '皮膚及び皮下組織障害', '発疹、そう痒症', 'ざ瘡、蕁麻疹、過敏性血管炎'],
     ['膿疱性乾癬、乾癬性紅皮症', '筋骨格系及び結合組織障害', '関節痛', '筋痛、背部痛', '全身障害及び投与局所様態', '注射部位反応、疲労', '無力症']]
]

# Sample section content, one list entry per extracted text line. Several of the
# sample table's cell texts are spread over consecutive entries here
# (e.g. '感染症及び' followed by '寄生虫症').
paragraphs = [
    '次の副作用があらわれることがあるので、観察を十分に',
    '行い、異常が認められた場合には投与を中止するなど適',
    '切な処置を行うこと。',
    '11.1 重大な副作用',
    '11.1.1 アナフィラキシー（頻度不明）',
    '発疹、蕁麻疹、血管浮腫等があらわれることがある。',
    '11.1.2 重篤な感染症（1～5%未満）',
    'ウイルス、細菌あるいは真菌による重篤な感染症（蜂巣',
    '炎、憩室炎、骨髄炎、胃腸炎、肺炎及び尿路感染等）が',
    'あらわれることがある。重篤な感染症が発現した場合に',
    'は、感染が回復するまで本剤の投与をしないこと。',
    '［1.1、',
    '1.2、2.1、8.1、9.1.1参照］',
    '11.1.3 結核（頻度不明）',
    '結核が発現又は再活性化する可能性がある。',
    '［1.1、1.3、',
    '2.2、8.2、9.1.2参照］',
    '11.1.4 間質性肺炎（頻度不明）',
    '咳嗽、呼吸困難、発熱、肺音の異常（捻髪音）等が認め',
    'られた場合には、速やかに胸部X線、胸部CT、血清マー',
    'カー等の検査を実施すること。間質性肺炎が疑われた場',
    '合には投与を中止し、副腎皮質ホルモン剤の投与等の適',
    '切な処置を行うこと。',
    '11.2 その他の副作用',
    '5%以上',
    '1～5%未満',
    '1%未満',
    '頻度不明',
    '感染症及び',
    '寄生虫症',
    '鼻咽頭炎',
    '上気道感染',
    '外陰腟真菌',
    '感染、副鼻',
    '腔炎、帯状',
    '疱疹、歯肉',
    '炎',
    '精神障害',
    'うつ病',
    '神経系障害',
    '頭痛、浮動',
    '性めまい',
    '呼吸器、胸',
    '郭及び縦隔',
    '障害',
    '咽喉頭疼痛',
    '鼻閉',
    '好酸球性肺',
    '炎',
    '胃腸障害',
    '悪心、嘔吐',
    '下痢',
    '皮膚及び皮',
    '下組織障害',
    '発疹、そう',
    '痒症',
    'ざ瘡、蕁麻',
    '疹、過敏性',
    '血管炎',
    '膿疱性乾',
    '癬、乾癬性',
    '紅皮症',
    '筋骨格系及',
    'び結合組織',
    '障害',
    '関節痛',
    '筋痛、背部',
    '痛',
    '全身障害及',
    'び投与局所',
    '様態',
    '注射部位反',
    '応、疲労',
    '無力症'
]

# Run the matcher on the sample data.
matched_tables, updated_paragraphs = match_table_to_paragraph(tables, paragraphs)

# Report each matched table: a caption line, one line per row, then two blank lines.
for idx, table in matched_tables.items():
    print(f"Table {idx} matches the paragraph:", *table, "\n", sep="\n")

# # Print the updated paragraphs with placeholders
# print("Updated Paragraph with Placeholders:")
# for para in updated_paragraphs:
#     print(para)


In [102]:
header

'3. 組成・性状'

In [100]:
matched_tables

{3: [['色・性状', '無色～淡黄色の澄明又はわずかに混濁した液'],
  ['pH', '5.7～6.3'],
  ['浸透圧比', '約1（生理食塩液に対する比）']]}

#### Parse tables (and / or images)

In [57]:
# Parse tables and images from the PDF with unstructured's layout model; crops of
# the detected blocks are saved under `dir_partitions + filename`.
raw_pdf_elements = partition_pdf(
    filename=path_pdf,                                     # mandatory
    strategy="hi_res",                                     # mandatory to use ``hi_res`` strategy
    extract_images_in_pdf=True,                            # mandatory to set as ``True``
    extract_image_block_types=["Image", "Table"],          # optional
    extract_image_block_to_payload=False,                  # optional: write crops to disk, not into the payload
    extract_image_block_output_dir=dir_partitions + filename,  # optional
    chunking_strategy="by_title",
    # These must be real booleans: the string "False" is non-empty and therefore
    # truthy, which would switch both options ON.
    multipage_sections=False,
    include_page_breaks=False,
)

print(f"In total {len(raw_pdf_elements)} elements")
ele_types = {ele.category for ele in raw_pdf_elements}
print(f"Categories of all elements are: {ele_types}")

# Group the elements by category name, e.g. 'Table' or 'CompositeElement'.
ele_groups = {key: [] for key in ele_types}
for ele in raw_pdf_elements:
    ele_groups[ele.category].append(ele)
# .get() keeps the summary line working when no table was detected at all.
print(f"{len(ele_groups.get('Table', []))} tables")

In total 42 elements
Categories of all elements are: {'CompositeElement', 'Table'}
7 tables


In [75]:
info_tables_page['page 1'][0]['text']

'販売名 ステラーラ皮下注45mgシリンジ 有効成分 （1シリンジ0.5mL中） ウステキヌマブ（遺伝子組換え）45mg含有 添加剤 精製白糖38mg、L-ヒスチジン0.5mg、ポリソ ルベート80 0.02mg'

In [58]:
# Convert the partitioned elements to plain dicts.
info_tables = [element.to_dict() for element in ele_groups['Table']]
# NOTE(review): despite the name, these are the 'CompositeElement' entries of the
# partition result, not necessarily images - confirm the intended category.
info_images = [element.to_dict() for element in ele_groups['CompositeElement']]

# parsed tables (and / or images) for each page, keyed "page N"
info_tables_page = defaultdict(list)
for info_table in info_tables:
    pg = info_table['metadata']['page_number']
    info_tables_page[f"page {pg}"].append(info_table)

info_images_page = defaultdict(list)
for info_image in info_images:
    # Use the element's own page number. Reading it from `info_table` (the
    # leftover variable of the loop above) would file every element under the
    # last table's page.
    pg = info_image['metadata']['page_number']
    info_images_page[f"page {pg}"].append(info_image)

#### Compare string similarity between two sets of extracted tables to create a mapping

In [59]:
# scores[page][i][j] = similarity between the i-th layout-model table on the page
# (its 'text' field) and the j-th text-layer table (cells joined by spaces).
scores = dict()
for pg, tbs_p in tbs.items():
    info_tbs_p = info_tables_page[pg]

    # Flatten each text-layer table once instead of once per comparison.
    tbs_p_texts = [
        ' '.join(item for sublist in tb_p for item in sublist if item is not None)
        for tb_p in tbs_p
    ]

    # one-to-one comparison
    scores[pg] = {
        i: [compare_strings_similarity(info_tb['text'], tb_text) for tb_text in tbs_p_texts]
        for i, info_tb in enumerate(info_tbs_p)
    }
scores

{'page 1': {0: [0.08108108108108109,
   0.05235602094240838,
   0.9809523809523809,
   0.07453416149068323],
  1: [0.08080808080808081, 0.07042253521126761, 0.07453416149068323, 1.0]},
 'page 2': {0: [0.8560460652591171]},
 'page 3': {0: [0.4217391304347826, 0.2057877813504823],
  1: [0.026578073089700997, 0.9605263157894737]},
 'page 4': {0: [0.8247422680412371, 0.582010582010582],
  1: [0.6666666666666666, 0.9519230769230769]}}

In [64]:
# Similarity threshold. NOTE: it is currently not applied - every layout-model
# table is mapped to its best-scoring text-layer table regardless of the score.
thres = 0.6
# For each page: index of the layout-model (image) table -> index of the
# best-matching text-layer table in tbs[page].
mappings = dict()
for pg, score in scores.items():
    info_tbs_p = info_tables_page[pg]
    tbs_p = tbs[pg]
    mappings[pg] = dict()
    for i in range(len(info_tbs_p)):
        candidates = score[i]
        if len(candidates) == 0:
            # No text-layer table on this page: np.argmax would raise on an
            # empty sequence, so leave this table unmapped.
            continue
        # int() stores a plain Python int rather than a NumPy scalar.
        mappings[pg][i] = int(np.argmax(candidates))
mappings

{'page 1': {0: 2, 1: 3},
 'page 2': {0: 0},
 'page 3': {0: 0, 1: 1},
 'page 4': {0: 0, 1: 1}}

In [65]:
# Collect the HTML fragments in a list and join once at the end.
# The charset declaration lets browsers decode the Japanese text correctly.
html_parts = ['<html><head><meta charset="utf-8"></head><body>']

for pg, mapping in mappings.items():
    for i, idx in mapping.items():
        html_table = list_to_html_table(tbs[pg][idx])
        # `pg` already reads "page N"; capitalise it instead of prefixing another
        # "Page", which would print "Page page N".
        html_parts.append(
            f"<p>{pg.capitalize()}, table(image) {i+1}, table {idx+1}</p>\n{html_table}\n"
        )

# Close the HTML tags
html_parts.append("</body></html>")
html_content = "".join(html_parts)

# Write with an explicit encoding: the platform default may be unable to encode
# the Japanese table text.
with open(f"tables_{filename}.html", "w", encoding="utf-8") as file:
    file.write(html_content)

print("HTML file created successfully!")

HTML file created successfully!


In [69]:
result['5. 効能又は効果に関連する注意']

['〈尋常性乾癬及び関節症性乾癬〉',
 '5.1 以下のいずれかを満たす尋常性乾癬又は関節症性乾癬',
 '患者に投与すること。',
 '［1.4参照］',
 '・紫外線療法を含む既存の全身療法（生物製剤を除く）',
 'で十分な効果が得られず、皮疹が体表面積の10%以上',
 'に及ぶ患者。',
 '・難治性の皮疹又は関節症状を有する患者。',
 '〈クローン病〉',
 '5.2 過去の治療において、栄養療法、他の薬物療法（5-ア',
 'ミノサリチル酸製剤、ステロイド、アザチオプリン等）',
 '等による適切な治療を行っても、疾患に起因する明らか',
 'な臨床症状が残る場合に投与すること。',
 '［1.4参照］',
 '〈潰瘍性大腸炎〉',
 '5.3 過去の治療において、他の薬物療法（ステロイド、ア',
 'ザチオプリン等）等による適切な治療を行っても、疾患',
 'に起因する明らかな臨床症状が残る場合に投与すること。',
 '［1.4参照］']

In [40]:
len(info_tables_page['page 1'])

3

In [None]:
# NOTE(review): `tbs_p` is not assigned in this cell. It is whatever an earlier
# loop left behind (`for pg, tbs_p in tbs.items()` / `tbs_p = tbs[pg]`), so the
# table rendered here depends on which cell ran last - confirm the intended page.

# Generate HTML tables
html_table = list_to_html_table(tbs_p[0])
# Print the HTML table
print(html_table)

# Generate markdown tables
md_table = create_markdown_table(tbs_p[0])
print(md_table)

In [123]:
# def parse_format_1(data):
#     # Extract the main sections
#     main_parts = data.split(' 投与量 ')
#     weight_part = main_parts[0].replace('患者体重 ', '')
#     dose_part = main_parts[1]

#     # Split the weight and dose parts
#     weights = re.split(r' (?=\d+kg)', weight_part)
#     doses = dose_part.split(' ')

#     # Combine into the structured format
#     structured_format = [['患者体重', '投与量']]
#     for weight, dose in zip(weights, doses):
#         structured_format.append([weight, dose])

#     return structured_format
    
# parse_format_1(info_tables_page[1][0]['text'])

In [124]:
# def convert_format_2_to_string(data):
#     # Combine header
#     header = f"{data[0][0]} {data[0][1]}"
    
#     # Combine the rest of the data
#     body = ' '.join([f"{item[0]} {item[1]}" for item in data[1:]])
    
#     # Combine header and body into one string
#     return f"{header} {body}"

# convert_format_2_to_string(tbs['page 1'][2])

In [164]:
md_table

'| 投与量 | プラセボ | 90mg<br>8週間隔投与 | 90mg<br>12週間隔投与 |\n| --- | --- | --- | --- |\n| Clinical remission rate | 35.9%<br>（47/131例） | 53.5%a）<br>（68/127例） | 48.8%b）<br>（63/129例） |\n'

| 投与量 | プラセボ | 90mg<br>8週間隔投与 | 90mg<br>12週間隔投与 |
| --- | --- | --- | --- |
| Clinical remission rate | 35.9%<br>（47/131例） | 53.5%a）<br>（68/127例） | 48.8%b）<br>（63/129例） |

----


| 投与量 | プラセボ | 90mg<br>8週間隔投与 | 90mg<br>12週間隔投与 |
| --- | --- | --- | --- |
| Clinical remission rate | 35.9%<br>（47/131例） | 53.5%a）<br>（68/127例） | 48.8%b）<br>（63/129例） |

In [31]:
import fitz  # PyMuPDF
from bs4 import BeautifulSoup

def extract_text_and_tables(pdf_path):
    """Convert a PDF into a flat HTML fragment, page by page.

    For every page the fragment contains an ``<h2>Page N</h2>`` heading, any
    ``<table>`` elements found in PyMuPDF's HTML rendering of the page, and one
    ``<p>`` per block returned by ``page.get_text("blocks")``.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The HTML fragment as a string (no ``<html>``/``<body>`` wrapper).
    """
    from html import escape  # local import keeps this cell self-contained

    parts = []
    # The context manager closes the document even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            parts.append(f"<h2>Page {page_number}</h2>\n")

            # Extract text as HTML
            page_html = page.get_text("html")
            soup = BeautifulSoup(page_html, 'html.parser')

            # Identify and extract tables
            # NOTE(review): verify that PyMuPDF's HTML output ever contains
            # <table> elements; if it does not, this loop never adds anything.
            for table in soup.find_all("table"):
                parts.append(str(table))

            # Extract other text content. block[4] is the block's text; escape it
            # so characters such as '<' or '&' cannot break the markup.
            for block in page.get_text("blocks"):
                parts.append(f"<p>{escape(block[4], quote=False)}</p>\n")

    return "".join(parts)

def save_html(content, output_path):
    """Write ``content`` to ``output_path`` as UTF-8 text, replacing any existing file."""
    target = open(output_path, mode='w', encoding='utf-8')
    try:
        target.write(content)
    finally:
        # Always release the file handle, also when the write fails.
        target.close()

# Driver: convert the PDF selected in `path_pdf` into a standalone HTML page.
# NOTE(review): `html_content` is also the name used by the earlier cell that
# writes tables_<filename>.html; this cell rebinds it.
pdf_path = path_pdf  # Replace with your PDF file path
output_path = "output.html"  # written relative to the current working directory

# Extract content from PDF and convert to HTML
html_content = extract_text_and_tables(pdf_path)

# Wrap in basic HTML structure
html_full_content = f"""
<!DOCTYPE html>
<html>
<head>
    <title>PDF to HTML</title>
</head>
<body>
    {html_content}
</body>
</html>
"""

# Save the HTML content to a file
save_html(html_full_content, output_path)

print(f"HTML file created: {output_path}")


HTML file created: output.html


In [32]:
import fitz  # PyMuPDF
import pdfplumber
from bs4 import BeautifulSoup

def extract_text_with_pymupdf(pdf_path):
    """Extract the text blocks of a PDF as a flat HTML fragment.

    For every page the fragment contains an ``<h2>Page N</h2>`` heading followed
    by one ``<p>`` per block returned by ``page.get_text("blocks")``.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The HTML fragment as a string (no ``<html>``/``<body>`` wrapper).
    """
    from html import escape  # local import keeps this cell self-contained

    parts = []
    # The context manager closes the document even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            parts.append(f"<h2>Page {page_number}</h2>\n")

            # The per-page HTML rendering and its BeautifulSoup parse were computed
            # but never used for the output, so they are no longer produced.

            # block[4] is the block's text; escape it so characters such as '<'
            # or '&' cannot break the markup.
            for block in page.get_text("blocks"):
                parts.append(f"<p>{escape(block[4], quote=False)}</p>\n")

    return "".join(parts)

def extract_tables_with_pdfplumber(pdf_path):
    """Extract the tables of a PDF with pdfplumber as an HTML fragment.

    For every page the fragment contains an ``<h2>Page N</h2>`` heading followed
    by one ``<table border='1'>`` per table returned by ``page.extract_tables()``.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The HTML fragment as a string (no ``<html>``/``<body>`` wrapper). Empty
        or ``None`` cells are rendered as empty ``<td>`` elements.
    """
    from html import escape  # local import keeps this cell self-contained

    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            parts.append(f"<h2>Page {page_num + 1}</h2>\n")
            for table in page.extract_tables():
                parts.append("<table border='1'>\n")
                for row in table:
                    parts.append("<tr>\n")
                    for cell in row:
                        # Escape the cell text so '<' or '&' cannot break the markup.
                        cell_content = escape(cell, quote=False) if cell else ""
                        parts.append(f"<td>{cell_content}</td>\n")
                    parts.append("</tr>\n")
                parts.append("</table>\n")
    return "".join(parts)

def save_html(content, output_path):
    """Store the given text at ``output_path``, encoded as UTF-8.

    An existing file at that path is overwritten.
    """
    with open(output_path, mode='w', encoding='utf-8') as handle:
        # A one-element batch: writes `content` exactly once, unchanged.
        handle.writelines((content,))

# Driver: build a second HTML page that holds the PyMuPDF text blocks followed by
# the pdfplumber tables of the PDF selected in `path_pdf`.
pdf_path =  path_pdf  # Replace with your PDF file path
output_path = "output2.html"  # written relative to the current working directory

# Extract text and tables
text_content = extract_text_with_pymupdf(pdf_path)
tables_content = extract_tables_with_pdfplumber(pdf_path)

# Combine content
html_full_content = f"""
<!DOCTYPE html>
<html>
<head>
    <title>PDF to HTML</title>
</head>
<body>
    {text_content}
    {tables_content}
</body>
</html>
"""

# Save the HTML content to a file
save_html(html_full_content, output_path)

print(f"HTML file created: {output_path}")


HTML file created: output2.html
