In [1]:
import pandas as pd
import os

# Location of the Excel workbook to inspect
file_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\test_keytruda.xlsx"

df = pd.read_excel(file_path)
print("✅ File loaded successfully.\n")

# Report the (rows, columns) shape first, then the list of column names
print(f"Shape: {df.shape}")
print("\nColumns:")
print(list(df.columns))

✅ File loaded successfully.

Shape: (256, 25)

Columns:
['ApplNo', 'ProductNo', 'Form', 'Strength', 'ReferenceDrug', 'DrugName', 'ActiveIngredient', 'ReferenceStandard', 'SubmissionClassCodeID', 'SubmissionType', 'SubmissionNo', 'SubmissionStatus', 'SubmissionStatusDate', 'SubmissionsPublicNotes', 'ReviewPriority', 'ApplType', 'ApplPublicNotes', 'SponsorName', 'ApplicationDocsID', 'ApplicationDocsTypeID', 'ApplicationDocsTitle', 'ApplicationDocsURL', 'ApplicationDocsDate', 'j_submissionActionTypeID', 'ActionTypes_LookupID']


In [2]:
import pandas as pd
import requests
import fitz  # PyMuPDF
import tempfile
import os

# --- PDF keyword analysis function ---
def analyze_fda_pdf_from_url(pdf_url, keywords=None, stop_section="APPROVAL & LABELING"):
    """Download a PDF and count keyword occurrences before a stop marker.

    The PDF is fetched with ``requests``, written to a temporary file, and its
    text extracted page by page with PyMuPDF. Only the text preceding the first
    (case-insensitive) occurrence of ``stop_section`` is searched; if the marker
    is absent the whole text is searched.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to download.
    keywords : list of str, optional
        Substrings to count (case-insensitively). Defaults to a built-in list.
    stop_section : str
        Marker after which text is ignored.

    Returns
    -------
    dict
        One entry per keyword (its count, 0 on failure), plus
        ``"pdf_opened_successfully"`` (bool) and, only on failure, ``"error"``
        (str). This function never raises: every exception is reported through
        the ``"error"`` entry.
    """
    if keywords is None:
        keywords = ["new indication", "modification", "treatment", "pmr", "regular approval", "accelerated"]
    result = {kw: 0 for kw in keywords}
    result["pdf_opened_successfully"] = False

    tmp_path = None
    try:
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Referer": "https://www.accessdata.fda.gov/"
        }
        response = requests.get(pdf_url, headers=headers, timeout=20)
        if response.status_code != 200:
            result["error"] = f"Failed to download (status code: {response.status_code})"
            return result

        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_path = tmp_file.name
            tmp_file.write(response.content)

        doc = fitz.open(tmp_path)
        try:
            full_text = "".join(page.get_text() for page in doc)
        finally:
            # Close even when text extraction fails so the file can be deleted.
            doc.close()

        stop_index = full_text.upper().find(stop_section.upper())
        pre_section_text = full_text[:stop_index] if stop_index != -1 else full_text

        lower_text = pre_section_text.lower()
        for kw in keywords:
            # The text is lowercased, so the keyword must be as well.
            result[kw] = lower_text.count(kw.lower())

        result["pdf_opened_successfully"] = True
    except Exception as e:
        # Deliberate best effort: the caller inspects "error" and retries.
        result["error"] = str(e)
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    return result

In [3]:
# --- Initialize result columns ---
keywords = ["new indication", "modification", "treatment", "pmr", "regular approval", "accelerated"]
keyword_cols = keywords + ["pdf_opened_successfully", "error"]

for col in keyword_cols:
    df[col] = None

# --- Loop through each row and apply logic conditionally ---
# Only rows with ActionTypes_LookupID == 8 and ApplicationDocsTypeID == 1 that
# also carry a URL are analysed; all other rows keep None in the new columns.
for idx, row in df.iterrows():
    if row.get("ActionTypes_LookupID") == 8 and row.get("ApplicationDocsTypeID") == 1:
        url = row.get("ApplicationDocsURL")
        if pd.notnull(url):
            # Pass the same keyword list used to create the columns so the two
            # cannot drift apart if the list above is edited.
            result = analyze_fda_pdf_from_url(url, keywords=keywords)
            for key, val in result.items():
                df.at[idx, key] = val

In [4]:
MAX_RETRIES = 5

for attempt in range(1, MAX_RETRIES + 1):
    print(f"\n🔁 Retry Attempt {attempt}")

    # Rows that were attempted and failed (unattempted rows hold None, which
    # does not compare equal to False) AND have a non-null URL
    failed_rows = df[
        (df["pdf_opened_successfully"] == False) &
        (df["ApplicationDocsURL"].notnull())
    ]

    if failed_rows.empty:
        print("✅ All PDFs processed successfully.")
        break

    print(f"  → {len(failed_rows)} rows to retry...")

    for idx, row in failed_rows.iterrows():
        url = row["ApplicationDocsURL"]
        result = analyze_fda_pdf_from_url(url, keywords=keywords)
        # A successful result carries no "error" key, so clear the message left
        # by the previous failed attempt before writing the new values.
        df.at[idx, "error"] = None
        for key, val in result.items():
            df.at[idx, key] = val

print("\n✅ Retry loop completed.")



🔁 Retry Attempt 1
✅ All PDFs processed successfully.

✅ Retry loop completed.


In [5]:
# --- Save the full DataFrame with new columns ---
# Every row of df is written, including rows that were never analysed.
output_file = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\test_keytruda_result.xlsx"
df.to_excel(output_file, index=False)

print(f"✅ All rows preserved. Results saved to: {output_file}")

✅ All rows preserved. Results saved to: F:\PhD\RA\Schafer\IRA\data\unzipped\merge\test_keytruda_result.xlsx


# Bullet

In [24]:
import pandas as pd
# File paths
input_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_modified.xlsx"
output_path_all = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_non_missing.xlsx"

# Load Excel file
df = pd.read_excel(input_path)

# Keep rows whose ApplicationDocsURL is neither missing nor blank/whitespace-only
df_non_missing = df[df["ApplicationDocsURL"].notna() & (df["ApplicationDocsURL"].astype(str).str.strip() != "")]

# Save all records that have a usable ApplicationDocsURL
df_non_missing.to_excel(output_path_all, index=False)

In [1]:
import pandas as pd
# File paths
input_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_non_missing.xlsx"
output_path_1000 = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_non_missing_top1000.xlsx"

# Load Excel file
df = pd.read_excel(input_path)
# Save only the first 20000 rows
# NOTE(review): the variable and file name say "top1000" but 20000 rows are exported — confirm which is intended.
df.head(20000).to_excel(output_path_1000, index=False)

print("Export complete!")

Export complete!


In [4]:
import pandas as pd
import pdfplumber
import requests
import tempfile
import re
from tqdm import tqdm
import time

# Characters treated as top-level bullet markers by split_by_main_bullet, which
# handles "." separately from the others.
# NOTE(review): the literal "·" looks like the same code point as "\u00b7" — confirm; if so one entry is redundant.
main_bullets = ["•", "\uf0b7", "·", "\u00b7", "."]

def is_two_column(page, center_width_fraction=0.02, min_text_length=26):
    """Heuristically decide whether ``page`` has a two-column layout.

    A vertical strip centred on the page, ``center_width_fraction`` of the page
    width wide and spanning the full height, is cropped and its text extracted.
    The page is reported as two-column when that strip holds at most
    ``min_text_length`` characters (despite its name, this is an upper bound).

    Returns
    -------
    bool
    """
    half_strip = center_width_fraction / 2
    strip_bbox = (
        page.width * (0.5 - half_strip),
        0,
        page.width * (0.5 + half_strip),
        page.height,
    )
    strip_text = page.within_bbox(strip_bbox).extract_text() or ""
    return len(strip_text) <= min_text_length

def extract_all_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages that ``is_two_column`` flags are read as left half then right half,
    separated by a newline; other pages are read whole. Each page's text is
    followed by a newline, and a page with no extractable text contributes an
    empty string.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if not is_two_column(page):
                chunks.append((page.extract_text() or "") + "\n")
                continue
            mid_x = page.width / 2
            left_half = page.within_bbox((0, 0, mid_x, page.height)).extract_text() or ""
            right_half = page.within_bbox((mid_x, 0, page.width, page.height)).extract_text() or ""
            chunks.append(left_half + "\n" + right_half + "\n")
    return "".join(chunks)

def find_section_indices(lines, is_2col):
    """Locate candidate section-header lines.

    Every line is compared with its spaces removed and lowercased.

    * ``is_2col`` true: the pattern may occur anywhere in the line.
    * ``is_2col`` false: the pattern must start the line, optionally preceded
      by a number followed by ``.`` or ``)`` and by dashes.

    Returns
    -------
    tuple of three lists of int
        Line indices for the "indications and usage", "dosage and
        administration" and "contraindications" headers, in that order.
    """
    squashed = [raw.replace(" ", "").lower() for raw in lines]

    if is_2col:
        # LOOSER: anywhere in the line
        matcher = re.search
        ind_pat = r"indicat.{0,30}usag"
        dos_pat = r"dosag.{0,20}admin"
        contra_pat = r"contraindicat"
    else:
        # STRICTER: must be at line start (with optional dash/number)
        matcher = re.match
        ind_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*indicat.{0,30}usag"
        dos_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*dosag.{0,20}admin"
        contra_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*contraindicat"

    def hits(pattern):
        return [pos for pos, text in enumerate(squashed) if matcher(pattern, text)]

    return hits(ind_pat), hits(dos_pat), hits(contra_pat)

def get_section_lines_with_gap(ind_idx, dos_indices, lines, gaps=[5, 10, 15]):
    """Slice the lines between a header and the first sufficiently distant end marker.

    For each threshold in ``gaps`` (in order), the first entry of
    ``dos_indices`` lying more than that many lines after ``ind_idx`` is used as
    the exclusive end of the slice, which starts on the line after ``ind_idx``.

    Any index that passes a larger threshold also passes a smaller one, so with
    ascending ``gaps`` only the first threshold can ever be the one reported.

    Returns
    -------
    tuple
        ``(section_lines, gap_used)``, or ``([], None)`` when no entry of
        ``dos_indices`` satisfies any threshold.
    """
    for min_gap in gaps:
        stop = next((d for d in dos_indices if d - ind_idx > min_gap), None)
        if stop is not None:
            return lines[ind_idx + 1:stop], min_gap
    return [], None

def extract_section_loose(lines, is_2col):
    """Return the text of the "indications and usage" section, or "" if not found.

    Two-column mode: the section runs from the first indications header to the
    first dosage header accepted by ``get_section_lines_with_gap``; when no such
    dosage header exists, everything from the indications header onward is
    returned.

    One-column mode: the section runs from the first indications header up to
    (not including) the first contraindications header after it. With no
    contraindications header at all, everything from the indications header
    onward is returned. If contraindications headers exist but none follows the
    indications header, an empty string is returned.
    """
    ind_start, dos_start, contra_start = find_section_indices(lines, is_2col)
    if not ind_start:
        return ""
    first = ind_start[0]

    if is_2col:
        if dos_start:
            body, _gap = get_section_lines_with_gap(first, dos_start, lines)
            if body:
                return "\n".join([lines[first]] + body)
        return "\n".join(lines[first:])

    if not contra_start:
        return "\n".join(lines[first:])
    later_contra = [i for i in contra_start if i > first]
    if later_contra:
        return "\n".join(lines[first:later_contra[0]])
    return ""

def split_by_main_bullet(text):
    """Split ``text`` into chunks that each start at a top-level bullet.

    A bullet is one of the ``main_bullets`` characters (or a ``.``) at the
    start of the text or of a line, followed by whitespace. When no bullet is
    found the whole text is returned as a single chunk. Otherwise each chunk is
    stripped, and any text before the first bullet is not included.
    """
    bracketed = ''.join(re.escape(b) for b in main_bullets if b != ".")
    pattern = r'((?:^|\n)[{}]\s)'.format(bracketed) + r'|((?:^|\n)\.\s)'
    starts = [m.start() for m in re.finditer(pattern, text)]
    if not starts:
        return [text]
    ends = starts[1:] + [len(text)]
    return [text[a:b].strip() for a, b in zip(starts, ends)]

def is_sub_bullet_line(line):
    """Return True when ``line``, ignoring leading whitespace, starts with "o " or U+25CB."""
    return line.lstrip().startswith(("o ", "\u25cb"))

def count_sub_bullets(section):
    """Count the sub-bullet lines in ``section``; a section with none counts as 1."""
    hits = sum(1 for ln in section.splitlines() if is_sub_bullet_line(ln))
    return hits or 1

def should_exclude_section(section):
    """Return True when ``section`` mentions "Limitation(s) of Use".

    Matching is case-insensitive, accepts both the singular and the plural
    form, and tolerates any whitespace (including a line break) between the
    words. A plain substring test for the singular form misses the plural
    heading "Limitations of Use".
    """
    return re.search(r"limitations?\s+of\s+use", section, re.IGNORECASE) is not None

def count_indicat_and_treatment(section_text):
    """Count lines mentioning "indicated" and lines mentioning "treatm".

    Matching is case-insensitive and per line. The "indicated" count is reduced
    by one and floored at zero.

    Returns
    -------
    tuple of int
        ``(indicated_lines_minus_one, treatment_lines)``.
    """
    indicated_re = re.compile(r"indicated", re.IGNORECASE)
    treatm_re = re.compile(r"treatm", re.IGNORECASE)
    n_indicated = 0
    n_treatment = 0
    for row in section_text.splitlines():
        if indicated_re.search(row):
            n_indicated += 1
        if treatm_re.search(row):
            n_treatment += 1
    return max(0, n_indicated - 1), n_treatment

def find_max_numbered_bullet(lines):
    """Return the largest leading number among lines that start with "<digits>." or "<digits>)".

    Leading whitespace is ignored. Returns 0 when no line qualifies. Any line
    beginning with digits and a dot qualifies, e.g. "12.5 mg" yields 12.
    """
    leading_number = re.compile(r"^\s*(\d+)[\.\)]")
    found = (leading_number.match(row) for row in lines)
    return max((int(m.group(1)) for m in found if m), default=0)

def count_indications_keyword_and_number(section_text):
    """Estimate the indication count for a one-column section.

    The result is the largest of: the two keyword line counts from
    ``count_indicat_and_treatment`` and the highest numbered bullet found by
    ``find_max_numbered_bullet``.
    """
    keyword_hits = max(count_indicat_and_treatment(section_text))
    highest_number = find_max_numbered_bullet(section_text.splitlines())
    return max(keyword_hits, highest_number)

def get_indication_count(pdf_url):
    """Download a label PDF and estimate how many indications it lists.

    The layout of the first page (via ``is_two_column``) selects the counting
    strategy: bullet/sub-bullet counting for two-column documents, keyword and
    numbered-bullet counting otherwise.

    Returns
    -------
    tuple
        ``(count, True)`` on success, ``(None, False)`` if anything fails.
        Failures are deliberately swallowed so the caller can retry.
    """
    import os  # local import: this cell's import list does not include os

    tmp_path = None
    try:
        response = requests.get(pdf_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_path = tmp_file.name
            tmp_file.write(response.content)

        with pdfplumber.open(tmp_path) as pdf:
            is_2col = is_two_column(pdf.pages[0])

        lines = extract_all_text(tmp_path).splitlines()
        section = extract_section_loose(lines, is_2col=is_2col)
        if is_2col:
            count = sum(
                count_sub_bullets(s)
                for s in split_by_main_bullet(section)
                if s.strip() and not should_exclude_section(s)
            )
        else:
            count = count_indications_keyword_and_number(section)
        return count, True
    except Exception:
        return None, False
    finally:
        # delete=False leaves the file on disk; remove it so a batch run does
        # not accumulate one temporary PDF per processed row.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

def run_auto_pdf_extraction(
    excel_path,
    out_path="indication_results.xlsx",
    url_col="ApplicationDocsURL",
    max_retries=5
):
    """Run ``get_indication_count`` over the selected rows of a workbook and save the results.

    Rows are selected when ``ApplicationDocsTypeID == 2``,
    ``ActionTypes_LookupID < 13`` and ``url_col`` is not null. Rows whose PDF
    could not be processed are retried for up to ``max_retries`` passes in
    total. The selected rows, with added ``indication_count`` and
    ``pdf_opened`` columns, are written to ``out_path``.

    Parameters
    ----------
    excel_path : str
        Input workbook.
    out_path : str
        Output workbook.
    url_col : str
        Column holding the PDF URL; used for both filtering and downloading.
    max_retries : int
        Maximum number of passes over the still-failing rows.
    """
    df = pd.read_excel(excel_path)
    # NOTE(review): the meaning of the ID thresholds (2, 13) is not visible here — confirm against the lookup tables.
    mask = (
        (df["ApplicationDocsTypeID"] == 2) &
        (df["ActionTypes_LookupID"] < 13) &
        (df[url_col].notnull())  # honour url_col instead of a hard-coded column name
    )
    df_proc = df.loc[mask].copy()
    indication_counts = [None] * len(df_proc)
    open_successes = [False] * len(df_proc)
    # Positional indices into df_proc that still need a successful run
    failed_indices = list(range(len(df_proc)))

    for attempt in range(max_retries):
        print(f"\n--- Iteration {attempt+1} ---")
        still_failed = []
        for idx in tqdm(failed_indices):
            row = df_proc.iloc[idx]
            count, opened = get_indication_count(row[url_col])
            indication_counts[idx] = count
            open_successes[idx] = opened
            if not opened:
                still_failed.append(idx)
                # Brief pause after a failure before moving on
                time.sleep(1)
        failed_indices = still_failed
        print(f"Iteration {attempt+1}: Failed PDFs = {len(failed_indices)}")
        if not failed_indices:
            break

    df_proc["indication_count"] = indication_counts
    df_proc["pdf_opened"] = open_successes
    df_proc.to_excel(out_path, index=False)
    print(f"Results saved to {out_path}")
    if failed_indices:
        print(f"Still failed indices after {max_retries} iterations: {failed_indices}")

# ---- Usage ----
# Runs the full extraction; url_col and max_retries keep their defaults.
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/merged4_non_missing_top1000.xlsx",     # <-- your input file path
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/merged4_non_missing_top1000_improve.xlsx" # <-- your desired output file path
)


--- Iteration 1 ---


100%|██████████| 208/208 [14:27<00:00,  4.17s/it]


Iteration 1: Failed PDFs = 6

--- Iteration 2 ---


100%|██████████| 6/6 [01:22<00:00, 13.78s/it]


Iteration 2: Failed PDFs = 1

--- Iteration 3 ---


100%|██████████| 1/1 [00:03<00:00,  3.70s/it]


Iteration 3: Failed PDFs = 0
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/merged4_non_missing_top1000_improve.xlsx


# for checking

In [15]:
# Print every distinct character of section_text that is neither alphanumeric
# nor whitespace, together with its code point.
# NOTE(review): section_text is not assigned in any visible cell — confirm where it comes from.
unique_chars = set(section_text)
for c in unique_chars:
    if c.isalnum() or c.isspace():
        continue
    print(f"Symbol: {repr(c)}, Unicode: U+{ord(c):04X}")

Symbol: '-', Unicode: U+002D
Symbol: '[', Unicode: U+005B
Symbol: ',', Unicode: U+002C
Symbol: '(', Unicode: U+0028
Symbol: ':', Unicode: U+003A
Symbol: '%', Unicode: U+0025
Symbol: ']', Unicode: U+005D
Symbol: '.', Unicode: U+002E
Symbol: '•', Unicode: U+2022
Symbol: '≥', Unicode: U+2265
Symbol: '/', Unicode: U+002F
Symbol: ')', Unicode: U+0029


In [None]:
pdf_url = "http://www.accessdata.fda.gov/drugsatfda_docs/label/2006/103705s5230-s5231lbl.pdf"

# Define bullet characters
possible_bullets = ["•", "\uf0b7", "·", "\u00b7"]

# Download PDF to a temporary file
headers = {"User-Agent": "Mozilla/5.0", "Referer": "https://www.accessdata.fda.gov/"}
response = requests.get(pdf_url, headers=headers, timeout=30)
response.raise_for_status()
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
    tmp_file.write(response.content)
    tmp_path = tmp_file.name
all_text = ""
# Print the two-column verdict for each page; the temporary file is removed
# even if the PDF cannot be opened or a page fails to parse.
try:
    with pdfplumber.open(tmp_path) as pdf:
        for page in pdf.pages:
            print(is_two_column(page))
finally:
    os.remove(tmp_path)

In [9]:
import pdfplumber
import requests
import tempfile
import re

# Characters treated as top-level bullet markers by split_by_main_bullet, which
# handles "." separately from the others.
# NOTE(review): the literal "·" looks like the same code point as "\u00b7" — confirm; if so one entry is redundant.
main_bullets = ["•", "\uf0b7", "·", "\u00b7", "."]

def is_two_column(page, center_width_fraction=0.02, min_text_length=26):
    """Heuristically decide whether ``page`` has a two-column layout.

    A vertical strip centred on the page, ``center_width_fraction`` of the page
    width wide and spanning the full height, is cropped and its text extracted.
    The page is reported as two-column when that strip holds at most
    ``min_text_length`` characters (despite its name, this is an upper bound).

    Returns
    -------
    bool
    """
    half_strip = center_width_fraction / 2
    strip_bbox = (
        page.width * (0.5 - half_strip),
        0,
        page.width * (0.5 + half_strip),
        page.height,
    )
    strip_text = page.within_bbox(strip_bbox).extract_text() or ""
    return len(strip_text) <= min_text_length

def extract_all_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The two-column verdict of each page is printed. Pages flagged by
    ``is_two_column`` are read as left half then right half, separated by a
    newline; other pages are read whole. Each page's text is followed by a
    newline, and a page with no extractable text contributes an empty string.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            two_col = is_two_column(page)
            print(f"Page {page_num+1} is_two_column: {two_col}")
            if not two_col:
                chunks.append((page.extract_text() or "") + "\n")
                continue
            mid_x = page.width / 2
            left_half = page.within_bbox((0, 0, mid_x, page.height)).extract_text() or ""
            right_half = page.within_bbox((mid_x, 0, page.width, page.height)).extract_text() or ""
            chunks.append(left_half + "\n" + right_half + "\n")
    return "".join(chunks)

def find_loose_section_line(lines, pattern):
    """Return the indices of lines that match ``pattern`` once their spaces are removed.

    The search is case-insensitive and may match anywhere in the line.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    hits = []
    for pos, raw in enumerate(lines):
        if matcher.search(raw.replace(" ", "")):
            hits.append(pos)
    return hits

def find_loose_and_dashed_section_index(lines, pattern):
    """Return the index of the first line matching ``pattern`` that also contains "--".

    Candidate lines come from ``find_loose_section_line``. Returns None when no
    candidate contains a double dash.
    """
    candidates = find_loose_section_line(lines, pattern)
    return next((pos for pos in candidates if "--" in lines[pos]), None)

def find_next_dashed_section_index(lines, pattern, after_idx):
    """Return the index of the first line after ``after_idx`` that matches ``pattern`` and contains "--".

    The pattern is applied case-insensitively to the line with its spaces
    removed. Returns None when no such line exists.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    return next(
        (
            pos
            for pos in range(after_idx + 1, len(lines))
            if matcher.search(lines[pos].replace(" ", "")) and "--" in lines[pos]
        ),
        None,
    )

def find_section_indices(lines, is_2col):
    """Locate candidate section-header lines.

    Every line is compared with its spaces removed and lowercased.

    * ``is_2col`` true: the pattern may occur anywhere in the line.
    * ``is_2col`` false: the pattern must start the line, optionally preceded
      by a number followed by ``.`` or ``)`` and by dashes.

    Returns
    -------
    tuple of three lists of int
        Line indices for the "indications and usage", "dosage and
        administration" and "contraindications" headers, in that order.
    """
    squashed = [raw.replace(" ", "").lower() for raw in lines]

    if is_2col:
        # Loose match (anywhere in line)
        matcher = re.search
        ind_pat = r"indicat.{0,30}usag"
        dos_pat = r"dosag.{0,20}admin"
        contra_pat = r"contraindicat"
    else:
        # Strict match (line start, optional dash/number)
        matcher = re.match
        ind_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*indicat.{0,30}usag"
        dos_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*dosag.{0,20}admin"
        contra_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*contraindicat"

    def hits(pattern):
        return [pos for pos, text in enumerate(squashed) if matcher(pattern, text)]

    return hits(ind_pat), hits(dos_pat), hits(contra_pat)

def get_section_lines_with_gap(ind_idx, dos_indices, lines, gaps=[5, 10, 15]):
    """Slice the lines between a header and the first sufficiently distant end marker.

    For each threshold in ``gaps`` (in order), the first entry of
    ``dos_indices`` lying more than that many lines after ``ind_idx`` is used as
    the exclusive end of the slice, which starts on the line after ``ind_idx``.

    Any index that passes a larger threshold also passes a smaller one, so with
    ascending ``gaps`` only the first threshold can ever be the one reported.

    Returns
    -------
    tuple
        ``(section_lines, gap_used)``, or ``([], None)`` when no entry of
        ``dos_indices`` satisfies any threshold.
    """
    for min_gap in gaps:
        stop = next((d for d in dos_indices if d - ind_idx > min_gap), None)
        if stop is not None:
            return lines[ind_idx + 1:stop], min_gap
    return [], None

def extract_section_loose(lines, is_2col):
    """Return the text of the "indications and usage" section, or "" if not found.

    Two-column mode: only dashed headers are accepted. The section is the lines
    strictly between the first dashed indications header and the next dashed
    dosage header; if either is missing an error is printed and "" returned.

    One-column mode: the section runs from the first indications header up to
    (not including) the first contraindications header after it. With no
    contraindications header at all, everything from the indications header
    onward is returned. If contraindications headers exist but none follows the
    indications header, an empty string is returned without a message.
    """
    if is_2col:
        start_idx = find_loose_and_dashed_section_index(lines, r"indicat.{0,30}usag")
        if start_idx is None:
            print("[ERROR] No dashed 'INDICATIONS AND USAGE' header found.")
            return ""
        end_idx = find_next_dashed_section_index(lines, r"dosag.{0,20}admin", start_idx)
        if end_idx is None:
            print("[ERROR] No dashed 'DOSAGE AND ADMINISTRATION' header found after 'INDICATIONS AND USAGE'.")
            return ""
        return "\n".join(lines[start_idx + 1:end_idx])

    ind_start, _, contra_start = find_section_indices(lines, is_2col)
    if not ind_start:
        print("[ERROR] No 'INDICATIONS AND USAGE' header found in one-column section.")
        return ""
    first = ind_start[0]
    if not contra_start:
        return "\n".join(lines[first:])
    later_contra = [i for i in contra_start if i > first]
    if later_contra:
        return "\n".join(lines[first:later_contra[0]])
    return ""

def split_by_main_bullet(text):
    """Split ``text`` into chunks that each start at a top-level bullet.

    A bullet is one of the ``main_bullets`` characters (or a ``.``) at the
    start of the text or of a line, followed by whitespace. When no bullet is
    found the whole text is returned as a single chunk. Otherwise each chunk is
    stripped, and any text before the first bullet is not included.
    """
    bracketed = ''.join(re.escape(b) for b in main_bullets if b != ".")
    pattern = r'((?:^|\n)[{}]\s)'.format(bracketed) + r'|((?:^|\n)\.\s)'
    starts = [m.start() for m in re.finditer(pattern, text)]
    if not starts:
        return [text]
    ends = starts[1:] + [len(text)]
    return [text[a:b].strip() for a, b in zip(starts, ends)]

def is_sub_bullet_line(line):
    """Return True when ``line``, ignoring leading whitespace, starts with "o " or U+25CB."""
    return line.lstrip().startswith(("o ", "\u25cb"))

def count_sub_bullets(section):
    """Count the sub-bullet lines in ``section``; a section with none counts as 1."""
    hits = sum(1 for ln in section.splitlines() if is_sub_bullet_line(ln))
    return hits or 1

def should_exclude_section(section):
    """Return True when ``section`` mentions "Limitation(s) of Use".

    Matching is case-insensitive, accepts both the singular and the plural
    form, and tolerates any whitespace (including a line break) between the
    words. A plain substring test for the singular form misses the plural
    heading "Limitations of Use".
    """
    return re.search(r"limitations?\s+of\s+use", section, re.IGNORECASE) is not None

def count_indicat_and_treatment(section_text):
    """Count lines mentioning "indicated" and lines mentioning "treatm".

    Matching is case-insensitive and per line. The "indicated" count is reduced
    by one and floored at zero.

    Returns
    -------
    tuple of int
        ``(indicated_lines_minus_one, treatment_lines)``.
    """
    indicated_re = re.compile(r"indicated", re.IGNORECASE)
    treatm_re = re.compile(r"treatm", re.IGNORECASE)
    n_indicated = 0
    n_treatment = 0
    for row in section_text.splitlines():
        if indicated_re.search(row):
            n_indicated += 1
        if treatm_re.search(row):
            n_treatment += 1
    return max(0, n_indicated - 1), n_treatment

def find_max_numbered_bullet(lines):
    """Return the largest leading number among lines that start with "<digits>." or "<digits>)".

    Leading whitespace is ignored. Returns 0 when no line qualifies. Any line
    beginning with digits and a dot qualifies, e.g. "12.5 mg" yields 12.
    """
    leading_number = re.compile(r"^\s*(\d+)[\.\)]")
    found = (leading_number.match(row) for row in lines)
    return max((int(m.group(1)) for m in found if m), default=0)

def count_indications_keyword_and_number(section_text):
    """Estimate the indication count for a one-column section.

    The result is the largest of: the two keyword line counts from
    ``count_indicat_and_treatment`` and the highest numbered bullet found by
    ``find_max_numbered_bullet``.
    """
    keyword_hits = max(count_indicat_and_treatment(section_text))
    highest_number = find_max_numbered_bullet(section_text.splitlines())
    return max(keyword_hits, highest_number)

def test_pdf_indication_count(pdf_url):
    """Download one label PDF and print a diagnostic of the indication count.

    Prints a preview of the extracted section, its line count, and the count
    together with the method used. Prints an info message and returns early
    when no section could be extracted. Download and parsing errors propagate
    to the caller. Returns None.
    """
    import os  # local import: this cell's import list does not include os

    response = requests.get(pdf_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
    response.raise_for_status()

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_path = tmp_file.name
            tmp_file.write(response.content)

        with pdfplumber.open(tmp_path) as pdf:
            is_2col = is_two_column(pdf.pages[0])

        all_text = extract_all_text(tmp_path)
        lines = all_text.splitlines()
        section = extract_section_loose(lines, is_2col=is_2col)

        if not section.strip():
            print("[INFO] Section extraction returned empty. See error(s) above.")
            return

        if is_2col:
            count = sum(
                count_sub_bullets(s)
                for s in split_by_main_bullet(section)
                if s.strip() and not should_exclude_section(s)
            )
            method = "two-column (strict dashed header logic only)"
        else:
            count = count_indications_keyword_and_number(section)
            method = "one-column (max of keyword match or numbered bullet, strict header)"

        preview = "\n".join(section.splitlines()[:25])
        print(f"\n--- Section Preview (first 25 lines) ---\n{preview}")
        print(f"\n--- Total lines in section: {len(section.splitlines())}")
        print(f"\nIndication count: {count} (using {method})")
    finally:
        # delete=False leaves the file on disk; remove it on every exit path.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

# Example usage: prints the per-page layout verdicts and the extraction diagnostics for one label PDF
test_pdf_indication_count("http://www.accessdata.fda.gov/drugsatfda_docs/label/2020/008085Orig1s071lbl.pdf")


Page 1 is_two_column: True
Page 2 is_two_column: False
Page 3 is_two_column: False
Page 4 is_two_column: False
Page 5 is_two_column: False
Page 6 is_two_column: False
Page 7 is_two_column: False
Page 8 is_two_column: False
Page 9 is_two_column: False
Page 10 is_two_column: False
Page 11 is_two_column: False
Page 12 is_two_column: False
Page 13 is_two_column: False
Page 14 is_two_column: False
Page 15 is_two_column: False
Page 16 is_two_column: False
Page 17 is_two_column: False
Page 18 is_two_column: False
Page 19 is_two_column: False
Page 20 is_two_column: False
Page 21 is_two_column: False
Page 22 is_two_column: False
Page 23 is_two_column: False
Page 24 is_two_column: True
Page 25 is_two_column: False
[ERROR] No dashed 'INDICATIONS AND USAGE' header found.
[INFO] Section extraction returned empty. See error(s) above.


In [11]:
import pdfplumber
import requests
import tempfile
import re

# Characters treated as top-level bullet markers by split_by_main_bullet, which
# handles "." separately from the others.
# NOTE(review): the literal "·" looks like the same code point as "\u00b7" — confirm; if so one entry is redundant.
main_bullets = ["•", "\uf0b7", "·", "\u00b7", "."]

def is_two_column(page, center_width_fraction=0.02, min_text_length=26):
    """Heuristically decide whether ``page`` has a two-column layout.

    A vertical strip centred on the page, ``center_width_fraction`` of the page
    width wide and spanning the full height, is cropped and its text extracted.
    The page is reported as two-column when that strip holds at most
    ``min_text_length`` characters (despite its name, this is an upper bound).

    Returns
    -------
    bool
    """
    half_strip = center_width_fraction / 2
    strip_bbox = (
        page.width * (0.5 - half_strip),
        0,
        page.width * (0.5 + half_strip),
        page.height,
    )
    strip_text = page.within_bbox(strip_bbox).extract_text() or ""
    return len(strip_text) <= min_text_length

def extract_all_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The two-column verdict of each page is printed. Pages flagged by
    ``is_two_column`` are read as left half then right half, separated by a
    newline; other pages are read whole. Each page's text is followed by a
    newline, and a page with no extractable text contributes an empty string.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            two_col = is_two_column(page)
            print(f"Page {page_num+1} is_two_column: {two_col}")
            if not two_col:
                chunks.append((page.extract_text() or "") + "\n")
                continue
            mid_x = page.width / 2
            left_half = page.within_bbox((0, 0, mid_x, page.height)).extract_text() or ""
            right_half = page.within_bbox((mid_x, 0, page.width, page.height)).extract_text() or ""
            chunks.append(left_half + "\n" + right_half + "\n")
    return "".join(chunks)

def find_loose_and_dashed_section_index(lines, pattern):
    """Return the index of the first line that matches ``pattern`` and contains "--" or "__".

    The pattern is applied case-insensitively to the line with its spaces
    removed. Returns None when no line qualifies.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    return next(
        (
            pos
            for pos, raw in enumerate(lines)
            if matcher.search(raw.replace(" ", "")) and ("--" in raw or "__" in raw)
        ),
        None,
    )

def find_next_dashed_section_index(lines, pattern, after_idx):
    """Return the index of the first line after ``after_idx`` that matches ``pattern`` and contains "--" or "__".

    The pattern is applied case-insensitively to the line with its spaces
    removed. Returns None when no such line exists.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    return next(
        (
            pos
            for pos in range(after_idx + 1, len(lines))
            if matcher.search(lines[pos].replace(" ", ""))
            and ("--" in lines[pos] or "__" in lines[pos])
        ),
        None,
    )

def find_section_indices(lines, is_2col):
    """Locate candidate section-header lines.

    Every line is compared with its spaces removed and lowercased.

    * ``is_2col`` true: the pattern may occur anywhere in the line.
    * ``is_2col`` false: the pattern must start the line, optionally preceded
      by a number followed by ``.`` or ``)`` and by dashes.

    Returns
    -------
    tuple of three lists of int
        Line indices for the "indications and usage", "dosage and
        administration" and "contraindications" headers, in that order.
    """
    squashed = [raw.replace(" ", "").lower() for raw in lines]

    if is_2col:
        # Loose match (anywhere in line)
        matcher = re.search
        ind_pat = r"indicat.{0,30}usag"
        dos_pat = r"dosag.{0,20}admin"
        contra_pat = r"contraindicat"
    else:
        # Strict match (line start, optional dash/number)
        matcher = re.match
        ind_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*indicat.{0,30}usag"
        dos_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*dosag.{0,20}admin"
        contra_pat = r"^\s*(\d+[\.\)]\s*)?[-—–\s]*contraindicat"

    def hits(pattern):
        return [pos for pos, text in enumerate(squashed) if matcher(pattern, text)]

    return hits(ind_pat), hits(dos_pat), hits(contra_pat)

def get_section_lines_with_gap(ind_idx, dos_indices, lines, gaps=[5, 10, 15]):
    """Slice the lines between a header and the first sufficiently distant end marker.

    For each threshold in ``gaps`` (in order), the first entry of
    ``dos_indices`` lying more than that many lines after ``ind_idx`` is used as
    the exclusive end of the slice, which starts on the line after ``ind_idx``.

    Any index that passes a larger threshold also passes a smaller one, so with
    ascending ``gaps`` only the first threshold can ever be the one reported.

    Returns
    -------
    tuple
        ``(section_lines, gap_used)``, or ``([], None)`` when no entry of
        ``dos_indices`` satisfies any threshold.
    """
    for min_gap in gaps:
        stop = next((d for d in dos_indices if d - ind_idx > min_gap), None)
        if stop is not None:
            return lines[ind_idx + 1:stop], min_gap
    return [], None

def extract_section_loose(lines, is_2col):
    """Return the text of the "indications and usage" section, or "" if not found.

    Two-column mode, in order of preference:

    1. Lines strictly between the first dashed/underscored indications header
       and the next dashed/underscored dosage header.
    2. Otherwise, gap-based extraction from the first loosely matched
       indications header via ``get_section_lines_with_gap``; if that yields
       nothing, everything from the indications header onward.

    One-column mode: the section runs from the first indications header up to
    (not including) the first contraindications header after it. With no
    contraindications header at all, everything from the indications header
    onward is returned. If contraindications headers exist but none follows the
    indications header, an empty string is returned without a message.
    """
    if is_2col:
        # 1. Try dashed/underscored header logic
        dashed_start = find_loose_and_dashed_section_index(lines, r"indicat.{0,30}usag")
        if dashed_start is not None:
            dashed_end = find_next_dashed_section_index(lines, r"dosag.{0,20}admin", dashed_start)
            if dashed_end is not None:
                print("[INFO] Using dashed/underscored header extraction.")
                return "\n".join(lines[dashed_start + 1:dashed_end])
        print("[INFO] No dashed/underscored section header(s) found, falling back to gap-based extraction.")

        # 2. Fallback: gap logic with loose detection
        ind_start, dos_start, _ = find_section_indices(lines, is_2col)
        if not ind_start:
            print("[ERROR] No 'INDICATIONS AND USAGE' header found at all.")
            return ""
        first = ind_start[0]
        if dos_start:
            body, _gap = get_section_lines_with_gap(first, dos_start, lines)
            if body:
                return "\n".join([lines[first]] + body)
        return "\n".join(lines[first:])

    # One-column logic
    ind_start, _, contra_start = find_section_indices(lines, is_2col)
    if not ind_start:
        print("[ERROR] No 'INDICATIONS AND USAGE' header found in one-column section.")
        return ""
    first = ind_start[0]
    if not contra_start:
        return "\n".join(lines[first:])
    later_contra = [i for i in contra_start if i > first]
    if later_contra:
        return "\n".join(lines[first:later_contra[0]])
    return ""

def split_by_main_bullet(text):
    """Split ``text`` into chunks that each start at a top-level bullet.

    A bullet is one of the ``main_bullets`` characters (or a ``.``) at the
    start of the text or of a line, followed by whitespace. When no bullet is
    found the whole text is returned as a single chunk. Otherwise each chunk is
    stripped, and any text before the first bullet is not included.
    """
    bracketed = ''.join(re.escape(b) for b in main_bullets if b != ".")
    pattern = r'((?:^|\n)[{}]\s)'.format(bracketed) + r'|((?:^|\n)\.\s)'
    starts = [m.start() for m in re.finditer(pattern, text)]
    if not starts:
        return [text]
    ends = starts[1:] + [len(text)]
    return [text[a:b].strip() for a, b in zip(starts, ends)]

def is_sub_bullet_line(line):
    """Return True when ``line``, ignoring leading whitespace, starts with "o " or U+25CB."""
    return line.lstrip().startswith(("o ", "\u25cb"))

def count_sub_bullets(section):
    """Count the sub-bullet lines in ``section``; a section with none counts as 1."""
    hits = sum(1 for ln in section.splitlines() if is_sub_bullet_line(ln))
    return hits or 1

def should_exclude_section(section):
    """Return True when ``section`` mentions "Limitation(s) of Use".

    Matching is case-insensitive, accepts both the singular and the plural
    form, and tolerates any whitespace (including a line break) between the
    words. A plain substring test for the singular form misses the plural
    heading "Limitations of Use".
    """
    return re.search(r"limitations?\s+of\s+use", section, re.IGNORECASE) is not None

def count_indicat_and_treatment(section_text):
    """Count lines mentioning "indicated" and lines mentioning "treatm".

    Matching is case-insensitive and per line. In this version the "indicated"
    count is returned as-is (no subtraction).

    Returns
    -------
    tuple of int
        ``(indicated_lines, treatment_lines)``.
    """
    indicated_re = re.compile(r"indicated", re.IGNORECASE)
    treatm_re = re.compile(r"treatm", re.IGNORECASE)
    n_indicated = 0
    n_treatment = 0
    for row in section_text.splitlines():
        if indicated_re.search(row):
            n_indicated += 1
        if treatm_re.search(row):
            n_treatment += 1
    return max(0, n_indicated), n_treatment

def find_max_numbered_bullet(lines):
    """Return the largest leading number among lines that start with "<digits>." or "<digits>)".

    Leading whitespace is ignored. Returns 0 when no line qualifies. Any line
    beginning with digits and a dot qualifies, e.g. "12.5 mg" yields 12.
    """
    leading_number = re.compile(r"^\s*(\d+)[\.\)]")
    found = (leading_number.match(row) for row in lines)
    return max((int(m.group(1)) for m in found if m), default=0)

def count_indications_keyword_and_number(section_text):
    """Estimate the indication count for a one-column section.

    The result is the largest of: the two keyword line counts from
    ``count_indicat_and_treatment`` and the highest numbered bullet found by
    ``find_max_numbered_bullet``.
    """
    keyword_hits = max(count_indicat_and_treatment(section_text))
    highest_number = find_max_numbered_bullet(section_text.splitlines())
    return max(keyword_hits, highest_number)

def test_pdf_indication_count(pdf_url):
    """Download one PDF, extract its indications section and print a count.

    The PDF is saved to a temporary file, its text is extracted, and the
    indications section is located. Whether the two-column or one-column
    strategy is used is decided from the first page only. A short preview of
    the section, its total line count and the resulting indication count are
    printed; nothing is returned.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to download.

    Raises
    ------
    requests.HTTPError
        Propagated from ``response.raise_for_status()`` on a bad status code.
    """
    import os  # function-scope import so this block does not rely on another cell

    response = requests.get(pdf_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
    response.raise_for_status()
    # delete=False keeps the file on disk after the handle is closed so that
    # pdfplumber can reopen it by path; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(response.content)
        tmp_path = tmp_file.name
    try:
        with pdfplumber.open(tmp_path) as pdf:
            first_page = pdf.pages[0]
            is_2col = is_two_column(first_page)
            all_text = extract_all_text(tmp_path)
            lines = all_text.splitlines()
            section = extract_section_loose(lines, is_2col=is_2col)

            if not section.strip():
                print("[INFO] Section extraction returned empty. See error(s) above.")
                return

            if is_2col:
                # Two-column layout: one count per main bullet, or per
                # sub-bullet when a main bullet has sub-bullets; sections
                # flagged by should_exclude_section are skipped.
                main_sections = [
                    s for s in split_by_main_bullet(section)
                    if s.strip() and not should_exclude_section(s)
                ]
                count = sum(count_sub_bullets(sec) for sec in main_sections)
                method = "two-column (hybrid: dashed/underscored headers preferred, gap fallback)"
            else:
                count = count_indications_keyword_and_number(section)
                method = "one-column (max of keyword match or numbered bullet, strict header)"

            section_rows = section.splitlines()
            preview = "\n".join(section_rows[:25])
            print(f"\n--- Section Preview (first 25 lines) ---\n{preview}")
            print(f"\n--- Total lines in section: {len(section_rows)}")
            print(f"\nIndication count: {count} (using {method})")
    finally:
        # Previously the temporary PDF was never deleted and accumulated in
        # the temp directory on every call. Cleanup is best-effort.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

# Example usage: run the single-PDF check against one label URL.
# Executing this cell downloads the PDF over the network and prints the
# section preview and indication count (or an error/info message).
test_pdf_indication_count("https://www.accessdata.fda.gov/drugsatfda_docs/label/2005/018680s060lbl.pdf")


Page 1 is_two_column: False
Page 2 is_two_column: True
Page 3 is_two_column: False
Page 4 is_two_column: False
Page 5 is_two_column: False
Page 6 is_two_column: False
Page 7 is_two_column: True
Page 8 is_two_column: False
Page 9 is_two_column: False
Page 10 is_two_column: True
Page 11 is_two_column: False
Page 12 is_two_column: False
Page 13 is_two_column: True
Page 14 is_two_column: False
Page 15 is_two_column: False
Page 16 is_two_column: False
Page 17 is_two_column: False
Page 18 is_two_column: True
[ERROR] No 'INDICATIONS AND USAGE' header found in one-column section.
[INFO] Section extraction returned empty. See error(s) above.


## Newer version: batch indication-count extraction with retries

In [1]:
import pandas as pd
import pdfplumber
import requests
import tempfile
import re
import time
from tqdm import tqdm

# Characters treated as top-level ("main") bullet markers by split_by_main_bullet.
# "." is listed here, but split_by_main_bullet filters it out of the regex
# character class and matches it through a separate alternative instead.
# NOTE(review): "·" and "\u00b7" look like the same code point — confirm whether
# one of them is redundant. "\uf0b7" is a private-use code point, presumably the
# bullet glyph as it comes out of PDF text extraction — confirm.
main_bullets = ["•", "\uf0b7", "·", "\u00b7", "."]

def is_two_column(page, center_width_fraction=0.02, min_text_length=26):
    """Heuristically decide whether ``page`` is laid out in two columns.

    A full-height vertical strip centred on the page, ``center_width_fraction``
    of the page width wide, is cropped and its text extracted. The page is
    reported as two-column when that text has at most ``min_text_length``
    characters (a missing extraction result counts as empty text).

    Parameters
    ----------
    page
        Object exposing ``width``, ``height`` and ``within_bbox(bbox)``, whose
        result exposes ``extract_text()``.
    center_width_fraction : float
        Width of the probed strip as a fraction of the page width.
    min_text_length : int
        Largest text length still regarded as an empty gutter.

    Returns
    -------
    bool
    """
    page_width = page.width
    page_height = page.height
    half_strip = center_width_fraction / 2
    strip_bbox = (
        page_width * (0.5 - half_strip),
        0,
        page_width * (0.5 + half_strip),
        page_height,
    )
    strip_text = page.within_bbox(strip_bbox).extract_text()
    if not strip_text:
        strip_text = ""
    return len(strip_text) <= min_text_length

def extract_all_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Each page is classified with ``is_two_column``. For a two-column page the
    left half is extracted first, then the right half, each followed by a
    newline. Any other page is extracted whole and followed by a newline.
    A page (or half page) yielding no text contributes an empty string.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if is_two_column(page):
                page_width = page.width
                left_half = page.within_bbox((0, 0, page_width / 2, page.height))
                left_text = left_half.extract_text() or ""
                right_half = page.within_bbox((page_width / 2, 0, page_width, page.height))
                right_text = right_half.extract_text() or ""
                chunks.append(left_text + "\n" + right_text + "\n")
            else:
                whole_text = page.extract_text() or ""
                chunks.append(whole_text + "\n")
    return "".join(chunks)

def find_loose_and_dashed_section_index(lines, pattern):
    """Return the index of the first dashed/underscored header matching ``pattern``.

    A line qualifies when it contains ``--`` or ``__`` and ``pattern`` is
    found (case-insensitively) in the line with all space characters removed.
    Returns ``None`` when no line qualifies.
    """
    header_re = re.compile(pattern, re.IGNORECASE)
    return next(
        (
            pos
            for pos, raw in enumerate(lines)
            if ("--" in raw or "__" in raw) and header_re.search(raw.replace(" ", ""))
        ),
        None,
    )

def find_next_dashed_section_index(lines, pattern, after_idx):
    """Return the index of the first matching dashed header after ``after_idx``.

    Scanning starts at ``after_idx + 1``. A line qualifies when ``pattern`` is
    found (case-insensitively) in the line with all space characters removed
    and the line contains ``--`` or ``__``. Returns ``None`` when no later
    line qualifies.
    """
    header_re = re.compile(pattern, re.IGNORECASE)
    pos = after_idx + 1
    total = len(lines)
    while pos < total:
        raw = lines[pos]
        if header_re.search(raw.replace(" ", "")) and ("--" in raw or "__" in raw):
            return pos
        pos += 1
    return None

def find_section_indices(lines, is_2col):
    """Locate candidate header lines for three label sections.

    Every line is compared after removing its space characters and
    lower-casing it.

    * ``is_2col`` true: the keyword pattern may occur anywhere in the line.
    * ``is_2col`` false: the keyword must sit at the start of the line,
      optionally preceded by an item number followed by ``.`` or ``)`` and by
      dashes or whitespace.

    Returns
    -------
    tuple
        ``(indications_idx, dosage_idx, contraindications_idx)``, each a list
        of line indices in ascending order.
    """
    if is_2col:
        finders = (
            re.compile(r"indicat.{0,30}usag").search,
            re.compile(r"dosag.{0,20}admin").search,
            re.compile(r"contraindicat").search,
        )
    else:
        finders = (
            re.compile(r"^\s*(\d+[\.\)]\s*)?[-—–\s]*indicat.{0,30}usag").match,
            re.compile(r"^\s*(\d+[\.\)]\s*)?[-—–\s]*dosag.{0,20}admin").match,
            re.compile(r"^\s*(\d+[\.\)]\s*)?[-—–\s]*contraindicat").match,
        )
    # Normalise each line once and reuse it for all three searches.
    squashed = [raw.replace(" ", "").lower() for raw in lines]
    ind_hits, dos_hits, contra_hits = (
        [pos for pos, row in enumerate(squashed) if finder(row)]
        for finder in finders
    )
    return ind_hits, dos_hits, contra_hits

def get_section_lines_with_gap(ind_idx, dos_indices, lines, gaps=[5, 10, 15]):
    """Slice the lines between a header and the first far-enough end index.

    For each threshold in ``gaps`` (in order), the first entry of
    ``dos_indices`` lying more than that many lines after ``ind_idx`` is
    taken as the end of the section.

    Returns
    -------
    tuple
        ``(lines[ind_idx + 1:end], gap_used)`` on success, ``([], None)``
        when no entry of ``dos_indices`` satisfies any threshold.

    NOTE(review): with ascending thresholds such as the default, any index
    that satisfies a later threshold also satisfies the first one, so only
    the first threshold can ever be the one that succeeds — confirm whether
    a descending order was intended.
    """
    for min_gap in gaps:
        end_idx = next((d for d in dos_indices if d - ind_idx > min_gap), None)
        if end_idx is not None:
            return lines[ind_idx + 1:end_idx], min_gap
    return [], None

def extract_section_loose(lines, is_2col):
    """Extract the "Indications and Usage" section from a list of text lines.

    Two-column mode (``is_2col`` true):
      1. Prefer dashed/underscored headers: the section is the text strictly
         between the first such indications header and the next such dosage
         header.
      2. Otherwise fall back to loosely detected headers: the section runs
         from the first indications header up to a dosage header chosen by
         ``get_section_lines_with_gap``; if none is suitable it runs to the
         end of the text.

    One-column mode (``is_2col`` false):
      The section runs from the first indications header up to (not
      including) the first contraindications header that follows it, or to
      the end of the text when no such header follows.

    Parameters
    ----------
    lines : list of str
        Text lines of the whole document.
    is_2col : bool
        Selects the header-detection strategy.

    Returns
    -------
    str
        The section joined with newlines, or ``""`` when no indications
        header is found.
    """
    if is_2col:
        ind_pattern = r"indicat.{0,30}usag"
        dos_pattern = r"dosag.{0,20}admin"
        # 1. Dashed/underscored header logic.
        start_idx = find_loose_and_dashed_section_index(lines, ind_pattern)
        if start_idx is not None:
            end_idx = find_next_dashed_section_index(lines, dos_pattern, start_idx)
            if end_idx is not None:
                return "\n".join(lines[start_idx + 1:end_idx])
        # 2. Fallback: gap logic with loose header detection.
        ind_start, dos_start, _ = find_section_indices(lines, is_2col)
        if not ind_start:
            return ""
        first_ind = ind_start[0]
        if dos_start:
            section_lines, _used_gap = get_section_lines_with_gap(first_ind, dos_start, lines)
            if section_lines:
                return "\n".join([lines[first_ind]] + section_lines)
        return "\n".join(lines[first_ind:])

    ind_start, _, contra_start = find_section_indices(lines, is_2col)
    if not ind_start:
        return ""
    first_ind = ind_start[0]
    stop = next((i for i in contra_start if i > first_ind), None)
    if stop is None:
        # Previously, when contraindication headers existed but all of them
        # preceded the indications header, the function fell through and
        # returned "". It now behaves as it does when there is no
        # contraindications header at all: the section runs to the end.
        return "\n".join(lines[first_ind:])
    return "\n".join(lines[first_ind:stop])

def split_by_main_bullet(text):
    """Cut ``text`` into chunks that each start at a main-bullet line.

    A chunk begins wherever a line (or the very start of the text) opens with
    one of the characters from the module-level ``main_bullets`` list, or with
    a literal ``.``, immediately followed by a whitespace character.

    Text that precedes the first bullet is not part of any returned chunk.
    If no bullet is found, the whole text is returned as a single-item list.
    Each chunk is stripped of surrounding whitespace.
    """
    # "." is left out of the character class; it is matched by the second
    # alternative of the pattern instead.
    class_members = ''.join(re.escape(ch) for ch in main_bullets if ch != ".")
    bullet_re = re.compile(
        r'((?:^|\n)[' + class_members + r']\s)' + r'|((?:^|\n)\.\s)'
    )
    starts = [hit.start() for hit in bullet_re.finditer(text)]
    if len(starts) == 0:
        return [text]
    # Every chunk runs up to the next bullet start; the last one runs to the end.
    stops = starts[1:] + [len(text)]
    return [text[begin:finish].strip() for begin, finish in zip(starts, stops)]

def is_sub_bullet_line(line):
    """Tell whether ``line`` opens with a sub-bullet marker.

    Leading whitespace is ignored. The recognised markers are a lowercase
    ``o`` followed by a space, and the U+25CB character.
    """
    return line.lstrip().startswith(("o ", "\u25cb"))

def count_sub_bullets(section):
    """Return the number of sub-bullet lines in ``section``.

    A section without any sub-bullet line is counted as 1, so the result is
    never smaller than 1.
    """
    hits = sum(1 for row in section.splitlines() if is_sub_bullet_line(row))
    return hits or 1

def should_exclude_section(section):
    """Return True when ``section`` mentions "limitation(s) of use".

    The match is case-insensitive, accepts both the singular and the plural
    form ("Limitation of Use" / "Limitations of Use"), and tolerates any run
    of whitespace between the words, so a phrase wrapped across lines by PDF
    text extraction is still recognised.

    Parameters
    ----------
    section : str
        Text of one bullet section.

    Returns
    -------
    bool
    """
    # The previous plain substring test ('limitation of use') missed the
    # plural wording and phrases split by a line break.
    return re.search(r"limitations?\s+of\s+use", section, re.IGNORECASE) is not None

def count_indicat_and_treatment(section_text):
    """Count lines mentioning "indicated" and lines mentioning "treatm".

    Both searches are case-insensitive and a line is counted at most once per
    keyword, however many times the keyword occurs on it.

    Returns
    -------
    tuple
        ``(lines_with_indicated, lines_with_treatm)``
    """
    indicated_re = re.compile(r"indicated", re.IGNORECASE)
    treatm_re = re.compile(r"treatm", re.IGNORECASE)
    n_indicated = 0
    n_treatm = 0
    for row in section_text.splitlines():
        if indicated_re.search(row):
            n_indicated += 1
        if treatm_re.search(row):
            n_treatm += 1
    return n_indicated, n_treatm

def find_max_numbered_bullet(lines):
    """Return the largest leading item number found among ``lines``.

    A line contributes when, after optional leading whitespace, it starts
    with digits immediately followed by ``.`` or ``)``. Returns 0 when no
    line qualifies.
    """
    numbered_re = re.compile(r"^\s*(\d+)[\.\)]")
    hits = (numbered_re.match(row) for row in lines)
    return max((int(hit.group(1)) for hit in hits if hit), default=0)

def count_indications_keyword_and_number(section_text):
    """Estimate the indication count of a section.

    The estimate is the largest of three numbers: the count of lines
    mentioning "indicated", the count of lines mentioning "treatm", and the
    highest leading item number found on any line.
    """
    keyword_counts = count_indicat_and_treatment(section_text)
    highest_number = find_max_numbered_bullet(section_text.splitlines())
    return max(max(keyword_counts), highest_number)

def get_indication_count(pdf_url):
    """Download a PDF and estimate how many indications it lists.

    The PDF is written to a temporary file, its text is extracted and the
    indications section is located. The first page alone decides whether the
    two-column or the one-column counting strategy is applied.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to download.

    Returns
    -------
    tuple
        ``(count, True)`` when a non-empty section was extracted, otherwise
        ``(None, False)``. ``(None, False)`` is returned both when any
        exception occurs (download, parsing, ...) and when the PDF was read
        but no section text was found.

    NOTE(review): because an empty section also yields ``False``, a caller
    that retries on ``False`` will download the same PDF again even though
    the outcome cannot change — confirm whether that is intended.
    """
    import os  # function-scope import: `os` is not imported by this cell

    tmp_path = None
    try:
        response = requests.get(pdf_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
        response.raise_for_status()
        # delete=False keeps the file on disk after the handle is closed so
        # that pdfplumber can reopen it by path; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(response.content)
        with pdfplumber.open(tmp_path) as pdf:
            first_page = pdf.pages[0]
            is_2col = is_two_column(first_page)
            all_text = extract_all_text(tmp_path)
            lines = all_text.splitlines()
            section = extract_section_loose(lines, is_2col=is_2col)
            if not section.strip():
                return None, False
            if is_2col:
                # One count per main bullet, or per sub-bullet when present;
                # sections flagged by should_exclude_section are skipped.
                main_sections = [
                    s for s in split_by_main_bullet(section)
                    if s.strip() and not should_exclude_section(s)
                ]
                count = sum(count_sub_bullets(sec) for sec in main_sections)
            else:
                count = count_indications_keyword_and_number(section)
            return count, True
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller decide.
        print(f"[ERROR] PDF extraction failed: {e}")
        return None, False
    finally:
        # Previously the temporary PDF was never deleted, leaving one file
        # behind per call. Cleanup is best-effort.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

def run_auto_pdf_extraction(
    excel_path,
    out_path="indication_results.xlsx",
    url_col="ApplicationDocsURL",
    max_retries=5
):
    """Run the indication count over every selected row of an Excel sheet.

    Rows are selected where ``ApplicationDocsTypeID == 2``,
    ``ActionTypes_LookupID < 13`` and the URL column is not null. Each
    selected URL is passed to ``get_indication_count``; rows for which it
    reports failure are tried again on the next pass, for at most
    ``max_retries`` passes. The selected rows, extended with the columns
    ``indication_count`` and ``pdf_opened``, are written to ``out_path``.

    Parameters
    ----------
    excel_path : str
        Input workbook; must contain ``ApplicationDocsTypeID``,
        ``ActionTypes_LookupID`` and the ``url_col`` column.
    out_path : str
        Destination workbook (written without the index).
    url_col : str
        Name of the column holding the PDF URLs.
    max_retries : int
        Maximum number of passes over the still-failing rows.

    Returns
    -------
    None
    """
    df = pd.read_excel(excel_path)
    # --- Adjust mask as needed for your data ---
    # The null check now honours `url_col`; it used to be hard-coded to
    # "ApplicationDocsURL" while the fetch below read `url_col`, so a custom
    # column could let null URLs through.
    mask = (
        (df["ApplicationDocsTypeID"] == 2) &
        (df["ActionTypes_LookupID"] < 13) &
        (df[url_col].notnull())
    )
    df_proc = df.loc[mask].copy()
    urls = df_proc[url_col].tolist()
    indication_counts = [None] * len(df_proc)
    open_successes = [False] * len(df_proc)
    # Positional indices (into df_proc) that still need a successful pass.
    failed_indices = list(range(len(df_proc)))

    for attempt in range(max_retries):
        print(f"\n--- Iteration {attempt+1} ---")
        still_failed = []
        for idx in tqdm(failed_indices):
            count, opened = get_indication_count(urls[idx])
            indication_counts[idx] = count
            open_successes[idx] = opened
            if not opened:
                still_failed.append(idx)
                # Brief pause after a failure before the next request.
                time.sleep(1)
        failed_indices = still_failed
        print(f"Iteration {attempt+1}: Failed PDFs = {len(failed_indices)}")
        if not failed_indices:
            break

    df_proc["indication_count"] = indication_counts
    df_proc["pdf_opened"] = open_successes
    df_proc.to_excel(out_path, index=False)
    print(f"Results saved to {out_path}")
    if failed_indices:
        print(f"Still failed indices after {max_retries} iterations: {failed_indices}")

# ---- Usage Example ----
# Runs the batch extraction on the input workbook and writes the selected rows,
# with their indication counts, to the output workbook. Executing this cell
# downloads every selected PDF, so it can take several minutes.
# NOTE(review): both paths are absolute, machine-specific locations — adjust
# them before running elsewhere.
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/merged4_non_missing_top1000.xlsx",     # <-- input file path
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/merged4_non_missing_top1000_improve.xlsx" # <-- output file path
)



--- Iteration 1 ---


  9%|▊         | 18/208 [00:45<05:31,  1.75s/it]

[ERROR] PDF extraction failed: 502 Server Error: Bad Gateway for url: http://www.accessdata.fda.gov/drugsatfda_docs/label/2019/010721s063lbl.pdf


 27%|██▋       | 56/208 [03:30<07:54,  3.12s/it]

[ERROR] PDF extraction failed: HTTPSConnectionPool(host='www.accessdata.fda.gov', port=443): Read timed out. (read timeout=60)


 55%|█████▍    | 114/208 [07:44<05:09,  3.29s/it]

[ERROR] PDF extraction failed: HTTPSConnectionPool(host='www.accessdata.fda.gov', port=443): Max retries exceeded with url: /drugsatfda_docs/label/2013/018936s100s101,021235s021lbl.pdf (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))


 63%|██████▎   | 132/208 [09:42<04:36,  3.64s/it]

[ERROR] PDF extraction failed: HTTPSConnectionPool(host='www.accessdata.fda.gov', port=443): Max retries exceeded with url: /drugsatfda_docs/label/2002/18936s61s65,20101s27,20974s1lbl.pdf (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))


100%|██████████| 208/208 [14:45<00:00,  4.26s/it]


Iteration 1: Failed PDFs = 27

--- Iteration 2 ---


 26%|██▌       | 7/27 [00:21<01:03,  3.19s/it]

[ERROR] PDF extraction failed: HTTPSConnectionPool(host='www.accessdata.fda.gov', port=443): Max retries exceeded with url: /drugsatfda_docs/label/1998/18603s19lbl.pdf (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))


100%|██████████| 27/27 [01:34<00:00,  3.49s/it]


Iteration 2: Failed PDFs = 23

--- Iteration 3 ---


  4%|▍         | 1/23 [00:02<00:45,  2.08s/it]

[ERROR] PDF extraction failed: HTTPSConnectionPool(host='www.accessdata.fda.gov', port=443): Max retries exceeded with url: /drugsatfda_docs/label/2023/017031s041lbl.pdf (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))


100%|██████████| 23/23 [01:09<00:00,  3.02s/it]


Iteration 3: Failed PDFs = 23

--- Iteration 4 ---


100%|██████████| 23/23 [00:55<00:00,  2.43s/it]


Iteration 4: Failed PDFs = 23

--- Iteration 5 ---


 91%|█████████▏| 21/23 [00:49<00:04,  2.02s/it]

[ERROR] PDF extraction failed: HTTPSConnectionPool(host='www.accessdata.fda.gov', port=443): Max retries exceeded with url: /drugsatfda_docs/nda/2000/19304-S005_Tricor_prntlbl.pdf (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))


100%|██████████| 23/23 [01:12<00:00,  3.15s/it]


Iteration 5: Failed PDFs = 23
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/merged4_non_missing_top1000_improve.xlsx
Still failed indices after 5 iterations: [11, 64, 69, 91, 92, 93, 94, 95, 96, 100, 113, 156, 157, 158, 159, 160, 161, 162, 163, 173, 174, 175, 176]


In [2]:
import pandas as pd
# Input workbook and destination of the per-product summary.
input_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_modified.xlsx"
output_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\processed\appl_product_first_years.xlsx"

# Load with the identifiers kept as strings, coerce both date columns
# (unparseable values become NaT), then order each (ApplNo, ProductNo) group
# by SubmissionStatusDate.
df = (
    pd.read_excel(input_path, dtype={"ApplNo": str, "ProductNo": str})
    .assign(
        SubmissionStatusDate=lambda d: pd.to_datetime(d["SubmissionStatusDate"], errors="coerce"),
        ApplicationDocsDate=lambda d: pd.to_datetime(d["ApplicationDocsDate"], errors="coerce"),
    )
    .sort_values(["ApplNo", "ProductNo", "SubmissionStatusDate"])
)

# One row per (ApplNo, ProductNo), plus the year of each date column.
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# separately, so the two dates of a group may come from different source rows
# rather than from the single earliest row — confirm this is the intent.
first_rows = (
    df.groupby(["ApplNo", "ProductNo"], as_index=False)
    .first()
    .assign(
        FirstSubmissionStatusYear=lambda d: d["SubmissionStatusDate"].dt.year,
        FirstApplicationDocsYear=lambda d: d["ApplicationDocsDate"].dt.year,
    )
)

# Keep the identifiers and the two year columns only, then save.
out = first_rows.loc[
    :, ["ApplNo", "ProductNo", "FirstSubmissionStatusYear", "FirstApplicationDocsYear"]
]
out.to_excel(output_path, index=False)

print(out.head(10))


  ApplNo ProductNo  FirstSubmissionStatusYear  FirstApplicationDocsYear
0  10010         1                        NaN                       NaN
1  10010         2                        NaN                       NaN
2  10012         1                     1981.0                       NaN
3  10021        10                     1961.0                    2007.0
4  10021         2                     1961.0                    2007.0
5  10021         4                     1961.0                    2007.0
6  10021         7                     1961.0                    2007.0
7  10028         4                     1955.0                       NaN
8  10028         5                     1955.0                       NaN
9  10040         1                     1955.0                    2002.0
