# Some clean

In [5]:
import pandas as pd

# Load the Excel file
df = pd.read_excel(r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_non_missing.xlsx")

# Drop rows where DrugName is missing
df = df.dropna(subset=['DrugName'])

# Remove time from ApplicationDocsDate and keep only the date
df['ApplicationDocsDate'] = pd.to_datetime(df['ApplicationDocsDate']).dt.date

# Create 'year' column
df['year'] = pd.to_datetime(df['ApplicationDocsDate']).dt.year

# Define your unique ID columns (adjust as needed)
id_cols = ['ApplNo', 'ProductNo']

# Get the earliest year for each unique ID
df['appear'] = df.groupby(id_cols)['year'].transform('min')

# Sort for grouping
df = df.sort_values(id_cols + ['ApplicationDocsDate'])

# Create first_orig column robustly
def first_is_orig(series):
    return 1 if series.iloc[0] == 'ORIG' else 0

df['first_orig'] = df.groupby(id_cols)['SubmissionType'].transform(first_is_orig)

# **Drop records where appear < 2000**
df = df[df['appear'] >= 2000]

# Save to new Excel file
df.to_excel(r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\after00.xlsx", index=False)

In [1]:
import pandas as pd
import pdfplumber
import requests
import tempfile
import re
import time
from tqdm import tqdm

main_bullets = ["•", "\uf0b7", "·", "\u00b7", "."]

def is_two_column(page, center_width_fraction=0.02, min_text_length=26):
    width = page.width
    height = page.height
    center_start = width * (0.5 - center_width_fraction / 2)
    center_end = width * (0.5 + center_width_fraction / 2)
    center_text = page.within_bbox((center_start, 0, center_end, height)).extract_text() or ""
    return len(center_text) <= min_text_length

def extract_all_text(pdf_path):
    all_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            two_col = is_two_column(page)
            if two_col:
                width = page.width
                left_col = page.within_bbox((0, 0, width / 2, page.height)).extract_text() or ""
                right_col = page.within_bbox((width / 2, 0, width, page.height)).extract_text() or ""
                all_text += left_col + "\n" + right_col + "\n"
            else:
                page_text = page.extract_text() or ""
                all_text += page_text + "\n"
    return all_text

def find_loose_and_dashed_section_index(lines, pattern):
    pat = re.compile(pattern, re.IGNORECASE)
    for idx, line in enumerate(lines):
        if pat.search(line.replace(" ", "")) and (
            "--" in line or "__" in line
        ):
            return idx
    return None

def find_next_dashed_section_index(lines, pattern, after_idx):
    pat = re.compile(pattern, re.IGNORECASE)
    for i in range(after_idx + 1, len(lines)):
        if pat.search(lines[i].replace(" ", "")) and (
            "--" in lines[i] or "__" in lines[i]
        ):
            return i
    return None

def find_section_indices(lines, is_2col):
    if is_2col:
        ind_start = [
            i for i, line in enumerate(lines)
            if re.search(r"indicat.{0,30}usag", line.replace(" ", "").lower())
        ]
        dos_start = [
            i for i, line in enumerate(lines)
            if re.search(r"dosag.{0,20}admin", line.replace(" ", "").lower())
        ]
        contra_start = [
            i for i, line in enumerate(lines)
            if re.search(r"contraindicat", line.replace(" ", "").lower())
        ]
    else:
        ind_start = [
            i for i, line in enumerate(lines)
            if re.match(r"^\s*(\d+[\.\)]\s*)?[-—–\s]*indicat.{0,30}usag", line.replace(" ", "").lower())
        ]
        dos_start = [
            i for i, line in enumerate(lines)
            if re.match(r"^\s*(\d+[\.\)]\s*)?[-—–\s]*dosag.{0,20}admin", line.replace(" ", "").lower())
        ]
        contra_start = [
            i for i, line in enumerate(lines)
            if re.match(r"^\s*(\d+[\.\)]\s*)?[-—–\s]*contraindicat", line.replace(" ", "").lower())
        ]
    return ind_start, dos_start, contra_start

def get_section_lines_with_gap(ind_idx, dos_indices, lines, gaps=[5, 10, 15]):
    for gap in gaps:
        valid_dos_indices = [i for i in dos_indices if i - ind_idx > gap]
        if valid_dos_indices:
            first_valid = valid_dos_indices[0]
            return lines[ind_idx+1:first_valid], gap
    return [], None

def extract_section_loose(lines, is_2col):
    if is_2col:
        ind_pattern = r"indicat.{0,30}usag"
        dos_pattern = r"dosag.{0,20}admin"
        # 1. Try dashed/underscored header logic
        start_idx = find_loose_and_dashed_section_index(lines, ind_pattern)
        end_idx = None
        if start_idx is not None:
            end_idx = find_next_dashed_section_index(lines, dos_pattern, start_idx)
        if start_idx is not None and end_idx is not None:
            return "\n".join(lines[start_idx + 1 : end_idx])
        # 2. Fallback: original gap logic with loose detection
        ind_start, dos_start, _ = find_section_indices(lines, is_2col)
        if ind_start and dos_start:
            section_lines, used_gap = get_section_lines_with_gap(ind_start[0], dos_start, lines)
            if section_lines:
                return "\n".join([lines[ind_start[0]]] + section_lines)
            else:
                return "\n".join(lines[ind_start[0]:])
        elif ind_start:
            return "\n".join(lines[ind_start[0]:])
        else:
            return ""
    else:
        ind_start, _, contra_start = find_section_indices(lines, is_2col)
        if ind_start and contra_start:
            stop = [i for i in contra_start if i > ind_start[0]]
            if stop:
                stop = stop[0]
                return "\n".join(lines[ind_start[0]:stop])
        elif ind_start:
            return "\n".join(lines[ind_start[0]:])
        else:
            return ""
    return ""

def split_by_main_bullet(text):
    bullet_chars = ''.join(re.escape(b) for b in main_bullets if b != ".")
    pattern = r'((?:^|\n)[{}]\s)'.format(bullet_chars)
    pattern += r'|((?:^|\n)\.\s)'
    splits = [m.start() for m in re.finditer(pattern, text)]
    if not splits:
        return [text]
    sections = []
    for i, idx in enumerate(splits):
        end = splits[i+1] if i+1 < len(splits) else len(text)
        sections.append(text[idx:end].strip())
    return sections

def is_sub_bullet_line(line):
    line_stripped = line.lstrip()
    return line_stripped.startswith("o ") or line_stripped.startswith("\u25cb")

def count_sub_bullets(section):
    lines = section.splitlines()
    count = 0
    for line in lines:
        if is_sub_bullet_line(line):
            count += 1
    return count if count > 0 else 1

def should_exclude_section(section):
    return 'limitation of use' in section.lower()

def count_indicat_and_treatment(section_text):
    lines = section_text.splitlines()
    count_indicat = max(0, sum(1 for line in lines if re.search(r"indicated", line, re.IGNORECASE)))
    count_treatment = sum(1 for line in lines if re.search(r"treatm", line, re.IGNORECASE))
    return count_indicat, count_treatment

def find_max_numbered_bullet(lines):
    numbers = []
    for line in lines:
        match = re.match(r"^\s*(\d+)[\.\)]", line)
        if match:
            numbers.append(int(match.group(1)))
    return max(numbers) if numbers else 0

def count_indications_keyword_and_number(section_text):
    lines = section_text.splitlines()
    count_indicat, count_treatment = count_indicat_and_treatment(section_text)
    max_keyword = max(count_indicat, count_treatment)
    max_number = find_max_numbered_bullet(lines)
    final_count = max(max_keyword, max_number)
    return final_count

def get_indication_count(pdf_url):
    try:
        response = requests.get(pdf_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(response.content)
            tmp_path = tmp_file.name
        with pdfplumber.open(tmp_path) as pdf:
            first_page = pdf.pages[0]
            is_2col = is_two_column(first_page)
            all_text = extract_all_text(tmp_path)
            lines = all_text.splitlines()
            section = extract_section_loose(lines, is_2col=is_2col)
            if not section.strip():
                return None, False
            if is_2col:
                main_sections = [
                    s for s in split_by_main_bullet(section)
                    if s.strip() and not should_exclude_section(s)
                ]
                count = 0
                for sec in main_sections:
                    cnt = count_sub_bullets(sec)
                    count += cnt
            else:
                count = count_indications_keyword_and_number(section)
            return count, True
    except Exception as e:
        print(f"[ERROR] PDF extraction failed: {e}")
        return None, False

def run_auto_pdf_extraction(
    excel_path,
    out_path="indication_results.xlsx",
    url_col="ApplicationDocsURL",
    max_retries=3
):
    df = pd.read_excel(excel_path)
    # --- Apply your mask ---
    mask = (
        (df["ApplicationDocsTypeID"] == 2) &
        (df["ApplicationDocsURL"].notnull()) &
        (
            (df["ActionTypes_LookupID"] < 13) |
            (df["SubmissionType"] == "ORIG")
        )
    )
    df_proc = df.loc[mask].copy()
    indication_counts = [None] * len(df_proc)
    open_successes = [False] * len(df_proc)
    failed_indices = list(range(len(df_proc)))

    for attempt in range(max_retries):
        print(f"\n--- Iteration {attempt+1} ---")
        still_failed = []
        for idx in tqdm(failed_indices):
            row = df_proc.iloc[idx]
            count, opened = get_indication_count(row[url_col])
            indication_counts[idx] = count
            open_successes[idx] = opened
            if not opened:
                still_failed.append(idx)
                time.sleep(1)
        failed_indices = still_failed
        print(f"Iteration {attempt+1}: Failed PDFs = {len(failed_indices)}")
        if not failed_indices:
            break

    df_proc["indication_count"] = indication_counts
    df_proc["pdf_opened"] = open_successes
    df_proc.to_excel(out_path, index=False)
    print(f"Results saved to {out_path}")
    if failed_indices:
        print(f"Still failed indices after {max_retries} iterations: {failed_indices}")


In [6]:
# ---- Usage Example ----
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_1.xlsx",    
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_1_processed.xlsx"
)


--- Iteration 1 ---


 40%|████      | 153/381 [08:31<15:42,  4.13s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/nda/2002/20-839s019_ClopidogrelPlavix.htm


 41%|████      | 156/381 [08:39<11:36,  3.09s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/nda/2002/20-839s019_ClopidogrelPlavix.htm


 48%|████▊     | 181/381 [09:34<08:28,  2.54s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Ca

[ERROR] PDF extraction failed: Bounding box (583.3695, 0, 607.1804999999999, 841.88998) is not fully within parent page bounding box (0, 0.0009800000000268483, 1190.55, 841.8909600000001)


100%|██████████| 381/381 [17:23<00:00,  2.74s/it]


Iteration 1: Failed PDFs = 70

--- Iteration 2 ---


 34%|███▍      | 24/70 [01:15<02:28,  3.22s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/nda/2002/20-839s019_ClopidogrelPlavix.htm


 36%|███▌      | 25/70 [01:17<01:59,  2.66s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/nda/2002/20-839s019_ClopidogrelPlavix.htm


 93%|█████████▎| 65/70 [02:46<00:13,  2.76s/it]

[ERROR] PDF extraction failed: Bounding box (583.3695, 0, 607.1804999999999, 841.88998) is not fully within parent page bounding box (0, 0.0009800000000268483, 1190.55, 841.8909600000001)


100%|██████████| 70/70 [03:07<00:00,  2.68s/it]


Iteration 2: Failed PDFs = 70

--- Iteration 3 ---


 34%|███▍      | 24/70 [01:15<02:28,  3.22s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/nda/2002/20-839s019_ClopidogrelPlavix.htm


 36%|███▌      | 25/70 [01:17<01:58,  2.64s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/nda/2002/20-839s019_ClopidogrelPlavix.htm


 93%|█████████▎| 65/70 [02:46<00:13,  2.79s/it]

[ERROR] PDF extraction failed: Bounding box (583.3695, 0, 607.1804999999999, 841.88998) is not fully within parent page bounding box (0, 0.0009800000000268483, 1190.55, 841.8909600000001)


100%|██████████| 70/70 [03:07<00:00,  2.67s/it]


Iteration 3: Failed PDFs = 70
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_1_processed.xlsx
Still failed indices after 3 iterations: [13, 16, 19, 22, 50, 52, 56, 59, 60, 65, 66, 71, 72, 77, 78, 83, 84, 89, 90, 97, 100, 101, 102, 114, 153, 156, 164, 165, 166, 167, 168, 169, 176, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 218, 220, 263, 264, 265, 267, 268, 269, 271, 272, 273, 275, 276, 277, 279, 280, 281, 300, 345, 350, 352, 357, 365, 373, 375, 377, 379]


In [7]:
# ---- Usage Example ----
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_2.xlsx",    
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_2_processed.xlsx"
)


--- Iteration 1 ---


 10%|▉         | 52/523 [01:45<16:14,  2.07s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
 10%|█         | 53/523 [01:48<16:32,  2.11s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 10%|█         | 54/523 [01:52<20:36,  2.64s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 11%|█         | 55/523 [01:56<23:58,  3.07s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 11%|█         | 56/523 [02:00<27:49,  3.57s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 11%|█         | 57/523 [02:06<33:03,  4.26s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an in

Iteration 1: Failed PDFs = 63

--- Iteration 2 ---


100%|██████████| 63/63 [03:00<00:00,  2.87s/it]


Iteration 2: Failed PDFs = 63

--- Iteration 3 ---


100%|██████████| 63/63 [02:59<00:00,  2.85s/it]


Iteration 3: Failed PDFs = 63
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_2_processed.xlsx
Still failed indices after 3 iterations: [1, 2, 7, 8, 13, 14, 19, 20, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 68, 75, 82, 99, 115, 131, 147, 164, 168, 172, 176, 241, 249, 257, 265, 266, 267, 268, 269, 270, 394, 403, 418, 419, 420, 422, 423, 424, 448, 449, 451, 456, 461, 466, 471, 475, 522]


In [8]:
# ---- Usage Example ----
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_3.xlsx",    
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_3_processed.xlsx"
)


--- Iteration 1 ---


 11%|█         | 64/600 [03:42<42:37,  4.77s/it]

[ERROR] PDF extraction failed: Bounding box (599.76049, 0, 624.24051, 792.0) is not fully within parent page bounding box (-184.991, 98.5, 1039.01, 890.5)


 11%|█         | 66/600 [03:48<34:59,  3.93s/it]

[ERROR] PDF extraction failed: Bounding box (599.76049, 0, 624.24051, 792.0) is not fully within parent page bounding box (-184.991, 98.5, 1039.01, 890.5)


 50%|█████     | 302/600 [18:09<13:10,  2.65s/it]  Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value


Iteration 1: Failed PDFs = 38

--- Iteration 2 ---


 68%|██████▊   | 26/38 [01:28<00:33,  2.78s/it]

[ERROR] PDF extraction failed: Bounding box (599.76049, 0, 624.24051, 792.0) is not fully within parent page bounding box (-184.991, 98.5, 1039.01, 890.5)


 71%|███████   | 27/38 [01:31<00:30,  2.80s/it]

[ERROR] PDF extraction failed: Bounding box (599.76049, 0, 624.24051, 792.0) is not fully within parent page bounding box (-184.991, 98.5, 1039.01, 890.5)


100%|██████████| 38/38 [02:19<00:00,  3.68s/it]


Iteration 2: Failed PDFs = 38

--- Iteration 3 ---


 68%|██████▊   | 26/38 [01:29<00:33,  2.78s/it]

[ERROR] PDF extraction failed: Bounding box (599.76049, 0, 624.24051, 792.0) is not fully within parent page bounding box (-184.991, 98.5, 1039.01, 890.5)


 71%|███████   | 27/38 [01:32<00:32,  2.98s/it]

[ERROR] PDF extraction failed: Bounding box (599.76049, 0, 624.24051, 792.0) is not fully within parent page bounding box (-184.991, 98.5, 1039.01, 890.5)


100%|██████████| 38/38 [02:29<00:00,  3.94s/it]


Iteration 3: Failed PDFs = 38
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_3_processed.xlsx
Still failed indices after 3 iterations: [4, 9, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 51, 52, 53, 59, 64, 66, 74, 75, 151, 153, 162, 171, 200, 249, 332, 337]


In [9]:
# ---- Usage Example ----
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_4.xlsx",    
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_4_processed.xlsx"
)


--- Iteration 1 ---


 60%|█████▉    | 307/513 [16:29<12:11,  3.55s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 60%|██████    | 308/513 [16:32<12:26,  3.64s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 60%|██████    | 309/513 [16:37<13:13,  3.89s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
 96%|█████████▌| 493/513 [27:18<00:37,  1.87s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
 96%|█████████▋| 494/513 [27:21<00:40,  2.15s/it]Cannot set gray non-strok

Iteration 1: Failed PDFs = 11

--- Iteration 2 ---


100%|██████████| 11/11 [00:28<00:00,  2.59s/it]


Iteration 2: Failed PDFs = 11

--- Iteration 3 ---


100%|██████████| 11/11 [00:27<00:00,  2.50s/it]


Iteration 3: Failed PDFs = 11
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_4_processed.xlsx
Still failed indices after 3 iterations: [9, 10, 48, 49, 174, 338, 344, 350, 356, 362, 460]


In [10]:
# ---- Usage Example ----
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_5.xlsx",    
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_5_processed.xlsx"
)


--- Iteration 1 ---


  2%|▏         | 18/748 [01:08<49:05,  4.03s/it] Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
  3%|▎         | 19/748 [01:11<46:51,  3.86s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke 

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 21%|██        | 157/748 [08:35<52:23,  5.32s/it]  

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 24%|██▍       | 178/748 [10:21<50:02,  5.27s/it]  

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 27%|██▋       | 199/748 [12:07<48:12,  5.27s/it]  

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 29%|██▉       | 220/748 [13:52<46:11,  5.25s/it]  

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 31%|███       | 229/748 [14:43<42:23,  4.90s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 32%|███▏      | 236/748 [15:11<35:06,  4.11s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 32%|███▏      | 243/748 [15:35<30:50,  3.67s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 33%|███▎      | 250/748 [15:59<30:23,  3.66s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 34%|███▍      | 257/748 [16:23<30:08,  3.68s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 35%|███▌      | 264/748 [16:47<29:11,  3.62s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 36%|███▌      | 271/748 [17:11<28:49,  3.63s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 37%|███▋      | 278/748 [17:35<28:54,  3.69s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 38%|███▊      | 285/748 [18:00<28:07,  3.65s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 39%|███▉      | 292/748 [18:24<27:54,  3.67s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 40%|███▉      | 299/748 [18:48<28:01,  3.75s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 41%|████      | 305/748 [19:10<31:25,  4.26s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 42%|████▏     | 312/748 [19:27<22:03,  3.04s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 43%|████▎     | 319/748 [19:44<20:56,  2.93s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 44%|████▎     | 326/748 [20:00<19:54,  2.83s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 45%|████▍     | 333/748 [20:17<19:40,  2.85s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 45%|████▌     | 340/748 [20:33<19:41,  2.90s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 46%|████▋     | 347/748 [20:50<18:54,  2.83s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 47%|████▋     | 354/748 [21:06<18:36,  2.83s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 48%|████▊     | 361/748 [21:22<18:03,  2.80s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 49%|████▉     | 368/748 [21:38<17:32,  2.77s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 50%|█████     | 375/748 [21:54<17:21,  2.79s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 50%|█████     | 376/748 [21:55<14:35,  2.35s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 51%|█████▏    | 385/748 [22:37<32:21,  5.35s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 52%|█████▏    | 386/748 [22:38<24:37,  4.08s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 57%|█████▋    | 429/748 [25:35<21:10,  3.98s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 61%|██████    | 453/748 [27:43<40:20,  8.21s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 86%|████████▋ | 646/748 [43:46<09:09,  5.39s/it]  

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/125554s061lbl.pdf


100%|██████████| 748/748 [54:26<00:00,  4.37s/it]


Iteration 1: Failed PDFs = 96

--- Iteration 2 ---


 15%|█▍        | 14/96 [00:35<03:58,  2.91s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 19%|█▉        | 18/96 [00:46<04:04,  3.14s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 23%|██▎       | 22/96 [00:57<03:52,  3.14s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 27%|██▋       | 26/96 [01:08<03:39,  3.13s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 31%|███▏      | 30/96 [01:19<03:28,  3.16s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 33%|███▎      | 32/96 [01:22<02:36,  2.45s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 35%|███▌      | 34/96 [01:25<02:09,  2.08s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 38%|███▊      | 36/96 [01:29<01:53,  1.90s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 40%|███▉      | 38/96 [01:32<01:45,  1.82s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 42%|████▏     | 40/96 [01:35<01:38,  1.76s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 44%|████▍     | 42/96 [01:38<01:34,  1.75s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 46%|████▌     | 44/96 [01:42<01:30,  1.75s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 48%|████▊     | 46/96 [01:45<01:25,  1.71s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 50%|█████     | 48/96 [01:48<01:22,  1.72s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 52%|█████▏    | 50/96 [01:52<01:20,  1.74s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 54%|█████▍    | 52/96 [01:55<01:15,  1.72s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 55%|█████▌    | 53/96 [01:56<01:06,  1.54s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 57%|█████▋    | 55/96 [01:58<00:56,  1.37s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 59%|█████▉    | 57/96 [02:01<00:49,  1.28s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 61%|██████▏   | 59/96 [02:03<00:47,  1.27s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 64%|██████▎   | 61/96 [02:06<00:43,  1.25s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 66%|██████▌   | 63/96 [02:08<00:41,  1.26s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 68%|██████▊   | 65/96 [02:11<00:38,  1.24s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 70%|██████▉   | 67/96 [02:13<00:36,  1.27s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 72%|███████▏  | 69/96 [02:16<00:33,  1.23s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 74%|███████▍  | 71/96 [02:18<00:30,  1.23s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 76%|███████▌  | 73/96 [02:20<00:28,  1.22s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 77%|███████▋  | 74/96 [02:21<00:26,  1.19s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 80%|████████  | 77/96 [02:26<00:25,  1.33s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 81%|████████▏ | 78/96 [02:27<00:22,  1.26s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 86%|████████▋ | 83/96 [02:34<00:18,  1.46s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 91%|█████████ | 87/96 [02:47<00:30,  3.34s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 98%|█████████▊| 94/96 [03:07<00:04,  2.46s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/125554s061lbl.pdf


100%|██████████| 96/96 [03:11<00:00,  2.00s/it]


Iteration 2: Failed PDFs = 96

--- Iteration 3 ---


 15%|█▍        | 14/96 [00:34<03:57,  2.90s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 19%|█▉        | 18/96 [00:45<04:00,  3.09s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 23%|██▎       | 22/96 [00:56<03:53,  3.15s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 27%|██▋       | 26/96 [01:08<03:47,  3.25s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 31%|███▏      | 30/96 [01:18<03:28,  3.16s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/103000s5308lbl.pdf


 33%|███▎      | 32/96 [01:22<02:37,  2.46s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 35%|███▌      | 34/96 [01:25<02:08,  2.07s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 38%|███▊      | 36/96 [01:28<01:53,  1.89s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 40%|███▉      | 38/96 [01:31<01:44,  1.79s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 42%|████▏     | 40/96 [01:35<01:37,  1.75s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 44%|████▍     | 42/96 [01:38<01:32,  1.71s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 46%|████▌     | 44/96 [01:41<01:29,  1.72s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 48%|████▊     | 46/96 [01:44<01:26,  1.73s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 50%|█████     | 48/96 [01:48<01:22,  1.72s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 52%|█████▏    | 50/96 [01:51<01:18,  1.72s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 54%|█████▍    | 52/96 [01:54<01:15,  1.70s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 55%|█████▌    | 53/96 [01:55<01:05,  1.53s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 57%|█████▋    | 55/96 [01:58<00:56,  1.38s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 59%|█████▉    | 57/96 [02:00<00:51,  1.32s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 61%|██████▏   | 59/96 [02:03<00:47,  1.27s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 64%|██████▎   | 61/96 [02:05<00:42,  1.21s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 66%|██████▌   | 63/96 [02:07<00:39,  1.20s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 68%|██████▊   | 65/96 [02:10<00:36,  1.18s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 70%|██████▉   | 67/96 [02:12<00:34,  1.19s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 72%|███████▏  | 69/96 [02:14<00:31,  1.18s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 74%|███████▍  | 71/96 [02:17<00:30,  1.21s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 76%|███████▌  | 73/96 [02:19<00:27,  1.21s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 77%|███████▋  | 74/96 [02:20<00:26,  1.20s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 80%|████████  | 77/96 [02:24<00:24,  1.30s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 81%|████████▏ | 78/96 [02:25<00:22,  1.27s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 86%|████████▋ | 83/96 [02:33<00:18,  1.42s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 91%|█████████ | 87/96 [02:46<00:30,  3.34s/it]

[ERROR] PDF extraction failed: No /Root object! - Is this really a PDF?


 98%|█████████▊| 94/96 [03:06<00:05,  2.52s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2018/125554s061lbl.pdf


100%|██████████| 96/96 [03:10<00:00,  1.98s/it]


Iteration 3: Failed PDFs = 96
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_5_processed.xlsx
Still failed indices after 3 iterations: [3, 75, 78, 83, 88, 93, 98, 103, 108, 113, 122, 123, 124, 127, 136, 144, 145, 148, 157, 165, 166, 169, 178, 186, 187, 190, 199, 207, 208, 211, 220, 228, 229, 235, 236, 242, 243, 249, 250, 256, 257, 263, 264, 270, 271, 277, 278, 284, 285, 291, 292, 298, 299, 305, 308, 312, 315, 319, 322, 326, 329, 333, 336, 340, 343, 347, 350, 354, 357, 361, 364, 368, 371, 375, 376, 377, 378, 385, 386, 387, 388, 427, 428, 429, 431, 432, 442, 453, 455, 456, 466, 615, 621, 622, 646, 746]


In [11]:
# ---- Usage Example ----
run_auto_pdf_extraction(
    excel_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_6.xlsx",    
    out_path="F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_6_processed.xlsx"
)


--- Iteration 1 ---


  6%|▌         | 42/745 [02:33<52:59,  4.52s/it] 

[ERROR] PDF extraction failed: Bounding box (291.68475, 0, 303.59024999999997, 841.8893999999999) is not fully within parent page bounding box (0.0, 14.173199999999952, 595.275, 856.0625999999999)


  6%|▌         | 43/745 [02:39<58:43,  5.02s/it]

[ERROR] PDF extraction failed: Bounding box (291.68475, 0, 303.59024999999997, 841.8893999999999) is not fully within parent page bounding box (0.0, 14.173199999999952, 595.275, 856.0625999999999)


 14%|█▍        | 105/745 [06:14<25:30,  2.39s/it] Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
 16%|█▌        | 118/745 [06:50<28:42,  2.75s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-strok

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 56%|█████▋    | 420/745 [27:02<14:28,  2.67s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 57%|█████▋    | 424/745 [27:11<13:30,  2.53s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 58%|█████▊    | 430/745 [27:21<10:54,  2.08s/it]

[ERROR] PDF extraction failed: Bounding box (291.68523999999996, 0, 303.59076, 841.8893999999999) is not fully within parent page bounding box (0, 14.173199999999952, 595.276, 856.0625999999999)


 89%|████████▊ | 660/745 [38:54<02:55,  2.07s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
 89%|████████▉ | 662/745 [39:01<03:48,  2.75s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
 89%|████████▉ | 664/745 [39:07<04:07,  3.05s/it]Cannot set gray stroke co

[ERROR] PDF extraction failed: Bounding box (388.07755, 0, 403.91745000000003, 611.9979999999999) is not fully within parent page bounding box (24.945, 8.361999999999966, 816.94, 620.3599999999999)


 97%|█████████▋| 724/745 [44:17<01:54,  5.47s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2017/761054Orig1s000TOC.cfm


 98%|█████████▊| 728/745 [44:32<01:06,  3.94s/it]

[ERROR] PDF extraction failed: Bounding box (291.68523999999996, 0, 303.59076, 913.89) is not fully within parent page bounding box (0.0, 36.0, 595.276, 949.89)


100%|██████████| 745/745 [46:11<00:00,  3.72s/it]


Iteration 1: Failed PDFs = 34

--- Iteration 2 ---


  0%|          | 0/34 [00:00<?, ?it/s]

[ERROR] PDF extraction failed: Bounding box (291.68475, 0, 303.59024999999997, 841.8893999999999) is not fully within parent page bounding box (0.0, 14.173199999999952, 595.275, 856.0625999999999)


  3%|▎         | 1/34 [00:05<03:15,  5.93s/it]

[ERROR] PDF extraction failed: Bounding box (291.68475, 0, 303.59024999999997, 841.8893999999999) is not fully within parent page bounding box (0.0, 14.173199999999952, 595.275, 856.0625999999999)


  6%|▌         | 2/34 [00:12<03:13,  6.04s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
 47%|████▋     | 16/34 [01:16<01:03,  3.54s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 50%|█████     | 17/34 [01:17<00:48,  2.85s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 53%|█████▎    | 18/34 [01:18<00:37,  2.32s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 56%|█████▌    | 19/34 [01:19<00:29,  1.97s/it]

[ERROR] PDF extraction failed: Bounding box (291.68523999999996, 0, 303.59076, 841.8893999999999) is not fully within parent page bounding box (0, 14.173199999999952, 595.276, 856.0625999999999)


 91%|█████████ | 31/34 [02:08<00:13,  4.60s/it]

[ERROR] PDF extraction failed: Bounding box (388.07755, 0, 403.91745000000003, 611.9979999999999) is not fully within parent page bounding box (24.945, 8.361999999999966, 816.94, 620.3599999999999)


 94%|█████████▍| 32/34 [02:13<00:09,  4.73s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2017/761054Orig1s000TOC.cfm


 97%|█████████▋| 33/34 [02:15<00:03,  3.70s/it]

[ERROR] PDF extraction failed: Bounding box (291.68523999999996, 0, 303.59076, 913.89) is not fully within parent page bounding box (0.0, 36.0, 595.276, 949.89)


100%|██████████| 34/34 [02:18<00:00,  4.07s/it]


Iteration 2: Failed PDFs = 34

--- Iteration 3 ---


  0%|          | 0/34 [00:00<?, ?it/s]

[ERROR] PDF extraction failed: Bounding box (291.68475, 0, 303.59024999999997, 841.8893999999999) is not fully within parent page bounding box (0.0, 14.173199999999952, 595.275, 856.0625999999999)


  3%|▎         | 1/34 [00:05<03:17,  5.98s/it]

[ERROR] PDF extraction failed: Bounding box (291.68475, 0, 303.59024999999997, 841.8893999999999) is not fully within parent page bounding box (0.0, 14.173199999999952, 595.275, 856.0625999999999)


  6%|▌         | 2/34 [00:12<03:12,  6.01s/it]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
 47%|████▋     | 16/34 [01:15<01:03,  3.55s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 50%|█████     | 17/34 [01:17<00:48,  2.86s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 53%|█████▎    | 18/34 [01:18<00:37,  2.35s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2024/218591Orig1s000;207620Orig1s025ltr.pdf


 56%|█████▌    | 19/34 [01:19<00:30,  2.00s/it]

[ERROR] PDF extraction failed: Bounding box (291.68523999999996, 0, 303.59076, 841.8893999999999) is not fully within parent page bounding box (0, 14.173199999999952, 595.276, 856.0625999999999)


 91%|█████████ | 31/34 [02:07<00:13,  4.56s/it]

[ERROR] PDF extraction failed: Bounding box (388.07755, 0, 403.91745000000003, 611.9979999999999) is not fully within parent page bounding box (24.945, 8.361999999999966, 816.94, 620.3599999999999)


 94%|█████████▍| 32/34 [02:12<00:09,  4.70s/it]

[ERROR] PDF extraction failed: 404 Client Error: Not Found for url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2017/761054Orig1s000TOC.cfm


 97%|█████████▋| 33/34 [02:14<00:03,  3.70s/it]

[ERROR] PDF extraction failed: Bounding box (291.68523999999996, 0, 303.59076, 913.89) is not fully within parent page bounding box (0.0, 36.0, 595.276, 949.89)


100%|██████████| 34/34 [02:17<00:00,  4.05s/it]


Iteration 3: Failed PDFs = 34
Results saved to F:/PhD/RA/Schafer/IRA/data/unzipped/merge/processed/exclude_part_6_processed.xlsx
Still failed indices after 3 iterations: [42, 43, 105, 121, 125, 129, 133, 137, 141, 167, 172, 177, 272, 277, 365, 378, 416, 420, 424, 430, 511, 527, 529, 539, 542, 563, 564, 567, 568, 571, 572, 714, 724, 728]


In [5]:
import pandas as pd
import math

# Load the full Excel file
input_path = "F:/PhD/RA/Schafer/IRA/data/unzipped/merge/excluded_after00_updated.xlsx"
df = pd.read_excel(input_path)

n_splits = 6
rows_per_split = math.ceil(len(df) / n_splits)

for i in range(n_splits):
    start = i * rows_per_split
    end = start + rows_per_split
    df_part = df.iloc[start:end]
    out_path = f"F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_{i+1}.xlsx"
    df_part.to_excel(out_path, index=False)
    print(f"Saved file {i+1}: {out_path} ({len(df_part)} rows)")

Saved file 1: F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_1.xlsx (5875 rows)
Saved file 2: F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_2.xlsx (5875 rows)
Saved file 3: F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_3.xlsx (5875 rows)
Saved file 4: F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_4.xlsx (5875 rows)
Saved file 5: F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_5.xlsx (5875 rows)
Saved file 6: F:/PhD/RA/Schafer/IRA/data/unzipped/merge/main/exclude_part_6.xlsx (5871 rows)


# FIX SOME

In [3]:
import pandas as pd

# Load both files
df_merged = pd.read_excel(r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_non_missing.xlsx")
df_after00 = pd.read_excel(r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\after00.xlsx")

# Define ID columns
id_cols = ['ApplNo', 'ProductNo']

# Get records that are in merged but NOT in after00
df_excluded = df_merged.merge(df_after00[id_cols], on=id_cols, how='left', indicator=True)
df_excluded = df_excluded[df_excluded['_merge'] == 'left_only'].drop(columns=['_merge'])

# Drop rows with missing DrugName
df_excluded = df_excluded.dropna(subset=['DrugName'])

# Convert date columns
df_excluded['ApplicationDocsDate'] = pd.to_datetime(df_excluded['ApplicationDocsDate'], errors='coerce')
df_excluded['SubmissionStatusDate'] = pd.to_datetime(df_excluded['SubmissionStatusDate'], errors='coerce')

# Take the maximum of both dates
df_excluded['max_date'] = df_excluded[['ApplicationDocsDate', 'SubmissionStatusDate']].max(axis=1)
df_excluded['year'] = df_excluded['max_date'].dt.year

# Calculate first year this ID appeared
df_excluded['appear'] = df_excluded.groupby(id_cols)['year'].transform('min')

# Exclude appear < 2000
df_excluded = df_excluded[df_excluded['appear'] >= 2000]

# Sort by ID and SubmissionStatusDate
df_excluded = df_excluded.sort_values(id_cols + ['SubmissionStatusDate'])

# Create first_orig column
def first_is_orig(series):
    return 1 if series.iloc[0] == 'ORIG' else 0

df_excluded['first_orig'] = df_excluded.groupby(id_cols)['SubmissionType'].transform(first_is_orig)

# Save to Excel
df_excluded.to_excel(r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\excluded_after00_updated.xlsx", index=False)
