In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Set your download directory
download_dir = r"F:\PhD\RA\Schafer\IRA\data\nme"

# Chrome options
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "directory_upgrade": True,
    "safebrowsing.enabled": True
})

# Start Chrome
driver = webdriver.Chrome(options=chrome_options)

try:
    # Go to the target FDA compilation page
    url = "https://www.fda.gov/drugs/drug-approvals-and-databases/compilation-cder-new-molecular-entity-nme-drug-and-new-biologic-approvals"
    driver.get(url)
    time.sleep(3)  # Let page load

    # Find and click the link by its partial link text
    # If you want it 100% robust, use the full visible link text below:
    link_text = "Compilation of CDER NME and New Biologic Approvals 1985-2023"
    download_link = driver.find_element(By.PARTIAL_LINK_TEXT, link_text)
    download_link.click()
    time.sleep(15)  # Wait for the download to complete (increase if your network is slow)

finally:
    driver.quit()

print("Download complete!")

Download complete!


In [73]:
import pandas as pd

csv_path = r"F:\PhD\RA\Schafer\IRA\data\nme\compilation_of_cder_nme_and_new_biologic_approvals_1985-2023.csv"
output_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_approved_2020onward.xlsx"

# Read the CSV
df = pd.read_csv(csv_path)

In [74]:
# Convert date
df['FDA Approval Date'] = pd.to_datetime(df['FDA Approval Date'], errors='coerce')
# Filter for 2020+
df_filtered = df[df['FDA Approval Date'].dt.year >= 2000].copy()
# Rename
df_filtered.rename(columns={'Proprietary  Name': 'drugname'}, inplace=True)
# Save
df_filtered[['drugname', 'FDA Approval Date']].to_excel(output_path, index=False)

In [75]:
# Clean up drug names for the URL (lowercase, strip spaces, etc.)
df_filtered['drug_url'] = df_filtered['drugname'].str.strip().str.lower().str.replace(' ', '-')
df_filtered['drug_url'] = "https://www.drugs.com/history/" + df_filtered['drug_url'] + ".html"

df_filtered[['drugname', 'FDA Approval Date', 'drug_url']].to_excel(output_path, index=False)

In [76]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

input_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_approved_2000onward.xlsx"
df = pd.read_excel(input_path)
drugnames = df['drugname'].dropna().str.strip().str.lower().str.replace(' ', '-').unique().tolist()

all_rows = []
skipped_drugs = []

for drugname in drugnames:
    url = f"https://www.drugs.com/history/{drugname}.html"
    print(f"Processing {drugname}...")
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table")
        found = False
        if table:
            for tr in table.find_all("tr"):
                tds = tr.find_all("td")
                if len(tds) >= 2:
                    date = tds[0].get_text(strip=True)
                    details = tds[1].get_text(strip=True)
                    if details.lower().startswith("approval"):
                        all_rows.append({
                            "drugname": drugname,
                            "date": date,
                            "details": details,
                            "url": url
                        })
                        found = True
        # Mark as skipped if no approvals were found
        if not found:
            all_rows.append({
                "drugname": drugname,
                "date": None,
                "details": "SKIPPED_OR_NOT_FOUND",
                "url": url
            })
            skipped_drugs.append(drugname)
    except Exception as e:
        print(f"Error processing {drugname}: {e}")
        all_rows.append({
            "drugname": drugname,
            "date": None,
            "details": f"SKIPPED_OR_ERROR: {e}",
            "url": url
        })
        skipped_drugs.append(drugname)
    time.sleep(0.5)

df_all = pd.DataFrame(all_rows)
output_path = r"F:\PhD\RA\Schafer\IRA\data\nme\all_drugs_approval_entries.xlsx"
df_all.to_excel(output_path, index=False)
print(f"\n✅ Done! Data saved to: {output_path}")

# Save the skipped list for re-processing or manual review
skipped_path = r"F:\PhD\RA\Schafer\IRA\data\nme\skipped_drugs.csv"
pd.Series(skipped_drugs, name="drugname").to_csv(skipped_path, index=False)
print(f"Skipped drug list saved to: {skipped_path}")

Processing evoxac...
Processing trileptal...
Processing protonix...
Processing lotronex...
Processing skin-exposure-reduction-paste-against-chemical-warfare-agents-(serpacwa)...
Processing zonegran...
Processing septocaine...
Processing visudyne...
Processing mobic...
Processing zyvox...
Processing lantus...
Processing exelon...
Processing mylotarg...
Processing welchol...
Processing tnkase...
Processing novolog...
Processing trelstar-depot...
Processing acova...
Processing innohep...
Processing colazal...
Processing abreva...
Processing rescula...
Processing cetrotide...
Processing kaletra...
Processing trisenox...
Processing mifeprex...
Processing myobloc...
Processing angiomax...
Processing starlix...
Processing peg-intron...
Processing cancidas...
Processing geodon...
Processing foradil...
Processing reminyl...
Processing travatan...
Processing lumigan...
Processing campath...
Processing axert...
Processing gleevec...
Processing yasmin...
Error processing yasmin: HTTPSConnectionPoo

In [77]:
import pandas as pd

skipped_path = r"F:\PhD\RA\Schafer\IRA\data\nme\skipped_drugs.csv"
skipped_drugs = pd.read_csv(skipped_path)['drugname'].tolist()
import requests
from bs4 import BeautifulSoup
import time

retry_rows = []
still_skipped = []

for drugname in skipped_drugs:
    url = f"https://www.drugs.com/history/{drugname}.html"
    print(f"Retrying {drugname}...")
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table")
        found = False
        if table:
            for tr in table.find_all("tr"):
                tds = tr.find_all("td")
                if len(tds) >= 2:
                    date = tds[0].get_text(strip=True)
                    details = tds[1].get_text(strip=True)
                    if details.lower().startswith("approval"):
                        retry_rows.append({
                            "drugname": drugname,
                            "date": date,
                            "details": details,
                            "url": url
                        })
                        found = True
        if not found:
            retry_rows.append({
                "drugname": drugname,
                "date": None,
                "details": "STILL_SKIPPED_OR_NOT_FOUND",
                "url": url
            })
            still_skipped.append(drugname)
    except Exception as e:
        print(f"Error retrying {drugname}: {e}")
        retry_rows.append({
            "drugname": drugname,
            "date": None,
            "details": f"STILL_SKIPPED_OR_ERROR: {e}",
            "url": url
        })
        still_skipped.append(drugname)
    time.sleep(0.5)

df_retry = pd.DataFrame(retry_rows)

# Load your first run results
df_all = pd.read_excel(r"F:\PhD\RA\Schafer\IRA\data\nme\all_drugs_approval_entries.xlsx")

# Drop the old skipped rows before appending retry results
df_success = df_all[~df_all['drugname'].isin(skipped_drugs)].copy()

# Only keep retries that were successful (not skipped again)
df_retry_success = df_retry[~df_retry['details'].str.contains("SKIPPED|NOT FOUND|ERROR", case=False, na=False)].copy()

# Combine the original successful and new retry successes
df_final = pd.concat([df_success, df_retry_success], ignore_index=True)

# Save all results
final_output_path = r"F:\PhD\RA\Schafer\IRA\data\nme\all_drugs_approval_entries_combined.xlsx"
df_final.to_excel(final_output_path, index=False)
print(f"\n✅ Combined data saved to: {final_output_path}")

# Save the still-skipped list for review or manual check
still_skipped_path = r"F:\PhD\RA\Schafer\IRA\data\nme\still_skipped_drugs.csv"
pd.Series(still_skipped, name="drugname").to_csv(still_skipped_path, index=False)
print(f"⛔️ Still-skipped drug list saved to: {still_skipped_path}")

Retrying evoxac...
Retrying trileptal...
Retrying protonix...
Retrying lotronex...
Retrying skin-exposure-reduction-paste-against-chemical-warfare-agents-(serpacwa)...
Retrying zonegran...
Retrying septocaine...
Retrying visudyne...
Retrying mobic...
Retrying zyvox...
Retrying lantus...
Retrying exelon...
Retrying welchol...
Retrying novolog...
Retrying trelstar-depot...
Retrying acova...
Retrying innohep...
Retrying colazal...
Retrying abreva...
Retrying rescula...
Retrying cetrotide...
Retrying kaletra...
Retrying trisenox...
Retrying mifeprex...
Retrying angiomax...
Retrying starlix...
Retrying peg-intron...
Retrying cancidas...
Retrying geodon...
Retrying foradil...
Retrying reminyl...
Retrying travatan...
Retrying lumigan...
Retrying campath...
Retrying axert...
Retrying yasmin...
Retrying natrecor...
Retrying zometa...
Retrying spectracef...
Retrying aranesp...
Retrying nuvaring...
Retrying viread...
Retrying frova...
Retrying bextra...
Retrying avodart...
Retrying ortho-evra...


In [81]:
import pandas as pd

# Load FDA base sheet
fda_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_approved_2020onward.xlsx"
df_fda = pd.read_excel(fda_path)

# Load all approvals (combined after retry)
combined_path = r"F:\PhD\RA\Schafer\IRA\data\nme\all_drugs_approval_entries_combined.xlsx"
df_approvals = pd.read_excel(combined_path)

# Clean drug names for merge (lowercase, strip, replace space with hyphen)
df_fda['drugname_clean'] = df_fda['drugname'].str.strip().str.lower().str.replace(' ', '-')
df_approvals['drugname_clean'] = df_approvals['drugname'].str.strip().str.lower()

# Merge, keeping all FDA rows, and add approvals where found
df_merged = pd.merge(
    df_fda,
    df_approvals[['drugname_clean', 'date', 'details', 'url']],
    on='drugname_clean',
    how='left'
)

# (Optional) Drop the helper column for output
df_merged = df_merged.drop(columns=['drugname_clean'])

import pandas as pd
import re

# Assuming df_merged is your DataFrame and the column is called "date"
def clean_date(s):
    if pd.isna(s):
        return s
    # Replace multiple spaces with a single space
    s = re.sub(r'\s+', ' ', s.strip())
    return s

df_merged['date_clean'] = df_merged['date'].apply(clean_date)
# Now convert to datetime (this will coerce invalid formats to NaT)
df_merged['date_parsed'] = pd.to_datetime(df_merged['date_clean'], errors='coerce')

# Preview result
print(df_merged[['date', 'date_clean', 'date_parsed']].head())
df_merged['date'] = df_merged['date'].apply(clean_date)
df_merged['date'] = pd.to_datetime(df_merged['date'], errors='coerce')

# Save to Excel
merged_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_fda_merged_with_approvals.xlsx"
df_merged.to_excel(merged_path, index=False)
print(f"✅ Done! Merged data saved to: {merged_path}")

  date date_clean date_parsed
0  NaN        NaN         NaT
1  NaN        NaN         NaT
2  NaN        NaN         NaT
3  NaN        NaN         NaT
4  NaN        NaN         NaT
✅ Done! Merged data saved to: F:\PhD\RA\Schafer\IRA\data\nme\nme_fda_merged_with_approvals.xlsx


In [1]:
import pandas as pd

# Load FDA base sheet
fda_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_fda_merged_with_approvals.xlsx"
df = pd.read_excel(fda_path)

# Convert both columns to datetime (if not already)
df['FDA Approval Date'] = pd.to_datetime(df['FDA Approval Date'], errors='coerce')
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Keep only the date part (as a string, so Excel shows YYYY-MM-DD)
df['FDA Approval Date'] = df['FDA Approval Date'].dt.date
df['date'] = df['date'].dt.date

# Drop rows with empty or missing details
df = df[df['details'].notna() & (df['details'].str.strip() != '')].copy()

# Drop only rows where details starts with "Approval " (with a space)
mask = df['details'].str.match(r'^Approval\s', na=False)
df = df[~mask].copy()

# Save to Excel
output_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_fda_merged_with_approvals_clean.xlsx"
df.to_excel(output_path, index=False)
print(f"✅ Saved clean file to: {output_path}")

✅ Saved clean file to: F:\PhD\RA\Schafer\IRA\data\nme\nme_fda_merged_with_approvals_clean.xlsx


## Check

In [61]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Path to your filtered Excel with 'drugname'
input_path = r"F:\PhD\RA\Schafer\IRA\data\nme\nme_approved_2020onward.xlsx"  # update if needed

# Read in your DataFrame with the 'drugname' column
df_filtered = pd.read_excel(input_path)

# Clean up drug names for use in URL (lowercase, strip spaces, replace spaces with '-')
df_filtered['drugname_clean'] = df_filtered['drugname'].str.strip().str.lower().str.replace(' ', '-')
drugnames = ['keytruda']

all_rows = []

for drugname in drugnames:
    url = f"https://www.drugs.com/history/{drugname}.html"
    print(f"Processing {drugname}...")
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table")
        found = False
        if table:
            for tr in table.find_all("tr"):
                tds = tr.find_all("td")
                if len(tds) >= 2:
                    date = tds[0].get_text(strip=True)
                    details = tds[1].get_text(strip=True)
                    # Only rows starting with "Approval"
                    if details.lower().startswith("approval"):
                        all_rows.append({
                            "drugname": drugname,
                            "date": date,
                            "details": details,
                            "url": url
                        })
                        found = True
        if not found:
            for entry in soup.find_all("li", class_="ddc-history-event"):
                label = entry.find("span", class_="label")
                if label and "Approval" in label.text:
                    date = entry.find("span", class_="date").text.strip()
                    article = entry.find("span", class_="title").text.strip()
                    if article.lower().startswith("approval"):
                        all_rows.append({
                            "drugname": drugname,
                            "date": date,
                            "details": article,
                            "url": url
                        })
        time.sleep(0.5)
    except Exception as e:
        print(f"Error processing {drugname}: {e}")

# Create DataFrame, merge back to original for pretty names and approval dates
df_all = pd.DataFrame(all_rows)

output_path = r"F:\PhD\RA\Schafer\IRA\data\nme\all_drugs_approval_entries.xlsx"
df_all.to_excel(output_path, index=False)

# Merge with your original DataFrame to get FDA Approval Date and pretty drugname
df_all = pd.merge(
    df_all,
    df_filtered[['drugname_clean', 'drugname', 'FDA Approval Date']],
    on='drugname_clean',
    how='left'
)

print(df_all.head())
print("Total records:", len(df_all))

# Save to Excel
output_path = r"F:\PhD\RA\Schafer\IRA\data\nme\all_drugs_approval_entries.xlsx"
df_all.to_excel(output_path, index=False)
print(f"\n✅ Done! Data saved to: {output_path}")

Processing keytruda...


KeyError: 'drugname_clean'

In [45]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

drugname = "keytruda"
url = f"https://www.drugs.com/history/{drugname}.html"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
rows = []

# --- Grab the first table on the page (should be history)
table = soup.find("table")
if table:
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) >= 2:
            date = tds[0].get_text(strip=True)
            details = tds[1].get_text(strip=True)
            # Filter only approval rows (case-insensitive, matches at start)
            if details.lower().startswith("approval"):
                rows.append({"drugname": drugname, "date": date, "details": details, "url": url})

# --- Fallback to the old structure (ul/li, rarely needed now)
if not rows:
    for entry in soup.find_all("li", class_="ddc-history-event"):
        label = entry.find("span", class_="label")
        if label and "Approval" in label.text:
            date = entry.find("span", class_="date").text.strip()
            article = entry.find("span", class_="title").text.strip()
            rows.append({"drugname": drugname, "date": date, "details": article, "url": url})

df_drug = pd.DataFrame(rows)
print(df_drug)

    drugname          date                                            details  \
0   keytruda  Mar 19, 2025  ApprovalFDA Approves Pembrolizumab for HER2 Po...   
1   keytruda  Sep 18, 2024  ApprovalFDA Approves Merck’s Keytruda (pembrol...   
2   keytruda  Jul  9, 2024  Approval for the First Dose Cohort in Phase 1a...   
3   keytruda  Jun 17, 2024  ApprovalFDA Approves Merck’s Keytruda (pembrol...   
4   keytruda  Jan 12, 2024  ApprovalFDA Approves Merck’s Keytruda (pembrol...   
5   keytruda  Dec 15, 2023  ApprovalPadcev (enfortumab vedotin-ejfv) with ...   
6   keytruda  Nov 16, 2023  ApprovalFDA Approves Merck’s Keytruda (pembrol...   
7   keytruda  Nov  1, 2023  ApprovalFDA Approves Merck’s Keytruda (pembrol...   
8   keytruda  Oct 16, 2023  ApprovalFDA Approves Keytruda (pembrolizumab) ...   
9   keytruda  Apr  3, 2023  ApprovalFDA Approves Merck’s Keytruda (pembrol...   
10  keytruda  Mar 29, 2023  ApprovalFDA Converts to Full Approval Indicati...   
11  keytruda  Jan 27, 2023  

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

drugname = "keytruda"   # Or any other drug
url = f"https://www.drugs.com/history/{drugname}.html"

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
rows = []

# Handle table structure (e.g., ayvakit)
table = soup.find("table")
if table:
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) >= 2:
            date = tds[0].get_text(strip=True)
            details = tds[1].get_text(strip=True)
            if details.lower().startswith("approval"):
                rows.append({
                    "drugname": drugname,
                    "date": date,
                    "details": details,
                    "url": url,
                    "green_badge": None    # Can't tell from table
                })
# Handle list/badge structure (e.g., keytruda)
else:
    for entry in soup.find_all("li", class_="ddc-history-event"):
        label = entry.find("span", class_="label")
        if label and "Approval" in label.text:
            date = entry.find("span", class_="date").text.strip()
            article = entry.find("span", class_="title").text.strip()
            # Check if badge is green
            label_classes = label.get("class", [])
            is_green = "success" in label_classes
            rows.append({
                "drugname": drugname,
                "date": date,
                "details": article,
                "url": url,
                "green_badge": is_green
            })

df_drug = pd.DataFrame(rows)
print(df_drug)