In [4]:
import time
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import glob
import zipfile

In [None]:
# Folder Chrome is told to save the Drugs@FDA ZIP into
download_dir = r"F:\PhD\RA\Schafer\IRA\data"

# Upper bound (seconds) on how long to wait for the ZIP to finish downloading
download_timeout = 300

# Make sure the target folder exists before Chrome is pointed at it
os.makedirs(download_dir, exist_ok=True)

# Snapshot what is already in the folder so that only files created by THIS
# run count as "the download" (stale partial downloads are ignored as well).
zips_before = set(glob.glob(os.path.join(download_dir, "*.zip")))
partials_before = set(glob.glob(os.path.join(download_dir, "*.crdownload")))

# Configure Chrome options
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,         # Change default download location
    "download.prompt_for_download": False,              # Disable download prompt
    "directory_upgrade": True,
    "safebrowsing.enabled": True
})

# Optional: Run in headless mode
# chrome_options.add_argument("--headless")

# Start Chrome
driver = webdriver.Chrome(options=chrome_options)

try:
    # Go to the page
    url = "https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files#download"
    driver.get(url)
    time.sleep(3)  # Give the page a moment to finish rendering

    # Find and click the download link
    download_link = driver.find_element(By.PARTIAL_LINK_TEXT, "Drugs@FDA Download File")
    download_link.click()

    # Poll the folder instead of sleeping a fixed time: the download is done
    # once a new .zip exists and Chrome has no new partial (.crdownload) file.
    deadline = time.monotonic() + download_timeout
    while True:
        new_zips = set(glob.glob(os.path.join(download_dir, "*.zip"))) - zips_before
        new_partials = set(glob.glob(os.path.join(download_dir, "*.crdownload"))) - partials_before
        if new_zips and not new_partials:
            break
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"No completed .zip appeared in {download_dir} within {download_timeout} seconds"
            )
        time.sleep(1)

finally:
    # Always close the browser, even when the download failed
    driver.quit()

print("Download complete!")

Download complete!


In [5]:
# Folder where the file is downloaded
download_dir = r"F:\PhD\RA\Schafer\IRA\data"

# Collect every zip in the folder and pick the most recently modified one
zip_files = glob.glob(os.path.join(download_dir, "*.zip"))
if not zip_files:
    # Without this check max() fails with "max() arg is an empty sequence",
    # which says nothing about the missing download.
    raise FileNotFoundError(
        f"No .zip files found in {download_dir}; run the download cell first."
    )
latest_zip = max(zip_files, key=os.path.getmtime)
print("Using ZIP file:", latest_zip)

# Extract next to the download, in an "unzipped" sub-folder
extract_dir = os.path.join(download_dir, "unzipped")
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(latest_zip, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Unzipped all files to:", extract_dir)

Using ZIP file: F:\PhD\RA\Schafer\IRA\data\daf05212025.zip
Unzipped all files to: F:\PhD\RA\Schafer\IRA\data\unzipped


In [13]:
# Tab-delimited text files produced by the unzip step. Defined here so the
# cell does not depend on a variable left over from an earlier kernel session.
txt_files = sorted(glob.glob(os.path.join(extract_dir, "*.txt")))

for txt_file in txt_files:
    # Read everything as text with no header row; try UTF-8 first and fall
    # back to latin1. Malformed lines are still dropped, but 'warn' reports
    # each one instead of discarding it silently.
    df = None
    try:
        for encoding in ("utf-8", "latin1"):
            try:
                df = pd.read_csv(txt_file, sep='\t', header=None, dtype=str,
                                 encoding=encoding, on_bad_lines='warn')
                break
            except UnicodeDecodeError:
                continue
    except Exception as e:
        print(f"Error processing {txt_file}: {e}")
        continue

    # Nothing to promote to a header if no rows were read
    if df is None or df.empty:
        print(f"Error processing {txt_file}: no rows could be read")
        continue

    # Set first row as header
    new_header = df.iloc[0]
    df = df[1:]
    df.columns = new_header

    # Save to Excel, next to the source file, with the same base name
    excel_name = os.path.splitext(os.path.basename(txt_file))[0] + ".xlsx"
    excel_path = os.path.join(extract_dir, excel_name)
    df.to_excel(excel_path, index=False)
    print(f"Converted {txt_file} -> {excel_path}")

Converted F:\PhD\RA\Schafer\IRA\data\unzipped\ActionTypes_Lookup.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\ActionTypes_Lookup.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzipped\ApplicationDocs.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\ApplicationDocs.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzipped\Applications.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\Applications.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzipped\ApplicationsDocsType_Lookup.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\ApplicationsDocsType_Lookup.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzipped\Join_Submission_ActionTypes_Lookup.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\Join_Submission_ActionTypes_Lookup.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzipped\MarketingStatus.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\MarketingStatus.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzipped\MarketingStatus_Lookup.txt -> F:\PhD\RA\Schafer\IRA\data\unzipped\MarketingStatus_Lookup.xlsx
Converted F:\PhD\RA\Schafer\IRA\data\unzip

In [12]:
# Paths of the Excel workbooks used below. Note that appl_path points at
# ApplicationDocs.xlsx while applty_path points at Applications.xlsx.
prod_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\Products.xlsx"
subm_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\Submissions.xlsx"
appl_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\ApplicationDocs.xlsx"
act_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\Join_Submission_ActionTypes_Lookup.xlsx"
applty_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\Applications.xlsx"

# Destination path; not referenced by any of the cells shown here
out_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\integrated_file.xlsx"

# Load each workbook into its own DataFrame, in the same order as before
df_prod, df_subm, df_appl, df_act, df_applty = (
    pd.read_excel(workbook)
    for workbook in (prod_path, subm_path, appl_path, act_path, applty_path)
)

In [16]:
keys = ['ApplNo', 'SubmissionType', 'SubmissionNo']
dfs = [df_prod, df_subm, df_appl, df_act, df_applty]
names = ['df_prod', 'df_subm', 'df_appl', 'df_act', 'df_applty']

# Text forms a missing value can take after astype(str): 'nan' from float
# NaN, '<NA>' from an already-converted Int64 column (re-running this cell),
# plus 'None' and the empty string.
missing_tokens = ['nan', '<NA>', 'None', '']

for df, name in zip(dfs, names):
    if 'ApplNo' in df.columns:
        # Normalise to stripped text so stray whitespace does not break parsing
        applno_text = df['ApplNo'].astype(str).str.strip()
        # Blank out the missing-value tokens explicitly. Series.replace(x, None)
        # is avoided because its meaning differs between pandas versions.
        applno_text = applno_text.mask(applno_text.isin(missing_tokens))
        # Parse to numbers, then store as nullable integer (keeps missing values)
        df['ApplNo'] = pd.to_numeric(applno_text).astype('Int64')
        print(f"{name} ApplNo dtype: {df['ApplNo'].dtype} sample: {df['ApplNo'].unique()[:5]}")

df_prod ApplNo dtype: Int64 sample: <IntegerArray>
[4, 159, 552, 734, 793]
Length: 5, dtype: Int64
df_subm ApplNo dtype: Int64 sample: <IntegerArray>
[4, 159, 415, 552, 654]
Length: 5, dtype: Int64
df_appl ApplNo dtype: Int64 sample: <IntegerArray>
[4782, 5010, 5213, 5378, 5619]
Length: 5, dtype: Int64
df_act ApplNo dtype: Int64 sample: <IntegerArray>
[17866, 71450, 17514, 14716, 18276]
Length: 5, dtype: Int64
df_applty ApplNo dtype: Int64 sample: <IntegerArray>
[4, 159, 552, 734, 793]
Length: 5, dtype: Int64


In [17]:
# Inputs (loaded from the workbooks of the same name in an earlier cell):
#   df_prod   <- Products.xlsx
#   df_subm   <- Submissions.xlsx
#   df_applty <- Applications.xlsx
#   df_appl   <- ApplicationDocs.xlsx
#   df_act    <- Join_Submission_ActionTypes_Lookup.xlsx

# Columns used as the join key for the last two merges
submission_keys = ['ApplNo', 'SubmissionType', 'SubmissionNo']

# The first two merges join on ApplNo alone; every merge is an outer join,
# so rows without a match on either side are kept.
merged1 = pd.merge(df_prod, df_subm, how='outer', on='ApplNo')
merged2 = pd.merge(merged1, df_applty, how='outer', on='ApplNo')
merged3 = pd.merge(merged2, df_appl, how='outer', on=submission_keys)
merged4 = pd.merge(merged3, df_act, how='outer', on=submission_keys)

output_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4.xlsx"
# Create the "merge" folder if needed; to_excel does not create it
os.makedirs(os.path.dirname(output_path), exist_ok=True)
merged4.to_excel(output_path, index=False)
print(f"Merged data saved to {output_path}")

Merged data saved to F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4.xlsx


In [1]:
import pandas as pd

# Source workbook (the merged table) and where the cleaned copy is written
file_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4.xlsx"
output_file_path = r"F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_modified.xlsx"

df = pd.read_excel(file_path)

# Parse each date column as datetime, then keep only the calendar date
for date_col in ('SubmissionStatusDate', 'ApplicationDocsDate'):
    df[date_col] = pd.to_datetime(df[date_col]).dt.date

# Write the modified table to a new workbook, leaving the input untouched
df.to_excel(output_file_path, index=False)

print("Dates cleaned and saved to:", output_file_path)

Dates cleaned and saved to: F:\PhD\RA\Schafer\IRA\data\unzipped\merge\merged4_modified.xlsx
