In [1]:
import pandas as pd

In [3]:
# Define the file path
file_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\combined_2_cleaned.xlsx"

# Load the Excel file
df = pd.read_excel(file_path)

# Define the columns to keep and their new names
columns_to_keep = {
    "Citeline Drug ID": "drugid",
    "Generic Drug Name": "drugprimaryname",
    "Drug Names": "drugnamesynonyms",
    "Event Date": "eventdate",
    "Event Type": "eventtype",
    "Event Details": "eventdetails",
    "Drug Type": "origin",
    "NCE": "nce",
    "Therapeutic Class": "therapeuticclasses",
    "Mechanism Of Action": "mechanismsofaction"
}

# Filter and rename the columns
df_cleaned = df[list(columns_to_keep.keys())].rename(columns=columns_to_keep)

# Backup original eventdate column
df_cleaned["eventdate_original"] = df_cleaned["eventdate"]

# Convert eventdate to datetime format (assuming it's in YYYY/MM/DD format)
df_cleaned["eventdate"] = pd.to_datetime(df_cleaned["eventdate"], format="%Y/%m/%d", errors="coerce")

# Convert to YYYY-MM-DD format
df_cleaned["eventdate"] = df_cleaned["eventdate"].dt.strftime("%Y-%m-%d")

# Define the output file path
output_file_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\cleaned_data2.xlsx"

# Save the cleaned data to a new Excel file
df_cleaned.to_excel(output_file_path, index=False)

print(f"Cleaned data saved to {output_file_path}")


Cleaned data saved to F:\PhD\RA\Schafer\IRA\data\transformed\cleaned_data2.xlsx


In [4]:
# Define file paths
input_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\transformed_data1.xlsx"
output_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\cleaned_data1.xlsx"

# Load the dataset
df = pd.read_excel(input_file)

# Rename the specified columns
df.rename(columns={
    "Event Date": "eventdate",
    "Event Type": "eventtype",
    "Event Details": "eventdetails"
}, inplace=True)

# Define the columns to keep
columns_to_keep = [
    "drugid",
    "drugprimaryname",
    "drugnamesynonyms",
    "eventdate",
    "eventtype",
    "eventdetails",
    "origin",
    "nce",
    "therapeuticclasses",
    "mechanismsofaction"
]

# Filter the dataset to keep only these columns
df_cleaned = df[columns_to_keep].copy()  # Use .copy() to avoid modifying original df

# Backup original eventdate column
df_cleaned["eventdate_original"] = df_cleaned["eventdate"]

# Convert `eventdate` to datetime format, handling "T00:00:00Z" issue
df_cleaned["eventdate"] = df_cleaned["eventdate"].astype(str).str.replace(r"T\d{2}:\d{2}:\d{2}Z", "", regex=True)
df_cleaned["eventdate"] = pd.to_datetime(df_cleaned["eventdate"], errors="coerce")  # Convert to datetime
df_cleaned["eventdate"] = df_cleaned["eventdate"].dt.strftime("%Y-%m-%d")  # Standardize format

# Save the cleaned data to a new file
df_cleaned.to_excel(output_file, index=False)

print(f"Cleaned data saved to {output_file}")

Cleaned data saved to F:\PhD\RA\Schafer\IRA\data\transformed\cleaned_data1.xlsx


In [5]:
# Define file paths
file1 = r"F:\PhD\RA\Schafer\IRA\data\transformed\cleaned_data1.xlsx"
file2 = r"F:\PhD\RA\Schafer\IRA\data\transformed\cleaned_data2.xlsx"
output_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all.xlsx"

# Load both cleaned datasets
df1 = pd.read_excel(file1)
df2 = pd.read_excel(file2)

# Append the datasets
df_combined = pd.concat([df1, df2], ignore_index=True)

# Save the combined data
df_combined.to_excel(output_file, index=False)

print(f"Combined cleaned data saved to {output_file}")

Combined cleaned data saved to F:\PhD\RA\Schafer\IRA\data\transformed\all.xlsx


## Data Reformating

In [8]:
# Define file paths
input_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all.xlsx"
output_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_formatted.xlsx"
check_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\check.xlsx"

# Load the dataset
df = pd.read_excel(input_file)

# Ensure 'eventdate' remains a datetime type (Only if necessary)
# if not pd.api.types.is_datetime64_any_dtype(df["eventdate"]):
#    df["eventdate"] = pd.to_datetime(df["eventdate"], errors='coerce')

# Standardize 'nce' column: Yes → 1, No → 0, Missing → 0
df["nce"] = df["nce"].replace({"Yes": 1, "No": 0}).fillna(0).astype(int)

# Create 'bio' column: 1 if 'origin' contains "biological" (case insensitive), else 0
df["bio"] = df["origin"].astype(str).str.contains("biological", case=False, na=False).astype(int)

# Create 'pass' column (1 if eventdate >= 2021-11-19, else 0)
df["pass"] = (df["eventdate"] >= "2021-11-19").astype(int)

# Create 'effective' column (1 if eventdate >= 2022-08-16, else 0)
df["effective"] = (df["eventdate"] >= "2022-08-16").astype(int)

# Save the modified dataset
df.to_excel(check_file, index=False)

print(f"Formatted data saved to {check_file}")

  df["nce"] = df["nce"].replace({"Yes": 1, "No": 0}).fillna(0).astype(int)


Formatted data saved to F:\PhD\RA\Schafer\IRA\data\transformed\check.xlsx


In [9]:
# Find all drug IDs where eventtype contains "approval" or "launch" (case insensitive)
matching_drug_ids = df[df["eventtype"].astype(str).str.contains("approval|launch", case=False, na=False)]["drugid"].unique()

# Filter the dataset to keep only rows with those drug IDs
df_filtered = df[df["drugid"].isin(matching_drug_ids)]

# Save the filtered dataset
df_filtered.to_excel(output_file, index=False)

print(f"Filtered dataset saved to {output_file}, keeping all rows for drugs with 'approval' or 'launch' events.")

Filtered dataset saved to F:\PhD\RA\Schafer\IRA\data\transformed\all_formatted.xlsx, keeping all rows for drugs with 'approval' or 'launch' events.


In [13]:
import re

# Define file paths
input_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_formatted.xlsx"
output_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_disease.xlsx"

# Load the dataset
df = pd.read_excel(input_file)


# Function to count diseases correctly
def count_diseases(row):
    if row["eventtype"] != "New Disease":
        return 0
    
    event_details = str(row["eventdetails"]).strip().rstrip(";")

    if not event_details or event_details.lower() == "nan":
        return 1

    # Split on actual delimiters only
    parts = re.split(r"\s*(?:;|&| and )\s*", event_details)
    parts = [p.strip() for p in parts if p.strip()]

    return len(parts)
    
# Apply function to create `diseasenum` column
df["diseasenum"] = df.apply(count_diseases, axis=1)

def count_usapp(row):
    if row["eventtype"] != "Supplemental Approval":
        return 0  # Only process Supplemental Approval events

    details = str(row["eventdetails"])

    # Check if "US;" or "USA;" is contained anywhere
    if "US;" not in details and "USA;" not in details:
        return 0

    # Clean and normalize the string
    details = details.strip().rstrip(";")

    # Count semicolons
    semicolon_count = details.count(";")

    if semicolon_count <= 1:
        return 1
    else:
        return semicolon_count - 1  # Subtract 1 for the "US;" or "USA;" presence

df["usapp"] = df.apply(count_usapp, axis=1)

# Step 1: Get First Approval date
first_approval = (
    df[df["eventtype"] == "First Approval"]
    .groupby("drugid")["eventdate"]
    .min()
    .reset_index()
    .rename(columns={"eventdate": "first_approval_date"})
)

# Step 2: Get First Launch date
first_launch = (
    df[df["eventtype"] == "First Launch"]
    .groupby("drugid")["eventdate"]
    .min()
    .reset_index()
    .rename(columns={"eventdate": "first_launch_date"})
)

# Step 3: Merge both into main dataframe
df = df.merge(first_approval, on="drugid", how="left")
df = df.merge(first_launch, on="drugid", how="left")

# Step 4: Define fallback first_approval_date (use Launch if Approval is missing)
df["first_approval_or_launch"] = df["first_approval_date"].combine_first(df["first_launch_date"])

# Step 5: Create `postapp` flag
df["postapp"] = (df["eventdate"] > df["first_approval_or_launch"]).astype(int)

# Save the filtered dataset
df.to_excel(output_file, index=False)

print(f"Filtered dataset saved to {output_file}, contains dieases number and post indicator")

Filtered dataset saved to F:\PhD\RA\Schafer\IRA\data\transformed\all_disease.xlsx, contains dieases number and post indicator


In [15]:
input_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_disease.xlsx"

# Load the dataset
df = pd.read_excel(input_file)

# Step 1: Create `phaseii` column
df["phaseii"] = df["eventdetails"].astype(str).str.contains(r"phase\s*ii|phase\s*2", case=False, na=False).astype(int)

# Step 2: Create `phaseiii` column
df["phaseiii"] = df["eventdetails"].astype(str).str.contains(r"phase\s*iii|phase\s*3", case=False, na=False).astype(int)

# Step 3: Fix `phaseii` where `phaseiii = 1` to prevent double-counting
df.loc[df["phaseiii"] == 1, "phaseii"] = 0

In [17]:
output_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_complete.xlsx"
# Save the filtered dataset
df.to_excel(output_file, index=False)

print(f"Filtered dataset saved to {output_file}, contains dieases number and post indicator")

Filtered dataset saved to F:\PhD\RA\Schafer\IRA\data\transformed\all_complete.xlsx, contains dieases number and post indicator


In [3]:
import pandas as pd

# Step 1: Define file paths
input_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_complete.xlsx"
output_file = r"F:\PhD\RA\Schafer\IRA\data\check\5940.xlsx"

# Step 2: Read the input file
df = pd.read_excel(input_file)

# Step 3: Filter for drugid == 69763
df_filtered = df[df["drugid"] == 5940]

# Step 4: Save the filtered result
df_filtered.to_excel(output_file, index=False)

print(f"Filtered data for drugid 5940 saved to:\n{output_file}")

Filtered data for drugid 5940 saved to:
F:\PhD\RA\Schafer\IRA\data\check\5940.xlsx


In [3]:
import pandas as pd

# Step 1: Define file paths
input_file = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_complete.xlsx"
df = pd.read_excel(input_file)

# Filter rows with "Approval" or "Launch" in `eventtype` AND "US;" in `eventdetails`
us_approval_events = df[
    df["eventtype"].str.contains("Approval|Launch", case=True, na=False) &
    df["eventdetails"].str.contains("US", case=True, na=False)
]

# For each drugid, find the earliest matching `eventdate`
us_approval_dates = (
    us_approval_events.groupby("drugid")["eventdate"]
    .min()
    .reset_index()
    .rename(columns={"eventdate": "usapprovaldate"})
)

# Merge the result back into the original dataframe
df = df.merge(us_approval_dates, on="drugid", how="left")

def count_us_new_approval(row):
    if row["eventtype"] != "New Approval":
        return 0

    details = str(row["eventdetails"]).strip().rstrip(";")
    
    if "US;" not in details:
        return 0

    return details.count(";")

# Apply the function to create a new column
df["us_new_approval"] = df.apply(count_us_new_approval, axis=1)

# Column: has_us_approval
df["has_us_approval"] = ((df["us_new_approval"] + df["usapp"]) > 0).astype(int)

# Column: has_disease
df["has_disease"] = (df["diseasenum"] > 0).astype(int)

df["postapp"] = (df["eventdate"] > df["usapprovaldate"]).astype(int)

output_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_complete_with_usflags.xlsx"
df.to_excel(output_path, index=False)
print(f"✅ Updated dataset saved to: {output_path}")

✅ Updated dataset saved to: F:\PhD\RA\Schafer\IRA\data\transformed\all_complete_with_usflags.xlsx
