In [2]:
import pandas as pd

# Define file paths (adjust as needed)
part_d_path = r"F:\PhD\RA\Schafer\IRA\data\part\Part_D_Spending_by_Drug_2022.csv"
part_b_path = r"F:\PhD\RA\Schafer\IRA\data\part\Part_B_Spending_by_Drug_2022.csv"

# Load the datasets
df_part_d = pd.read_csv(part_d_path)
df_part_b = pd.read_csv(part_b_path)


In [3]:
# Filter to keep only rows where Mftr_Name includes "overall"
df_part_d = df_part_d[df_part_d['Mftr_Name'].str.contains("overall", case=False, na=False)]

df_part_d_top = df_part_d.copy()

df_part_d_top['single'] = (df_part_d_top['Tot_Mftr'] == 1).astype(int)

# Step 2: Sort by spending
df_part_d_top = df_part_d_top.sort_values(by='Tot_Spndng_2022', ascending=False)

# Step 3: Mark top 50 and top 100
df_part_d_top['dt50'] = 0
df_part_d_top['dt100'] = 0

# Step 3: Get the subset with single == 1
df_single = df_part_d_top[df_part_d_top['single'] == 1].copy()

# Step 4: Sort single subset by spending
df_single = df_single.sort_values(by='Tot_Spndng_2022', ascending=False)

# Step 5: Assign dt50 and dt100 for the top spending drugs in the single group
top50_indices = df_single.index[:50]
top100_indices = df_single.index[:100]

df_part_d_top.loc[top50_indices, 'dt50'] = 1
df_part_d_top.loc[top100_indices, 'dt100'] = 1

# Step 4: Keep relevant columns and rename
df_part_d_top = df_part_d_top[['Brnd_Name', 'Gnrc_Name', 'Tot_Spndng_2022', 'dt50', 'dt100', 'single']]
df_part_d_top = df_part_d_top.rename(columns={'Tot_Spndng_2022': 'd2022'})

# Optional: Reset index for clean display
df_part_d_top.reset_index(drop=True, inplace=True)

In [4]:
output_path = r"F:\PhD\RA\Schafer\IRA\data\part\check_single.csv"
df_part_d_top.to_csv(output_path, index=False)

In [5]:
# Step 1: Filter to only drugs with spending > $200 million
df_part_b_top = df_part_b.copy()

# Step 2: Sort by total spending, descending
df_part_b_top = df_part_b_top.sort_values(by='Tot_Spndng_2022', ascending=False)

# Step 3: Mark top 50 and top 100
df_part_b_top['bt50'] = 0
df_part_b_top['bt100'] = 0

df_part_b_top.loc[df_part_b_top.index[:50], 'bt50'] = 1
df_part_b_top.loc[df_part_b_top.index[:100], 'bt100'] = 1

# Step 4: Keep only selected columns and rename
df_part_b_top = df_part_b_top[['Brnd_Name', 'Gnrc_Name', 'Tot_Spndng_2022', 'bt50', 'bt100']]
df_part_b_top = df_part_b_top.rename(columns={'Tot_Spndng_2022': 'b2022'})

# Optional: Reset index for clean display
df_part_b_top.reset_index(drop=True, inplace=True)

In [6]:
df_part_b_top['Brnd_Name'] = df_part_b_top['Brnd_Name'].str.replace('*', '', regex=False)
df_part_b_top['Gnrc_Name'] = df_part_b_top['Gnrc_Name'].str.replace('*', '', regex=False)
df_part_d_top['Brnd_Name'] = df_part_d_top['Brnd_Name'].str.replace('*', '', regex=False)
df_part_d_top['Gnrc_Name'] = df_part_d_top['Gnrc_Name'].str.replace('*', '', regex=False)
df_part_b_top['partb'] = 1
df_part_d_top['partd'] = 1

In [7]:
print("📦 Part B Top Count:", len(df_part_b_top))
print("💊 Part D Top Count:", len(df_part_d_top))

📦 Part B Top Count: 635
💊 Part D Top Count: 3575


In [8]:
# Step 1: Create lowercase versions of brand names
df_part_b_top['Brnd_Name_lc'] = df_part_b_top['Brnd_Name'].str.lower().str.strip()
df_part_d_top['Brnd_Name_lc'] = df_part_d_top['Brnd_Name'].str.lower().str.strip()

# Step 2: Merge on the lowercase version
merged_df = pd.merge(df_part_b_top, df_part_d_top, on='Brnd_Name_lc', how='outer', suffixes=('_b', '_d'))

# Drop one of the duplicate generic names (keep Part D if it exists, otherwise B)
merged_df['Gnrc_Name'] = merged_df['Gnrc_Name_d'].combine_first(merged_df['Gnrc_Name_b'])

# Drop one of the duplicate brand names (keep Part D if it exists, otherwise B)
merged_df['Brnd_Name'] = merged_df['Brnd_Name_d'].combine_first(merged_df['Brnd_Name_b'])

# Drop extra columns
merged_df.drop(columns=['Gnrc_Name_b', 'Gnrc_Name_d', 'Brnd_Name_lc', 'Brnd_Name_b', 'Brnd_Name_d'], inplace=True)

cols_to_fill = ['b2022', 'bt50', 'bt100', 'partb', 'd2022', 'dt50', 'dt100', 'partd', 'single']
merged_df[cols_to_fill] = merged_df[cols_to_fill].fillna(0)

In [9]:
output_path = r"F:\PhD\RA\Schafer\IRA\data\part\check_combined.csv"
merged_df.to_csv(output_path, index=False)

In [10]:
df_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\all_accu.dta"
df = pd.read_stata(df_path)
original_df = pd.read_stata(df_path)

In [11]:
import re

# Define a regex to remove leading/trailing special characters and spaces
clean_pattern = r"^[\s\*\-;:,\.]+|[\s\*\-;:,\.]+$"

# Apply to both brand and generic names
merged_df['Brnd_Name'] = merged_df['Brnd_Name'].str.replace(clean_pattern, '', regex=True)
merged_df['Gnrc_Name'] = merged_df['Gnrc_Name'].str.replace(clean_pattern, '', regex=True)

In [12]:
df['drugprimaryname_lc'] = df['drugprimaryname'].str.lower().fillna('')
df['drugnamesynonyms_lc'] = df['drugnamesynonyms'].str.lower().fillna('')
merged_df['Brnd_Name_lc'] = merged_df['Brnd_Name'].str.lower().fillna('')

# Initialize the columns if not already in df
df['matched_name'] = None
df['b2022'] = 0.0
df['d2022'] = 0.0
df['partb'] = 0
df['partd'] = 0
df['bt50'] = 0
df['bt100'] = 0
df['dt50'] = 0
df['dt100'] = 0
df['single'] = 0

# Loop through merged_df to assign values to all matched rows in df
for _, row in merged_df.iterrows():
    bname = row['Brnd_Name_lc']
    
    # Match where brand name appears in primary or synonym
    mask = (
        (df['drugprimaryname_lc'] == bname) |
        (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
    )
    
    df.loc[mask, 'matched_name'] = row['Brnd_Name']
    df.loc[mask, 'b2022'] = row['b2022']
    df.loc[mask, 'd2022'] = row['d2022']
    df.loc[mask, 'partb'] = row['partb']
    df.loc[mask, 'partd'] = row['partd']
    df.loc[mask, 'bt50'] = row['bt50']
    df.loc[mask, 'bt100'] = row['bt100']
    df.loc[mask, 'dt50'] = row['dt50']
    df.loc[mask, 'dt100'] = row['dt100']
    df.loc[mask, 'single'] = row['single']

  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contains(bname, na=False))
  (df['drugnamesynonyms_lc'].str.contain

In [13]:
# Step 1: Drop rows without any matched name
df_matched = df[df['matched_name'].notna()].copy()

df_matched = df_matched.sort_values(by=['matched_name', 'appyear', 'drugid'])
df_min_year = df_matched.loc[df_matched.groupby('matched_name')['appyear'].idxmin()]

df_best_match = df_min_year[[
    'drugid', 'matched_name', 'b2022', 'd2022', 'partb', 'partd',
    'bt50', 'bt100', 'dt50', 'dt100', 'single'
]].copy()

final_df = original_df.merge(df_best_match, on='drugid', how='left')

In [14]:
matched_counts = final_df.groupby('matched_name')['drugid'].nunique().reset_index()
matched_counts = matched_counts.rename(columns={'drugid': 'unique_drugid_count'})
duplicates = matched_counts[matched_counts['unique_drugid_count'] > 1]
total_matched = len(matched_counts)
with_duplicates = len(duplicates)

print(f"Total matched_name entries: {total_matched}")
print(f"matched_names mapped to multiple drugids: {with_duplicates}")

Total matched_name entries: 1358
matched_names mapped to multiple drugids: 0


In [15]:
import pandas as pd

output_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\full_drug_data_with_matches.xlsx"

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    final_df.to_excel(writer, index=False)

# Optional

In [None]:
import pandas as pd

df_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\full_drug_data_life.xlsx"
df = pd.read_excel(df_path)  # read_xlsx is deprecated, use read_excel

In [None]:
drug_list = [
    "Ozempic", "Rybelsus", "Wegovy",
    "Trelegy Ellipta", "Xtandi", "Pomalyst", "Ibrance", "Ofev",
    "Linzess", "Calquence", "Austedo", "Austedo XR",
    "Breo Ellipta", "Tradjenta", "Xifaxan", "Vraylar",
    "Janu met", "Janumet XR", "Otezla"
]

# Normalize both lists to lowercase for comparison
matched_names_lower = df['matched_name'].dropna().str.lower().unique()
drug_list_lower = [drug.lower() for drug in drug_list]

# Find which drugs from the list are matched
matched_in_list = [drug for drug in drug_list_lower if drug in matched_names_lower]

# Output
print(f"✅ Matched drugs: {matched_in_list}")
print(f"🔢 Count of matched drugs: {len(matched_in_list)}")

In [None]:
import pandas as pd

df_path = r"F:\PhD\RA\Schafer\IRA\data\transformed\full_drug_data_with_matches.xlsx"
df = pd.read_excel(df_path)  # read_xlsx is deprecated, use read_excel

In [None]:
# Load only needed columns to avoid memory issues
use_cols = ['drugid', 'eventdetails', 'therapeuticclasses']
df = pd.read_excel(df_path, usecols=use_cols)

# Combine text columns into one for unified search
df['combined_text'] = (df['eventdetails'].fillna('') + ' ' + df['therapeuticclasses'].fillna('')).str.lower()

# Define keywords to look for (partial word matches)
keywords = ['diabet', 'asthma', 'copd', 'prostat', 'cancer', 'oncolog', 'tumor']

# Create the flag column
pattern = '|'.join(keywords)
df['disease_flag'] = df['combined_text'].str.contains(pattern, case=False, na=False).astype(int)

# Reduce to drugid-level flag (if any row for a drugid is flagged, mark it 1)
df_disease_flag = df.groupby('drugid')['disease_flag'].max().reset_index()

In [None]:
# Merge the disease flag back into the original dataframe
df_full = pd.read_excel(df_path)  # load full original data
df_full = df_full.merge(df_disease_flag, on='drugid', how='left')

# Fill missing flags with 0 (in case some drugids weren't matched)
df_full['disease_flag'] = df_full['disease_flag'].fillna(0).astype(int)

In [None]:
output_path = r"D:\PhD\RA\Schafer\IRA\data\transformed\full_drug_data_with_matches.xlsx"
df_full.to_excel(output_path, index=False)