In [None]:
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
def load_rxnconso(file_path):
    """
    Load the RXNCONSO.RRF file into a pandas DataFrame.

    Args:
    file_path (str): The path to the RXNCONSO.RRF file.

    Returns:
    pd.DataFrame: A DataFrame containing the RXNCONSO data.
    """
    column_names = [
        "RXCUI",
        "LAT",
        "TS",
        "LUI",
        "STT",
        "SUI",
        "ISPREF",
        "RXAUI",
        "SAUI",
        "SCUI",
        "SDUI",
        "SAB",
        "TTY",
        "CODE",
        "STR",
        "SRL",
        "SUPPRESS",
        "CVF",
    ]
    df = pd.read_csv(file_path, sep="|", names=column_names, index_col=False)
    return df


rxnconso_df = load_rxnconso("../RxNorm_eval/data/RXNCONSO.RRF")

rxnconso_df.head()

In [None]:
def load_rxnrel(file_path):
    """
    Load the RXNREL.RRF file into a pandas DataFrame.

    Args:
    file_path (str): The path to the RXNREL.RRF file.

    Returns:
    pd.DataFrame: A DataFrame containing the RXNREL data.
    """
    column_names = [
        "RXCUI1",
        "RXAUI1",
        "STYPE1",
        "REL",
        "RXCUI2",
        "RXAUI2",
        "STYPE2",
        "RELA",
        "RUI",
        "SRUI",
        "SAB",
        "SL",
        "DIR",
        "RG",
        "SUPPRESS",
        "CVF",
    ]
    df = pd.read_csv(file_path, sep="|", names=column_names, index_col=False)
    return df


# Load the data
rxnrel_df = load_rxnrel("../RxNorm_eval/data/RXNREL.RRF")

rxnrel_df.head(20)

#### Term types information

Click the drop down toggle below for more information on the term types used in this task.

<details>
<summary>REL</summary>

| Column Number | Column   | Description                                                                                                                                                                                                                                                                                                                |
| ------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| 1             | RXCUI1   | Unique identifier of first concept                                                                                                                                                                                                                                                                                         |
| 2             | RXAUI1   | Unique identifier for first atom                                                                                                                                                                                                                                                                                           |
| 3             | STYPE1   | The name of the column in RXNCONSO.RRF that contains the identifier used for the first concept or first atom in source of the relationship (e.g., 'AUI' or 'CUI').                                                                                                                                                         |
| 4             | REL      | Relationship of second concept or atom to first concept or atom                                                                                                                                                                                                                                                            |
| 5             | RXCUI2   | Unique identifier of second concept                                                                                                                                                                                                                                                                                        |
| 6             | RXAUI2   | Unique identifier for second atom                                                                                                                                                                                                                                                                                          |
| 7             | STYPE2   | The name of the column in RXNCONSO.RRF that contains the identifier used for the second concept or second atom in the source of the relationship (e.g., 'AUI' or 'CUI').                                                                                                                                                   |
| 8             | RELA     | Additional (more specific) relationship label (optional)                                                                                                                                                                                                                                                                   |
| 9             | RUI      | Unique identifier for relationship                                                                                                                                                                                                                                                                                         |
| 10            | SRUI     | Source asserted relationship identifier, if present (no value provided)                                                                                                                                                                                                                                                    |
| 11            | SAB      | Abbreviation of the source of relationship                                                                                                                                                                                                                                                                                 |
| 12            | SL       | Source of relationship labels (no value provided)                                                                                                                                                                                                                                                                          |
| 13            | RG☨      | Machine generated and unverified indicator (optional). RG values will occupy column 14 using the database load scripts provided below.                                                                                                                                                                                     |
| 14            | DIR☦     | Source asserted directionality flag. (no value provided) Y indicates that this is the direction of the relationship in its source; N indicates that it is not; a blank indicates that it is not important or has not yet been determined. DIR values will occupy column 13 using the database load scripts provided below. |
| 15            | SUPPRESS | Suppressible flag. Values = Y, E, or N. Reflects the suppressible status of the relationship; not yet in use. See also SUPPRESS in MRCONSO.RRF and MRDEF.RRF and MRREL.RRF in the UMLS Reference Manual.                                                                                                                   |
| 16            | CVF      | Content                                                                                                                                                                                                                                                                                                                    |

☨ RG values will occupy column 14 using the database load scripts provided below.

☦ DIR values will occupy column 13 using the database load scripts provided below.

</details>

<details>
<summary>CONSO</summary>
The unique term types (TTY) in the RxNorm database represent various types of names and identifiers for drugs and related concepts. Here is an explanation of what each abbreviation means:

- **PT - Preferred Term**: The main name for a concept, which is usually the most commonly used name.
- **FN - Fully Specified Name**: A complete and detailed name for a concept.
- **SY - Synonym**: An alternative name for a concept.
- **BN - Brand Name**: The proprietary name given by a manufacturer.
- **IN - Ingredient Name**: The generic name for a drug ingredient.
- **SU - Substance**: A more general term than an ingredient, often used for chemicals and compounds.
- **GN - Generic Name**: The non-proprietary name assigned to a drug or drug component.
- **FSY - Formulated Synonym**: A synonym specific to a particular formulation.
- **MS - Multum Specific Drug Name**: A name specific to Multum, a drug database.
- **PIN - Precise Ingredient**: A more specific term for a particular ingredient.
- **TMSY - Tall Man Lettering Synonym**: A synonym that uses Tall Man lettering to reduce confusion with similar drug names.
- **SYGB - Synonym Generated by Algorithm**: A synonym generated by an algorithm.
- **RXN_PT - RxNorm Preferred Term**: A preferred term specific to RxNorm.
- **DF - Dose Form**: The form in which the drug is administered, such as tablet, capsule, etc.
- **PTGB - Preferred Term Generated by Algorithm**: A preferred term generated by an algorithm.
- **RXN_IN - RxNorm Ingredient Name**: An ingredient name specific to RxNorm.
- **CDA - Clinical Drug Alias**: An alias for a clinical drug.
- **CDC - Clinical Drug Component**: A component of a clinical drug.
- **CDD - Clinical Drug Description**: A description of a clinical drug.
- **SC - Semantic Clinical Drug Component**: A clinical drug component with semantic information.
- **AB - Abbreviation**: A short form or abbreviation for a concept.
- **CD - Clinical Drug**: A drug concept used in a clinical setting.
- **ET - Entry Term**: A term used for entry in a database.
- **BD - Brand Drug**: A branded version of a drug.
- **MIN - Multiple Ingredient**: A drug that contains multiple ingredients.
- **SCDF - Semantic Clinical Dose Form**: The dose form of a clinical drug with semantic information.
- **SBDF - Semantic Branded Dose Form**: The dose form of a branded drug with semantic information.
- **SCD - Semantic Clinical Drug**: A clinical drug concept with semantic information.
- **DP - Display Name**: A name used for display purposes.
- **PSN - Precise Synonym**: A synonym that is more precise than a general synonym.
- **SBD - Semantic Branded Drug**: A branded drug concept with semantic information.
- **SCDC - Semantic Clinical Drug Component**: A clinical drug component with detailed semantic information.
- **MTH_RXN_DP - Metathesaurus RxNorm Display Name**: A display name from the Metathesaurus specific to RxNorm.
- **MTH_RXN_CD - Metathesaurus RxNorm Clinical Drug**: A clinical drug concept from the Metathesaurus specific to RxNorm.
- **MTH_RXN_BD - Metathesaurus RxNorm Brand Drug**: A brand drug concept from the Metathesaurus specific to RxNorm.
- **MTH_RXN_CDC - Metathesaurus RxNorm Clinical Drug Component**: A clinical drug component from the Metathesaurus specific to RxNorm.
- **SBDC - Semantic Branded Drug Component**: A branded drug component with detailed semantic information.
- **GPCK - Generic Pack**: A generic version of a pack (a combination of drugs or components).
- **BPCK - Branded Pack**: A branded version of a pack (a combination of drugs or components).
- **DFG - Dose Form Group**: A group of dose forms.
- **SCDG - Semantic Clinical Dose Group**: A dose form group with semantic information specific to clinical drugs.
- **SBDG - Semantic Branded Dose Group**: A dose form group with semantic information specific to branded drugs.
- **SCDFP - Semantic Clinical Dose Form Precise**: A precise dose form of a clinical drug with semantic information.
- **SBDFP - Semantic Branded Dose Form Precise**: A precise dose form of a branded drug with semantic information.
- **SCDGP - Semantic Clinical**
</details>


## Join rxnrel_df with rxnconso_df


In [None]:
# Filter rxnconso_df for IN and BN entries
rxnconso_filtered = rxnconso_df[rxnconso_df["TTY"].isin(["IN", "PT", "BN"])]
# len(rxnconso_filtered) # 150k
rxnconso_filtered.head()

#### Join on RXCUI


In [None]:
# Join rxnrel_df with rxnconso_filtered for RXCUI1
rxnrel_df = rxnrel_df.merge(
    rxnconso_filtered[["RXCUI", "TTY", "STR"]],
    left_on="RXCUI1",
    right_on="RXCUI",
    how="left",
    suffixes=("", "_tmp1"),
)
rxnrel_df.rename(columns={"TTY": "TTY1", "STR": "STR1"}, inplace=True)

# Join rxnrel_df with rxnconso_filtered for RXCUI2
rxnrel_df = rxnrel_df.merge(
    rxnconso_filtered[["RXCUI", "TTY", "STR"]],
    left_on="RXCUI2",
    right_on="RXCUI",
    how="left",
    suffixes=("", "_tmp2"),
)
rxnrel_df.rename(columns={"TTY": "TTY2", "STR": "STR2"}, inplace=True)

#### Handle Missing RXCUI by Joining on RXAUI


In [None]:
# Fill missing TTY1 and STR1 using RXAUI1
rxnrel_df.loc[rxnrel_df["TTY1"].isnull(), ["TTY1", "STR1"]] = (
    rxnrel_df.loc[rxnrel_df["TTY1"].isnull()]
    .merge(
        rxnconso_filtered[["RXAUI", "TTY", "STR"]],
        left_on="RXAUI1",
        right_on="RXAUI",
        how="left",
    )[["TTY", "STR"]]
    .values
)

# Fill missing TTY2 and STR2 using RXAUI2
rxnrel_df.loc[rxnrel_df["TTY2"].isnull(), ["TTY2", "STR2"]] = (
    rxnrel_df.loc[rxnrel_df["TTY2"].isnull()]
    .merge(
        rxnconso_filtered[["RXAUI", "TTY", "STR"]],
        left_on="RXAUI2",
        right_on="RXAUI",
        how="left",
    )[["TTY", "STR"]]
    .values
)

In [None]:
## Filter for relation of ingredient_of or tradename_of
rxnrel_df = rxnrel_df[rxnrel_df["RELA"].isin(["tradename_of"])]

rxnrel_df.head(20)

### Get Pairs


In [None]:
# Filter for 'has_tradename_of'
tradename_pairs = rxnrel_df[rxnrel_df["RELA"] == "tradename_of"][["STR1", "STR2"]]
tradename_pairs.rename(
    columns={"STR1": "Ingredient", "STR2": "Tradename"}, inplace=True
)

print("Pairs for 'has_tradename_of length: ", len(tradename_pairs))
tradename_pairs.head(50)

### Remove duplicates


In [None]:
# Normalize the strings by stripping spaces and converting to lowercase
tradename_pairs["Ingredient_norm"] = (
    tradename_pairs["Ingredient"].str.lower().str.strip()
)
tradename_pairs["Tradename_norm"] = tradename_pairs["Tradename"].str.lower().str.strip()

In [None]:
# Drop duplicates based on normalized columns
tradename_pairs_deduped = tradename_pairs.drop_duplicates(
    subset=["Ingredient_norm", "Tradename_norm"]
)

# rename to index, generic, brand (need to reorder columns)
tradename_pairs_deduped = tradename_pairs_deduped[
    ["Ingredient_norm", "Tradename_norm"]
].reset_index(drop=True)

tradename_pairs_deduped.rename(
    columns={"Ingredient_norm": "generic", "Tradename_norm": "brand"}, inplace=True
)

print(f"Unique pairs after removing duplicates: {len(tradename_pairs_deduped)}")
tradename_pairs_deduped.head(25)

In [None]:
# Fill NaN values with an empty string
tradename_pairs_deduped["generic"].fillna("", inplace=True)
tradename_pairs_deduped["brand"].fillna("", inplace=True)


# Drop rows where generic or brand is empty
initial_len = len(tradename_pairs_deduped)
tradename_pairs_deduped = tradename_pairs_deduped[
    (tradename_pairs_deduped["generic"] != "")
    & (tradename_pairs_deduped["brand"] != "")
].reset_index(drop=True)

rows_dropped = initial_len - len(tradename_pairs_deduped)
print(f"Rows dropped where generic or brand is empty: {rows_dropped}")

# Overlapping keywords


In [None]:
overlapping_words = [
    "today",
    "thrive",
    "program",
    "react",
    "perform",
    "tomorrow"
    # GBaker/MedQA-USMLE-4-options-hf - brand_to_generic
    "bronchial",
    "copd",
    "duration",
    "matrix",
    "blockade",
    "sustain",
    "overtime",
    "android",
    "suppressor",
    "nephron",
    # GBaker/MedQA-USMLE-4-options-hf - generic_to_brand
    "alcohol",
    "liver",
    "thyroid",
    "potassium",
    "prothrombin",
    "alanine",
    "water",
    # augtoma/usmle_step_1 - brand_to_generic
    # augtoma/usmle_step_1 - generic_to_brand
    "oxygen",
    "liver",
    "alcohol",
    "water",
    "thryroid",
    "peanut",
    # augtoma/usmle_step_2 - brand_to_generic
    # augtoma/usmle_step_2 - generic_to_brand
    "alcohol",
    "urea",
    "nitrogen",
    "thryroid",
    "liver",
    # augtoma/usmle_step_3 - brand_to_generic
    # hails/mmlu_no_train/anatomy - brand_to_generic
    "matrix",
    # hails/mmlu_no_train/anatomy - generic_to_brand
    "liver",
    "thryroid",
    "nitrogen",
    "acetylcholine",
    # hails/mmlu_no_train/clinical_knowledge - brand_to_generic
    "duration",
    "sustain",
    "copd",
    # hails/mmlu_no_train/clinical_knowledge - generic_to_brand
    "lactate",
    "water",
    "liver",
    "alcohol",
    "urea",
    "thyroid",
    # hails/mmlu_no_train/college_medicine - brand_to_generic
    "sustain",
    "nephron",
    "rid",
    "excel",
    # hails/mmlu_no_train/college_medicine - generic_to_brand
    "oxygen",
    "glucose",
    "water",
    "lactate",
    "urea",
    "potassium",
    "alcohol",
    "arginine",
    "glutamine",
    "testosterone",
    "tyrosine",
    "ethanol",
    "progesterone",
    "isoleucine",
    "liver",
    "choline",
    "glycine",
    "glutamate",
    "nitrogen",
    "amylase",
    "alanine",
    "acetylcholine",
    "leucine",
    # hails/mmlu_no_train/medical_genetics - brand_to_generic
    "suppressor",
    # hails/mmlu_no_train/medical_genetics - generic_to_brand
    "phenylalanine",
    "liver",
    # hails/mmlu_no_train/professional_medicine - brand_to_generic
    # hails/mmlu_no_train/professional_medicine - generic_to_brand
    "oxygen",
    "thryoid",
    "liver",
    "alcohol",
    "water",
    "amylase",
    "potassium",
    # hails/mmlu_no_train/college_biology - brand_to_generic
    "matrix",
    "keystone",
    "symmetry",
    # hails/mmlu_no_train/college_biology - generic_to_brand
    "water",
    "nitrogen",
    "starch",
    "sulfur",
    "glucose",
    "alcohol",
    "phosphorus",
    "tyrosine",
    "cysteine",
    "acetylcholine",
    "sucrose",
    "ethanol",
    "urea",
    "oxygen",
]

In [None]:
# Convert generic and brand to string
tradename_pairs_deduped["generic"] = tradename_pairs_deduped["generic"].astype(str)
tradename_pairs_deduped["brand"] = tradename_pairs_deduped["brand"].astype(str)


# Exclusion criteria function
def is_excluded(name, overlapping_words):
    # Check for multiple words
    if len(name.split()) > 1:
        return True
    # Check for special characters (except spaces and hyphens)
    if any(char for char in name if not char.isalnum() and char not in {" ", "-"}):
        return True
    # Check for numbers
    if any(char.isdigit() for char in name):
        return True
    # Check for "obsolete" or "withdrawn"
    if (
        "obsolete" in name.lower()
        or "withdrawn" in name.lower()
        or "discontinued" in name.lower()
        or "containing" in name.lower()
    ):
        return True
    # Check for overlapping words
    if any(word in name.lower() for word in overlapping_words):
        return True
    return False


# Apply exclusion criteria to both columns
filtered_tradename_pairs = tradename_pairs_deduped[
    ~tradename_pairs_deduped["generic"].apply(is_excluded, args=(overlapping_words,))
    & ~tradename_pairs_deduped["brand"].apply(is_excluded, args=(overlapping_words,))
]

print("Original pairs length: ", len(tradename_pairs_deduped))
print("Filtered pairs length: ", len(filtered_tradename_pairs))

### Create brand_to_generic csv and generic_to_brand csv

- brand to generic should search for every brand name and return a single generic name
- generic to brand we should search for every generic name and return a single brand name

- brand to generic is many to one
- generic to brand is one to many

Therefore we will create two csvs, where for generic to brand we will filter the dataset to include only a random brand name for each generic name.


In [None]:
# print number unique generic values
print(
    f"Number of unique generic values: {filtered_tradename_pairs['generic'].nunique()}"
)
print(f"Number of unique brand values: {filtered_tradename_pairs['brand'].nunique()}")

In [None]:
medmcqa = [
    "isoflurane",
    "cyproterone",
    "tamsulosin",
    "tetracycline",
    "acetazolamide",
    "cevimeline",
    "salbutamol",
    "albendazole",
    "prednisolone",
    "pentazocine",
    "ceftazidime",
    "ampicillin",
    "norgestimate",
    "eptifibatide",
    "metronidazole",
    "clonidine",
    "didanosine",
    "docusate",
    "fenfluramine",
    "amikacin",
    "lamivudine",
    "modafinil",
    "ifosfamide",
    "dextran",
    "pioglitazone",
    "lidocaine",
    "methylprednisolone",
    "octreotide",
    "desflurane",
    "ibuprofen",
    "cilastatin",
    "ofloxacin",
    "ethambutol",
    "secretin",
    "ketorolac",
    "copper",
    "cefotaxime",
    "tizanidine",
    "imiquimod",
    "tolbutamide",
    "estramustine",
    "vincristine",
    "paracetamol",
    "oxybutynin",
    "digoxin",
    "dapsone",
    "carteolol",
    "remifentanil",
    "hydrocortisone",
    "sulindac",
    "fomepizole",
    "pancuronium",
    "nevirapine",
    "probenecid",
    "bosentan",
    "griseofulvin",
    "edta",
    "tetanus",
    "emtricitabine",
    "halothane",
    "quetiapine",
    "trypsin",
    "midazolam",
    "rasburicase",
    "dihydroxyacetone",
    "promethazine",
    "amoxapine",
    "glutaraldehyde",
    "famotidine",
    "etoposide",
    "trihexyphenidyl",
    "adapalene",
    "loratadine",
    "mebendazole",
    "physostigmine",
    "daptomycin",
    "mefloquine",
    "panitumumab",
    "cobalamin",
    "aprepitant",
    "ezetimibe",
    "topotecan",
    "benzocaine",
    "carvedilol",
    "deoxycholate",
    "desmopressin",
    "carbamide",
    "codeine",
    "triamcinolone",
    "ropinirole",
    "podophyllin",
    "nitisinone",
    "prasugrel",
    "doxepin",
    "ritonavir",
    "glycerin",
    "sitagliptin",
    "triclosan",
    "alfentanil",
    "framycetin",
    "cabergoline",
    "methotrexate",
    "nitrofurantoin",
    "tacrolimus",
    "azathioprine",
    "fondaparinux",
    "zonisamide",
    "nesiritide",
    "omalizumab",
    "vancomycin",
    "zidovudine",
    "entacapone",
    "anastrozole",
    "lenalidomide",
    "procainamide",
    "tadalafil",
    "histamine",
    "imatinib",
    "retinol",
    "hyaluronidase",
    "ramipril",
    "acetate",
    "ketamine",
    "phenol",
    "cellulose",
    "radium",
    "cisatracurium",
    "biotin",
    "omeprazole",
    "dipivefrine",
    "edrophonium",
    "morphine",
    "gabapentin",
    "bethanechol",
    "misoprostol",
    "procaine",
    "quinine",
    "frovatriptan",
    "cefaclor",
    "apraclonidine",
    "pindolol",
    "bupivacaine",
    "repaglinide",
    "olanzapine",
    "quinidine",
    "collagenase",
    "vecuronium",
    "ciclesonide",
    "malathion",
    "bromocriptine",
    "vorapaxar",
    "indomethacin",
    "enalapril",
    "teicoplanin",
    "fluorescein",
    "cetuximab",
    "acyclovir",
    "ethionamide",
    "timolol",
    "methysergide",
    "clotrimazole",
    "desogestrel",
    "taurine",
    "cisplatin",
    "naloxone",
    "trimethoprim",
    "levodopa",
    "paromomycin",
    "sorbitol",
    "glucagon",
    "amitriptyline",
    "pralidoxime",
    "spironolactone",
    "pepsin",
    "tocilizumab",
    "furosemide",
    "pethidine",
    "teriparatide",
    "tamoxifen",
    "cytarabine",
    "erythropoietin",
    "methylphenidate",
    "tazarotene",
    "dibucaine",
    "nefazodone",
    "theophylline",
    "basiliximab",
    "aspartate",
    "chlorpheniramine",
    "stanozolol",
    "exenatide",
    "nystatin",
    "celecoxib",
    "hydralazine",
    "linezolid",
    "doxorubicin",
    "betamethasone",
    "resorcinol",
    "mesalazine",
    "chlorothiazide",
    "thrombin",
    "melatonin",
    "epinephrine",
    "prazosin",
    "mirtazapine",
    "ephedrine",
    "suxamethonium",
    "dextrose",
    "goserelin",
    "doripenem",
    "norepinephrine",
    "sacubitril",
    "fluoxetine",
    "clarithromycin",
    "abatacept",
    "adenosine",
    "pemoline",
    "nelfinavir",
    "cocaine",
    "levamisole",
    "allopurinol",
    "acarbose",
    "glucosamine",
    "clomiphene",
    "fluconazole",
    "papaverine",
    "pramlintide",
    "bisoprolol",
    "acebutolol",
    "abciximab",
    "trimipramine",
    "terbinafine",
    "naproxen",
    "naltrexone",
    "pectin",
    "mannitol",
    "fulvestrant",
    "raloxifene",
    "loperamide",
    "tropicamide",
    "epoprostenol",
    "benazepril",
    "phenytoin",
    "topiramate",
    "zafirlukast",
    "econazole",
    "tryptophan",
    "nimodipine",
    "bisacodyl",
    "sevoflurane",
    "ranitidine",
    "fructose",
    "amiodarone",
    "orlistat",
    "thyroxine",
    "levofloxacin",
    "ubiquinone",
    "gallamine",
    "clozapine",
    "trospium",
    "alprazolam",
    "ipratropium",
    "ciprofloxacin",
    "ceftriaxone",
    "minocycline",
    "flumazenil",
    "flutamide",
    "tetrabenazine",
    "tolterodine",
    "metformin",
    "chlorpromazine",
    "lamotrigine",
    "primidone",
    "ropivacaine",
    "pirenzepine",
    "ticagrelor",
    "vasopressin",
    "fenoldopam",
    "pyrazinamide",
    "tolazoline",
    "ambenonium",
    "miltefosine",
    "salmeterol",
    "brimonidine",
    "carboplatin",
    "propofol",
    "simvastatin",
    "pegvisomant",
    "ibutilide",
    "rizatriptan",
    "dinoprostone",
    "orciprenaline",
    "itraconazole",
    "bimatoprost",
    "leflunomide",
    "clopidogrel",
    "calcitriol",
    "zileuton",
    "valproate",
    "metoprolol",
    "icatibant",
    "phentolamine",
    "dicloxacillin",
    "trifluoperazine",
    "pegloticase",
    "foscarnet",
    "ticlopidine",
    "sufentanil",
    "plasminogen",
    "sertraline",
    "atracurium",
    "mepivacaine",
    "desipramine",
    "pramipexole",
    "cefotetan",
    "paba",
    "prochlorperazine",
    "butorphanol",
    "acitretin",
    "baclofen",
    "diphenoxylate",
    "atropine",
    "indinavir",
    "phenoxybenzamine",
    "colchicine",
    "niclosamide",
    "celiprolol",
    "fluphenazine",
    "perphenazine",
    "carbamazepine",
    "chlorhexidine",
    "nifurtimox",
    "beclomethasone",
    "rifampin",
    "procarbazine",
    "neostigmine",
    "fentanyl",
    "amlodipine",
    "penicillamine",
    "methoxyflurane",
    "nadolol",
    "isotretinoin",
    "flecainide",
    "sildenafil",
    "ziprasidone",
    "etonogestrel",
    "diphenhydramine",
    "apixaban",
    "azithromycin",
    "clindamycin",
    "lorcaserin",
    "gallium",
    "moxifloxacin",
    "carbidopa",
    "chloroprocaine",
    "diltiazem",
    "captopril",
    "sulfamethoxazole",
    "ketoconazole",
    "paclitaxel",
    "lecithin",
    "capecitabine",
    "fibrinogen",
    "insulin",
    "nafarelin",
    "verapamil",
    "alogliptin",
    "cimetidine",
    "estriol",
    "chloroquine",
    "bendamustine",
    "nandrolone",
    "dorzolamide",
    "betaxolol",
    "clomipramine",
    "terazosin",
    "sulfasalazine",
    "olsalazine",
    "enflurane",
    "imipramine",
    "acetone",
    "gatifloxacin",
    "carboprost",
    "rocuronium",
    "risedronate",
    "irbesartan",
    "selegiline",
    "dehydroepiandrosterone",
    "naratriptan",
    "triprolidine",
    "mivacurium",
    "letrozole",
    "vigabatrin",
    "senna",
    "nitroprusside",
    "yohimbine",
    "miconazole",
    "pentamidine",
    "ceftizoxime",
    "cephalothin",
    "stavudine",
    "tirofiban",
    "alprostadil",
    "novobiocin",
    "cyclosporine",
    "atenolol",
    "nicotine",
    "budesonide",
    "hypochlorite",
    "triptorelin",
    "sumatriptan",
    "clofazimine",
    "liothyronine",
    "doxycycline",
    "methocarbamol",
    "chlorpropamide",
    "mupirocin",
    "amphetamine",
    "ivermectin",
    "terbutaline",
    "thiamine",
    "cycloserine",
    "chlorambucil",
    "formoterol",
    "erythromycin",
    "cinnarizine",
    "methadone",
    "phenylbutazone",
    "diazepam",
    "latanoprost",
    "danazol",
    "nizatidine",
    "efavirenz",
    "tibolone",
    "dobutamine",
    "esmolol",
    "oseltamivir",
    "docetaxel",
    "histidine",
    "warfarin",
    "etomidate",
    "nifedipine",
    "cyclophosphamide",
    "ulipristal",
    "proguanil",
    "dexamethasone",
    "propranolol",
    "isoniazid",
    "montelukast",
    "pilocarpine",
    "metyrapone",
    "metoclopramide",
    "piroxicam",
    "aripiprazole",
    "belladonna",
    "aspirin",
    "methyldopa",
    "cefoperazone",
    "bran",
    "amantadine",
    "mifepristone",
    "trimethaphan",
    "saquinavir",
    "oxytocin",
]

medqa = [
    "aliskiren",
    "isoflurane",
    "tetracycline",
    "acetazolamide",
    "rituximab",
    "salbutamol",
    "albendazole",
    "prednisolone",
    "ampicillin",
    "clavulanate",
    "phenobarbital",
    "metronidazole",
    "chlordiazepoxide",
    "ribavirin",
    "clonidine",
    "docusate",
    "phenylephrine",
    "ramelteon",
    "nitroglycerin",
    "lamivudine",
    "modafinil",
    "ifosfamide",
    "cefpodoxime",
    "pioglitazone",
    "darunavir",
    "lidocaine",
    "methylprednisolone",
    "octreotide",
    "desflurane",
    "dolutegravir",
    "ibuprofen",
    "ethambutol",
    "dihydroergotamine",
    "ketorolac",
    "copper",
    "cefotaxime",
    "vincristine",
    "oxybutynin",
    "digoxin",
    "dapsone",
    "dasatinib",
    "tazobactam",
    "hydrocortisone",
    "fomepizole",
    "pantoprazole",
    "edta",
    "griseofulvin",
    "tetanus",
    "caffeine",
    "dextroamphetamine",
    "emtricitabine",
    "quetiapine",
    "ergocalciferol",
    "midazolam",
    "rasburicase",
    "trihexyphenidyl",
    "etoposide",
    "calamine",
    "loratadine",
    "mebendazole",
    "physostigmine",
    "bacitracin",
    "mefloquine",
    "cobalamin",
    "miglitol",
    "carvedilol",
    "lorazepam",
    "tolvaptan",
    "codeine",
    "eplerenone",
    "triamcinolone",
    "bupropion",
    "ritonavir",
    "infliximab",
    "lansoprazole",
    "sitagliptin",
    "methotrexate",
    "nitrofurantoin",
    "azathioprine",
    "fondaparinux",
    "nesiritide",
    "zidovudine",
    "vancomycin",
    "liraglutide",
    "anastrozole",
    "histamine",
    "imatinib",
    "ramipril",
    "acetate",
    "ketamine",
    "phenol",
    "omeprazole",
    "lovastatin",
    "morphine",
    "gabapentin",
    "bethanechol",
    "losartan",
    "fexofenadine",
    "misoprostol",
    "cladribine",
    "midodrine",
    "scopolamine",
    "labetalol",
    "telmisartan",
    "olanzapine",
    "nortriptyline",
    "fenofibrate",
    "bromocriptine",
    "indomethacin",
    "enalapril",
    "acyclovir",
    "rivaroxaban",
    "methysergide",
    "deferoxamine",
    "cisplatin",
    "naloxone",
    "trimethoprim",
    "benztropine",
    "paromomycin",
    "sorbitol",
    "glucagon",
    "amitriptyline",
    "spironolactone",
    "allantoin",
    "furosemide",
    "pyrantel",
    "teriparatide",
    "tamoxifen",
    "erythropoietin",
    "methylphenidate",
    "theophylline",
    "lactulose",
    "aspartate",
    "trazodone",
    "exenatide",
    "methenamine",
    "hydralazine",
    "doxorubicin",
    "betamethasone",
    "prednisone",
    "oxaliplatin",
    "alendronate",
    "thrombin",
    "melatonin",
    "epinephrine",
    "prazosin",
    "hydrochlorothiazide",
    "dextrose",
    "goserelin",
    "citalopram",
    "meloxicam",
    "mepolizumab",
    "norepinephrine",
    "entecavir",
    "fluoxetine",
    "clarithromycin",
    "paroxetine",
    "adenosine",
    "escitalopram",
    "cocaine",
    "allopurinol",
    "fludarabine",
    "tiotropium",
    "lisinopril",
    "fluorouracil",
    "glucosamine",
    "glipizide",
    "cyproheptadine",
    "bisoprolol",
    "fluconazole",
    "pramlintide",
    "ganciclovir",
    "naproxen",
    "etanercept",
    "naltrexone",
    "mannitol",
    "disulfiram",
    "raloxifene",
    "loperamide",
    "phenytoin",
    "chlorthalidone",
    "tryptophan",
    "sevoflurane",
    "ranitidine",
    "ketotifen",
    "fructose",
    "tramadol",
    "amiodarone",
    "levofloxacin",
    "thyroxine",
    "donepezil",
    "clozapine",
    "oxybate",
    "oxycodone",
    "leucovorin",
    "alprazolam",
    "ipratropium",
    "galantamine",
    "sofosbuvir",
    "maraviroc",
    "ciprofloxacin",
    "succimer",
    "ceftriaxone",
    "thalidomide",
    "tolterodine",
    "metformin",
    "chlorpromazine",
    "lamotrigine",
    "primidone",
    "ertapenem",
    "ethinylestradiol",
    "vasopressin",
    "pyrazinamide",
    "empagliflozin",
    "salmeterol",
    "carboplatin",
    "propofol",
    "simvastatin",
    "romiplostim",
    "itraconazole",
    "atorvastatin",
    "clopidogrel",
    "calcitriol",
    "rivastigmine",
    "zileuton",
    "valproate",
    "oxymetazoline",
    "metoprolol",
    "phentolamine",
    "cromolyn",
    "dicloxacillin",
    "sevelamer",
    "plasminogen",
    "sertraline",
    "atracurium",
    "pramipexole",
    "cefotetan",
    "baclofen",
    "diphenoxylate",
    "atropine",
    "piperacillin",
    "duloxetine",
    "phenoxybenzamine",
    "povidone-iodine",
    "phenelzine",
    "colchicine",
    "fluticasone",
    "fluphenazine",
    "fluvastatin",
    "carbamazepine",
    "fluorometholone",
    "benzalkonium",
    "chlorhexidine",
    "sulfadiazine",
    "rifampin",
    "beclomethasone",
    "cephalexin",
    "neostigmine",
    "amlodipine",
    "enoxaparin",
    "moxonidine",
    "isotretinoin",
    "sildenafil",
    "diphenhydramine",
    "apixaban",
    "azithromycin",
    "pravastatin",
    "temazepam",
    "moxifloxacin",
    "riluzole",
    "diltiazem",
    "captopril",
    "sulfamethoxazole",
    "flucytosine",
    "valsartan",
    "lecithin",
    "simeprevir",
    "colesevelam",
    "aldesleukin",
    "fibrinogen",
    "rifaximin",
    "cilostazol",
    "insulin",
    "memantine",
    "verapamil",
    "diethylstilbestrol",
    "cimetidine",
    "gentamicin",
    "estriol",
    "sulbactam",
    "chloroquine",
    "febuxostat",
    "anakinra",
    "clomipramine",
    "imipramine",
    "demeclocycline",
    "ruxolitinib",
    "selegiline",
    "dehydroepiandrosterone",
    "nitroprusside",
    "alteplase",
    "hydromorphone",
    "cyclosporine",
    "atenolol",
    "nicotine",
    "budesonide",
    "hypochlorite",
    "sumatriptan",
    "streptokinase",
    "diclofenac",
    "chloramphenicol",
    "doxycycline",
    "voriconazole",
    "ivermectin",
    "isosorbide",
    "terbutaline",
    "thiamine",
    "hydroxychloroquine",
    "erythromycin",
    "methadone",
    "ketoprofen",
    "triazolam",
    "tetrahydrobiopterin",
    "diazepam",
    "rosuvastatin",
    "fludrocortisone",
    "efavirenz",
    "esmolol",
    "dobutamine",
    "oseltamivir",
    "warfarin",
    "triamterene",
    "nifedipine",
    "etomidate",
    "cyclophosphamide",
    "proguanil",
    "dexamethasone",
    "propranolol",
    "isoniazid",
    "montelukast",
    "metyrapone",
    "metoclopramide",
    "aspirin",
    "mesna",
    "amantadine",
    "povidone",
    "mifepristone",
    "dantrolene",
    "oxytocin",
]

In [None]:
all_generic = list(set(medmcqa + medqa))
len(all_generic)

In [None]:
sub = [
    "insulin",
    "fibrinogen",
    "copper",
    "aspirin",
    "ibuprofen" "plasminogen",
    "dehydroepiandrosterone",
    "aspartate",
    "adenosine",
    "histamine",
    "plasminogen",
    "acetate",
    "histidine",
    "bronchial",
    "cocaine",
    "amphetamine",
    "thiamine",
    "cellulose",
    "fibrinogen",
    "tetanus",
    "melatonin",
    "Radium",
    "Phenol",
    "Iodine",
]
# remove sub from all_generic
all_generic = [i for i in all_generic if i not in sub]
len(all_generic)

In [None]:
# Create brand to generic mapping (many to one)
brand_to_generic = filtered_tradename_pairs.copy()

# Create generic to brand mapping (one to many)
generic_to_brand = filtered_tradename_pairs.copy()

In [None]:
len(generic_to_brand), len(brand_to_generic)

In [None]:
# keep rows where generic is in all_generic
brand_to_generic = brand_to_generic[brand_to_generic["generic"].isin(all_generic)]
# keep rows where brand is in all_generic
generic_to_brand = generic_to_brand[generic_to_brand["generic"].isin(all_generic)]

In [None]:
len(brand_to_generic), len(generic_to_brand)

## Use API to check brand names

In [None]:
import openai
import json
from tqdm import tqdm


# Function to send a message to the OpenAI GPT model
def gpt_chat(model, message, temperature=0.3):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": message}],
        temperature=temperature,
    )
    return response.choices[0].message["content"]


# Initialize results list
results = []

# Loop through the brand and generic drug pairs
for brand, gen in tqdm(brand_to_generic[["brand", "generic"]].values):
    # Create the message
    message = f"Is drug {brand}'s main component {gen} and is it used mainly for humans? Please only answer Yes or No!"

    # Send the message to the model
    response = gpt_chat(model="gpt-4o", message=message, temperature=0.3)

    # Append the results to the list
    results.append(
        {"brand": brand, "generic": gen, "message": message, "response": response}
    )

    # Write results to file each iteration
    with open("results.json", "w") as f:
        json.dump(results, f)

In [None]:
results_df = pd.DataFrame(results)
len(results_df)

In [None]:
results_df["response"] = results_df["response"].str.lower().str.rstrip(".")
results_df

In [None]:
results_df = results_df[results_df["response"].str.lower() == "yes"]
len(results_df)

In [None]:
# make a new dataframe with only the brand and generic columns, where each generic map to a list of brand
generic_to_brand = (
    generic_to_brand.groupby("generic")["brand"].apply(list).reset_index()
)
len(generic_to_brand)

In [48]:
generic_to_brand["brand"] = [i[0] for i in generic_to_brand["brand"]]
generic_to_brand.head()

Unnamed: 0,generic,brand
0,abatacept,[orencia]
1,abciximab,[reopro]
2,acarbose,[precose]
3,acebutolol,[sectral]
4,acetazolamide,[diamox]


In [None]:
for brand, gen in tqdm(generic_to_brand[["brand", "generic"]].values):
    # Create the message
    message = f"generic: {gen} \n brand: {brand} \n please return a python list the best brand drugs that are not Combination drugs mainly contains the generic, make sure they are used for human."

In [53]:
import cohere

co = cohere.Client(
    api_key="",  # This is your trial API key
)
for brand, gen in tqdm(generic_to_brand[["brand", "generic"]].values):
    # Create the message
    message = f"generic: {gen} brand: {brand} \n please return a python list the best brand drugs that are not Combination drugs mainly contains the generic, make sure they are used for human."
    # Send the message to the model
    response = co.chat(
        model="command-r-plus",
        message=message,
        temperature=0.3,
        # connectors=[{"id":"web-search"}],
    )
    # Append the results to the list
    results.append(
        {"brand": brand, "generic": gen, "message": message, "response": response.text}
    )
    # write result in file each iteration
    with open("results_rag.json", "w") as f:
        json.dump(results, f)

100%|██████████| 581/581 [31:31<00:00,  3.26s/it]  


In [61]:
generic_to_brand.head()

Unnamed: 0,generic,brand
0,abatacept,[orencia]
1,abciximab,[reopro]
2,acarbose,[precose]
3,acebutolol,[sectral]
4,acetazolamide,[diamox]


In [68]:
rag_df = pd.DataFrame(results[-581:])
rag_df.to_csv("rag_results.csv", index=False)