In [22]:
import csv

import pandas as pd

# Raise pandas' display limits so wide UMLS tables are not elided when printed
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Directory that holds the UMLS Metathesaurus release files
umls_dir = "../UMLS/META"

# Concept names and sources; column reference:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/?report=objectonly
conso_file = "/".join((umls_dir, "MRCONSO.RRF"))
# Related concepts; column reference:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.related_concepts_file_mrrel_rrf/?report=objectonly
rel_file = "/".join((umls_dir, "MRREL.RRF"))
# Semantic type assignments per concept
mrsty_file = "/".join((umls_dir, "MRSTY.RRF"))

In [23]:
# Column layouts of the UMLS RRF files, in file order. The files are read with
# header=None, so the column names are supplied from these lists.
mrconso_columns = [
    "CUI",
    "LAT",
    "TS",
    "LUI",
    "STT",
    "SUI",
    "ISPREF",
    "AUI",
    "SAUI",
    "SCUI",
    "SDUI",
    "SAB",
    "TTY",
    "CODE",
    "STR",
    "SRL",
    "SUPPRESS",
    "CVF",
]

mrrel_columns = [
    "CUI1",
    "AUI1",
    "STYPE1",
    "REL",
    "CUI2",
    "AUI2",
    "STYPE2",
    "RELA",
    "RUI",
    "SRUI",
    "SAB",
    "SL",
    "RG",
    "DIR",
    "SUPPRESS",
    "CVF",
]

mrsty_columns = ["CUI", "TUI", "STN", "STY", "ATUI", "CVF"]

# Number of leading rows read from each file. Each file is truncated on its
# own, so the three samples need not cover the same CUIs and filters/joins
# across them can legitimately come back empty. Set to None to read the
# complete files (large: expect a long load and high memory use).
RRF_SAMPLE_ROWS = 100_000


def load_rrf(path, columns, nrows=RRF_SAMPLE_ROWS):
    """Read a pipe-delimited UMLS RRF file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the ``.RRF`` file.
    columns : list[str]
        Column names to assign, in file order (RRF files have no header row).
    nrows : int or None, default RRF_SAMPLE_ROWS
        Number of leading rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per file line, with the given column names.
    """
    return pd.read_csv(
        path,
        sep="|",
        header=None,
        names=columns,
        encoding="utf-8",
        nrows=nrows,
        # RRF rows end with a trailing "|"; index_col=False keeps pandas from
        # shifting the first field into the index because of that extra
        # delimiter.
        index_col=False,
        # RRF is an unquoted format and term strings can contain literal
        # double quotes; default CSV quoting would merge fields or rows.
        quoting=csv.QUOTE_NONE,
    )


# Concept names and sources
mrconso = load_rrf(conso_file, mrconso_columns)

# Relationships between concepts
mrrel = load_rrf(rel_file, mrrel_columns)

# Semantic types assigned to each concept
mrsty = load_rrf(mrsty_file, mrsty_columns)

In [24]:
def _print_head(title, frame):
    """Print a title, the first rows of ``frame``, and a dashed separator."""
    gap = "\n" * 2
    print(title)
    print(frame.head())
    print(gap)
    print("-" * 50)
    print(gap)


# Preview each loaded RRF table in turn
_print_head("MRCONSO.RRF", mrconso)
_print_head("MRREL.RRF", mrrel)
_print_head("MRSTY.RRF", mrsty)

MRCONSO.RRF
        CUI  LAT TS       LUI STT       SUI ISPREF        AUI  SAUI      SCUI     SDUI     SAB  TTY     CODE                                                STR  SRL SUPPRESS    CVF
0  C0000005  ENG  P  L0000005  PF  S0007492      Y  A26634265   NaN  M0019694  D012711     MSH  PEP  D012711                     (131)I-Macroaggregated Albumin    0        N  256.0
1  C0000005  ENG  S  L0270109  PF  S0007491      Y  A26634266   NaN  M0019694  D012711     MSH   ET  D012711                                         (131)I-MAA    0        N  256.0
2  C0000005  FRE  P  L6220710  PF  S7133957      Y  A13433185   NaN  M0019694  D012711  MSHFRE  PEP  D012711      Macroagrégats d'albumine marquée à l'iode 131    3        N    NaN
3  C0000005  FRE  S  L6215648  PF  S7133916      Y  A27488794   NaN  M0019694  D012711  MSHFRE   ET  D012711                                          MAA-I 131    3        N    NaN
4  C0000005  FRE  S  L6215656  PF  S7133956      Y  A27614225   NaN  M0019694  D012

## Side effects list

In [25]:
# Semantic type identifiers (TUIs) used to pick out drug and side-effect concepts.
# Presumably T121 = Pharmacologic Substance, T200 = Clinical Drug,
# T047 = Disease or Syndrome, T184 = Sign or Symptom - verify against the
# UMLS Semantic Network of the release in use.
drug_tuis = ["T121", "T200"]  # Add any other relevant T-codes for drugs
side_effect_tuis = ["T047", "T184"]

# Unique CUIs in MRSTY that carry one of the selected semantic types
drug_cuis = mrsty.loc[mrsty["TUI"].isin(drug_tuis), "CUI"].unique()
side_effect_cuis = mrsty.loc[mrsty["TUI"].isin(side_effect_tuis), "CUI"].unique()

print("Drug CUIs")
print(drug_cuis)

# RELA labels treated as "drug causes side effect" relationships
se_relationships = ["associated_with", "cause_of"]

# Keep MRREL rows whose CUI1 is a drug, whose CUI2 is a side effect, and whose
# RELA is one of the labels above.
# NOTE(review): the UMLS reference manual describes REL/RELA as the relationship
# of the *second* concept to the first (i.e. "CUI2 <RELA> CUI1"). If that holds
# for this release, the causing drug of a "cause_of" row would sit in CUI2, not
# CUI1 - confirm the intended direction before relying on these results.
is_drug_to_side_effect = (
    mrrel["CUI1"].isin(drug_cuis)
    & mrrel["CUI2"].isin(side_effect_cuis)
    & mrrel["RELA"].isin(se_relationships)
)
# Take an independent copy so that adding columns to the result never writes
# into (or warns about) a slice of mrrel.
drug_side_effect_rels = mrrel.loc[is_drug_to_side_effect].copy()
print(drug_side_effect_rels.head())

Drug CUIs
['C0000005' 'C0000039' 'C0000096' ... 'C0136761' 'C0136764' 'C0136766']
Empty DataFrame
Columns: [CUI1, AUI1, STYPE1, REL, CUI2, AUI2, STYPE2, RELA, RUI, SRUI, SAB, SL, RG, DIR, SUPPRESS, CVF]
Index: []


In [26]:
# English-language rows of MRCONSO
mrconso_filtered = mrconso[(mrconso["LAT"] == "ENG")]

# A CUI usually has many English strings. Flag the rows marked as the preferred
# form (TS == "P", STT == "PF", ISPREF == "Y") so they can be chosen first.
is_preferred_name = (
    (mrconso_filtered["TS"] == "P")
    & (mrconso_filtered["STT"] == "PF")
    & (mrconso_filtered["ISPREF"] == "Y")
)

# Create a mapping from CUI to STR (name): one name per CUI, taking the
# preferred row when the loaded sample has one and otherwise falling back to
# the first English string for that CUI. The stable sort keeps file order
# within each group, so the choice is deterministic.
cui_to_name = (
    mrconso_filtered.assign(_is_preferred=is_preferred_name)
    .sort_values("_is_preferred", ascending=False, kind="stable")
    .drop_duplicates(subset="CUI", keep="first")
    .set_index("CUI")["STR"]
    .to_dict()
)

# Attach readable names for both ends of each relationship. assign() returns a
# new frame, so nothing is written into a slice of MRREL and re-running this
# cell gives the same result. Rows with no side effect name are dropped.
drug_side_effect_rels = drug_side_effect_rels.assign(
    **{
        "Drug Name": drug_side_effect_rels["CUI1"].map(cui_to_name),
        "Side Effect Name": drug_side_effect_rels["CUI2"].map(cui_to_name),
    }
).dropna(subset=["Side Effect Name"])

# Group side effects by drug. groupby skips rows whose "Drug Name" is NaN.
# sorted(set(...)) removes duplicates and fixes the order, so the exported CSV
# is identical from run to run.
side_effects_per_drug = (
    drug_side_effect_rels.groupby("Drug Name")["Side Effect Name"]
    .apply(lambda names: sorted(set(names)))
    .reset_index()
)

# Display the first few rows to verify
print(side_effects_per_drug.head(20))

# Save the side effects per drug to a CSV file
side_effects_per_drug.to_csv("side_effects_per_drug.csv", index=False)

Empty DataFrame
Columns: [Drug Name, Side Effect Name]
Index: []
