In [3]:
import csv

import pandas as pd

# Directory that holds the UMLS Metathesaurus RRF files
umls_dir = "../UMLS/META"

# MRCONSO (concept names and sources) column reference:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/?report=objectonly
conso_file = umls_dir + "/MRCONSO.RRF"
# MRREL (related concepts) column reference:
# https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.related_concepts_file_mrrel_rrf/?report=objectonly
rel_file = umls_dir + "/MRREL.RRF"
# MRSTY file, loaded later for semantic types
mrsty_file = umls_dir + "/MRSTY.RRF"

In [22]:
# Column labels for the pipe-delimited UMLS RRF files. The files are read with
# header=None, so these lists supply the DataFrame column names, in file order.
mrconso_columns = [
    "CUI",
    "LAT",
    "TS",
    "LUI",
    "STT",
    "SUI",
    "ISPREF",
    "AUI",
    "SAUI",
    "SCUI",
    "SDUI",
    "SAB",
    "TTY",
    "CODE",
    "STR",
    "SRL",
    "SUPPRESS",
    "CVF",
]

mrrel_columns = [
    "CUI1",
    "AUI1",
    "STYPE1",
    "REL",
    "CUI2",
    "AUI2",
    "STYPE2",
    "RELA",
    "RUI",
    "SRUI",
    "SAB",
    "SL",
    "RG",
    "DIR",
    "SUPPRESS",
    "CVF",
]

mrsty_columns = ["CUI", "TUI", "STN", "STY", "ATUI", "CVF"]

# Number of rows read from each file while prototyping.
# Pass nrows=None to load_rrf to read a whole file.
SAMPLE_ROWS = 1000


def load_rrf(path, columns, nrows=SAMPLE_ROWS):
    """Read one pipe-delimited UMLS RRF file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the .RRF file.
    columns : list of str
        Column labels to assign, in file order (the file has no header row).
    nrows : int or None, default SAMPLE_ROWS
        Maximum number of rows to read; None reads the entire file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the given column labels.
    """
    return pd.read_csv(
        path,
        sep="|",
        header=None,
        names=columns,
        encoding="utf-8",
        nrows=nrows,
        # RRF rows end with a trailing "|"; index_col=False stops pandas from
        # shifting the first field into the index because of that extra field.
        index_col=False,
        # RRF has no quoting convention: a double quote inside a term string is
        # literal text, so it must not be interpreted as a CSV quote character.
        quoting=csv.QUOTE_NONE,
        # Only truly empty fields are missing values; strings such as "NA" or
        # "None" are kept as text instead of being converted to NaN.
        keep_default_na=False,
        na_values=[""],
    )


# Load MRCONSO.RRF (concept names and sources)
mrconso = load_rrf(conso_file, mrconso_columns)

# Load MRREL.RRF (relationships between concepts)
mrrel = load_rrf(rel_file, mrrel_columns)

# Load MRSTY.RRF for semantic types
mrsty = load_rrf(mrsty_file, mrsty_columns)

In [23]:
# Preview the top rows of every loaded table, each followed by a divider line
table_previews = [
    ("MRCONSO.RRF", mrconso),
    ("MRREL.RRF", mrrel),
    ("MRSTY.RRF", mrsty),
]

for table_label, table in table_previews:
    print(table_label)
    print(table.head())
    print("\n" * 2)
    print("-" * 50)
    print("\n" * 2)

MRCONSO.RRF
        CUI  LAT TS       LUI STT       SUI ISPREF        AUI  SAUI      SCUI  \
0  C0000005  ENG  P  L0000005  PF  S0007492      Y  A26634265   NaN  M0019694   
1  C0000005  ENG  S  L0270109  PF  S0007491      Y  A26634266   NaN  M0019694   
2  C0000005  FRE  P  L6220710  PF  S7133957      Y  A13433185   NaN  M0019694   
3  C0000005  FRE  S  L6215648  PF  S7133916      Y  A27488794   NaN  M0019694   
4  C0000005  FRE  S  L6215656  PF  S7133956      Y  A27614225   NaN  M0019694   

      SDUI     SAB  TTY     CODE  \
0  D012711     MSH  PEP  D012711   
1  D012711     MSH   ET  D012711   
2  D012711  MSHFRE  PEP  D012711   
3  D012711  MSHFRE   ET  D012711   
4  D012711  MSHFRE   ET  D012711   

                                                 STR  SRL SUPPRESS    CVF  
0                     (131)I-Macroaggregated Albumin    0        N  256.0  
1                                         (131)I-MAA    0        N  256.0  
2      Macroagrégats d'albumine marquée à l'iode 131    

In [24]:
# Semantic type identifiers (TUIs) treated as side effects.
# See Semantic_network table SRDEF for the full list of codes.
#   T047 - Disease or Syndrome: A condition characterized by abnormal functioning of one or more systems or parts of an organism.
#   T184 - Sign or Symptom: Observable manifestation of a disease or condition experienced by the patient.
side_effect_tuis = ["T047", "T184"]

# Select the MRSTY rows carrying one of those TUIs and collect each CUI once
has_side_effect_tui = mrsty["TUI"].isin(side_effect_tuis)
side_effect_cuis = mrsty.loc[has_side_effect_tui, "CUI"].unique()

print(side_effect_cuis)

['C0000727' 'C0000729' 'C0000737' 'C0000744' 'C0000774' 'C0000809'
 'C0000814' 'C0000823' 'C0000833' 'C0000848' 'C0000880' 'C0000889'
 'C0001083' 'C0001125' 'C0001126' 'C0001127' 'C0001139' 'C0001142'
 'C0001144' 'C0001145' 'C0001163' 'C0001169' 'C0001175' 'C0001197'
 'C0001202' 'C0001206' 'C0001207' 'C0001231' 'C0001247' 'C0001249'
 'C0001255' 'C0001261' 'C0001263' 'C0001264' 'C0001265' 'C0001304'
 'C0001305' 'C0001306' 'C0001308' 'C0001309' 'C0001311' 'C0001314'
 'C0001327' 'C0001338' 'C0001339' 'C0001342' 'C0001344' 'C0001360'
 'C0001361' 'C0001363' 'C0001365' 'C0001396' 'C0001403' 'C0001416'
 'C0001427' 'C0001485' 'C0001486' 'C0001487' 'C0001519' 'C0001529'
 'C0001576' 'C0001577' 'C0001614' 'C0001621' 'C0001622' 'C0001623'
 'C0001627' 'C0001678' 'C0001684' 'C0001726' 'C0001727' 'C0001733'
 'C0001748' 'C0001752' 'C0001768' 'C0001787' 'C0001824' 'C0001828'
 'C0001849' 'C0001857' 'C0001860']


In [None]:
# RELA labels treated as causal links, listed as [forward label, inverse label].
# NOTE(review): confirm these exact labels occur in the RELA column of the loaded MRREL.
causative_relationships = ["causes", "caused_by"]
causes_label, caused_by_label = causative_relationships

# An MRREL row states the relationship that CUI2 has to CUI1, so the column
# holding the side effect depends on which of the two inverse labels is used:
#   RELA == "caused_by": CUI2 is caused by CUI1 -> the side effect is CUI2
#   RELA == "causes":    CUI2 causes CUI1       -> the side effect is CUI1
# Requiring the side effect in CUI2 for both labels would keep "causes" rows
# whose CUI2 is actually the cause rather than the effect.
# NOTE(review): side_effect_cuis is assigned in more than one cell of this
# notebook; the most recently executed assignment is the one used here.
effect_in_cui2 = mrrel["RELA"].eq(caused_by_label) & mrrel["CUI2"].isin(side_effect_cuis)
effect_in_cui1 = mrrel["RELA"].eq(causes_label) & mrrel["CUI1"].isin(side_effect_cuis)

# Rows linking a cause to a side-effect concept, in either stored orientation
drug_side_effect_rels = mrrel[effect_in_cui2 | effect_in_cui1]

In [7]:
# Semantic type names matched against the MRSTY "STY" column.
# Both lists are example selections; add more types if needed.
drug_semantic_types = ["Pharmacologic Substance"]
side_effect_semantic_types = ["Disease or Syndrome", "Finding"]

# Filter MRSTY to get the CUIs for drugs and for side effects.
# NOTE(review): this rebinds side_effect_cuis, which an earlier cell builds from
# TUI codes with .unique(); here it is a Series selected by STY name instead.
is_drug_row = mrsty["STY"].isin(drug_semantic_types)
is_side_effect_row = mrsty["STY"].isin(side_effect_semantic_types)
drug_cuis = mrsty.loc[is_drug_row, "CUI"]
side_effect_cuis = mrsty.loc[is_side_effect_row, "CUI"]

# Print each selection followed by blank lines and a divider
for cui_selection in (drug_cuis, side_effect_cuis):
    print(cui_selection)
    print("\n" * 2)
    print("-" * 50)

Series([], Name: CUI, dtype: object)



--------------------------------------------------
Series([], Name: CUI, dtype: object)



--------------------------------------------------


In [None]:
# RELA labels treated as causative relationships
# (same values as the list assigned in the earlier MRREL filtering cell)
causative_relationships = [
    "causes",
    "caused_by",
]