In [None]:
import pandas as pd

# CSV tables to profile, in the order they are reported.
file_list = [
    'high_confidence.csv',
    'product_adverse_effect.csv',
    'product_label.csv',
    'product_to_rxnorm.csv',
    'vocab_meddra_adverse_effect.csv',
    'vocab_rxnorm_ingredient.csv',
    'vocab_rxnorm_ingredient_to_product.csv',
    'vocab_rxnorm_product.csv'
]

for fname in file_list:
    print(f"\n\n===== {fname} =====")
    try:
        df = pd.read_csv(fname)
        print(f"Shape: {df.shape} (rows, columns)")
        print(f"Columns: {list(df.columns)}")
        print("Sample rows:")
        print(df.head(3))
        # Per-column stats are limited to columns whose name contains
        # 'id' or 'name' (case-sensitive substring test).
        for col in df.columns:
            if 'id' not in col and 'name' not in col:
                continue
            n_unique = df[col].nunique()
            if n_unique < 30:
                # Few distinct values: show the full frequency table.
                print(f"Value counts for {col}:")
                print(df[col].value_counts())
            else:
                # Many distinct values: only report how many there are.
                print(f"Unique values in {col}: {n_unique}")
        print('----------------')
    except Exception as err:
        # Any failure while loading or summarising a file is reported and
        # the loop moves on to the next file.
        print(f"Error reading {fname}: {err}")



In [None]:
import pandas as pd

# Load once: every table is read a single time and then indexed into plain
# dictionaries, so searches below never touch the DataFrames again.
df_label = pd.read_csv('product_label.csv')
df_adveff = pd.read_csv('product_adverse_effect.csv')
df_label2rx = pd.read_csv('product_to_rxnorm.csv')
df_rxprod = pd.read_csv('vocab_rxnorm_product.csv')
df_rxprod2ing = pd.read_csv('vocab_rxnorm_ingredient_to_product.csv')
df_rxing = pd.read_csv('vocab_rxnorm_ingredient.csv')
df_meddra = pd.read_csv('vocab_meddra_adverse_effect.csv')

# ---- STEP 1. Build key-value mappings (dictionaries) ----
# Product name → label_id(s)
name_to_label_ids = df_label.groupby('source_product_name')['label_id'].apply(list).to_dict()
# Label_id → product info (one dict of the remaining columns per label)
label_id_to_info = df_label.set_index('label_id').to_dict('index')

# Label_id → RxNorm Product ID
# NOTE(review): if a label_id appears on several rows, only the last row's
# product ID survives in this dict — confirm the table is one row per label.
label_id_to_rx = df_label2rx.set_index('label_id')['rxnorm_product_id'].to_dict()
# RxNorm Product ID → ingredient_id(s)
rxprod_to_ings = df_rxprod2ing.groupby('product_id')['ingredient_id'].apply(list).to_dict()
# Ingredient_id → ingredient name
ing_id_to_name = df_rxing.set_index('rxnorm_id')['rxnorm_name'].to_dict()

# Label_id → effect_meddra_id(s)
label_id_to_effects = df_adveff.groupby('product_label_id')['effect_meddra_id'].apply(list).to_dict()
# Meddra id → name
meddra_id_to_name = df_meddra.set_index('meddra_id')['meddra_name'].to_dict()

# Lowercased name → label_ids, for case-insensitive substring search.
# Product names that differ only by letter case collapse onto the same key,
# so their label_id lists are merged instead of the later one overwriting
# the earlier one (which would silently drop labels from search results).
lower_name_map = {}
for name, lid_list in name_to_label_ids.items():
    if not isinstance(name, str):
        continue  # non-string names cannot be lowercased; skip them
    lower_name_map.setdefault(name.lower(), []).extend(lid_list)

# ---- STEP 2. Fast search function ----
def _lookup_by_id(mapping, raw_id, default=None):
    """Look up ``raw_id`` in ``mapping`` regardless of how the ID is typed.

    IDs read from CSV may arrive as int, float (``317573.0``) or str
    (``'317573'``), and the dictionary keys may use a different one of those
    forms. The raw value, its integer form, the string of the integer form
    and the plain string form are tried in that order.

    Args:
        mapping: dictionary to search.
        raw_id: ID value exactly as stored in the source table.
        default: value returned when no candidate key is present.

    Returns:
        The first non-None value found, otherwise ``default``. Never raises
        for missing (None/NaN) or non-numeric IDs.
    """
    candidates = [raw_id]
    try:
        as_int = int(raw_id)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric strings have no integer form.
        pass
    else:
        candidates.extend((as_int, str(as_int)))
    candidates.append(str(raw_id))
    for key in candidates:
        value = mapping.get(key)
        if value is not None:
            return value
    return default


def get_drug_info_fast(query_name):
    """Print label metadata, ingredients and adverse effects for matching products.

    The query is lowercased and matched as a substring against the keys of
    ``lower_name_map``; every label_id stored under a matching name is reported.

    Args:
        query_name: text to search for inside product names (str).

    Returns:
        None. All results are printed; "No match found for ..." is printed
        when no product name contains the query.
    """
    q = query_name.lower()
    # Find all matching product names (partial, case-insensitive).
    matches = [name for name in lower_name_map if q in name]

    found_any = False
    for name in matches:
        for label_id in lower_name_map[name]:
            found_any = True
            info = label_id_to_info[label_id]
            print(f"\n=== {info['source_product_name']} ===")
            print(dict(info))

            # RxNorm product → ingredient names.
            rxprod = label_id_to_rx.get(label_id)
            # A NaN product ID is truthy, so it must be tested explicitly;
            # otherwise a label with no mapping would be reported as having
            # an empty ingredient list.
            if rxprod is None or pd.isna(rxprod) or not rxprod:
                print("No RxNorm product mapped.")
            else:
                in_ids = _lookup_by_id(rxprod_to_ings, rxprod, default=[])
                in_names = [_lookup_by_id(ing_id_to_name, iid) for iid in in_ids]
                print("Ingredients:", in_names)

            # Adverse effects; IDs without a vocabulary entry show as None.
            effect_ids = label_id_to_effects.get(label_id, [])
            effect_names = [_lookup_by_id(meddra_id_to_name, eid) for eid in effect_ids]
            print("Adverse Effects:", effect_names[:10])  # Avoid printing a huge list

    if not found_any:
        print("No match found for", query_name)

# Example usage: the query is lowercased and matched as a substring of the
# lowercased product names, so partial names work.
get_drug_info_fast("atorvastatin")   # English
#get_drug_info_fast("アトルバスタチン") # Japanese



In [9]:
import pandas as pd

# Each CSV is read from the working directory and immediately turned into the
# dictionary used for lookups, so every table sits next to its index.

# Product labels: label_id -> dict of the remaining label columns.
df_label = pd.read_csv('product_label.csv')
label_id_to_info = df_label.set_index('label_id').to_dict(orient='index')

# Label -> RxNorm product: label_id -> rxnorm_product_id.
df_label2rx = pd.read_csv('product_to_rxnorm.csv')
label_id_to_rx = (
    df_label2rx
    .set_index('label_id')['rxnorm_product_id']
    .to_dict()
)

# RxNorm product vocabulary: rxnorm_id -> rxnorm_name.
df_rxprod = pd.read_csv('vocab_rxnorm_product.csv')
rxprod_id_to_name = (
    df_rxprod
    .set_index('rxnorm_id')['rxnorm_name']
    .to_dict()
)

# Product -> ingredients: product_id -> list of ingredient_id.
df_rxprod2ing = pd.read_csv('vocab_rxnorm_ingredient_to_product.csv')
rxprod_to_ings = (
    df_rxprod2ing
    .groupby('product_id')['ingredient_id']
    .apply(list)
    .to_dict()
)

# RxNorm ingredient vocabulary: rxnorm_id -> rxnorm_name.
df_rxing = pd.read_csv('vocab_rxnorm_ingredient.csv')
ing_id_to_name = (
    df_rxing
    .set_index('rxnorm_id')['rxnorm_name']
    .to_dict()
)

# Label -> adverse effects: product_label_id -> list of effect_meddra_id.
df_adveff = pd.read_csv('product_adverse_effect.csv')
label_id_to_effects = (
    df_adveff
    .groupby('product_label_id')['effect_meddra_id']
    .apply(list)
    .to_dict()
)

# MedDRA vocabulary: meddra_id -> meddra_name.
df_meddra = pd.read_csv('vocab_meddra_adverse_effect.csv')
meddra_id_to_name = (
    df_meddra
    .set_index('meddra_id')['meddra_name']
    .to_dict()
)

def _lookup_by_id(mapping, raw_id, default=None):
    """Look up ``raw_id`` in ``mapping`` regardless of how the ID is typed.

    Defined in this cell so the cell stays runnable on its own. IDs read from
    CSV may arrive as int, float (``317573.0``) or str (``'317573'``), and the
    dictionary keys may use a different one of those forms. The raw value, its
    integer form, the string of the integer form and the plain string form
    are tried in that order.

    Args:
        mapping: dictionary to search.
        raw_id: ID value exactly as stored in the source table.
        default: value returned when no candidate key is present.

    Returns:
        The first non-None value found, otherwise ``default``. Never raises
        for missing (None/NaN) or non-numeric IDs.
    """
    candidates = [raw_id]
    try:
        as_int = int(raw_id)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinities and non-numeric strings have no integer form.
        pass
    else:
        candidates.extend((as_int, str(as_int)))
    candidates.append(str(raw_id))
    for key in candidates:
        value = mapping.get(key)
        if value is not None:
            return value
    return default


def get_info_by_label_id(label_id):
    """Print everything known about one product label.

    Reports the label metadata, the mapped RxNorm product (ID and name), the
    product's ingredient names and up to ten adverse-effect names.

    Args:
        label_id: key into ``label_id_to_info``.

    Returns:
        None. Output is printed; a "No info found" message is printed when
        the label_id is not present.
    """
    info = label_id_to_info.get(label_id)
    # Test for absence explicitly: an existing label whose metadata dict is
    # empty is still a valid label.
    if info is None:
        print(f"No info found for label_id {label_id}")
        return

    print(f"\n=== Product Label ID: {label_id} ===")
    print("Product Label Metadata:")
    print(info)

    # RxNorm Product (prints None for the name when the label is unmapped).
    rxprod_id = label_id_to_rx.get(label_id)
    print("RxNorm Product ID:", rxprod_id)
    rxprod_name = _lookup_by_id(rxprod_id_to_name, rxprod_id)
    print("RxNorm Product Name:", rxprod_name)

    # Ingredients
    ing_ids = _lookup_by_id(rxprod_to_ings, rxprod_id, default=[])
    ingredients = [_lookup_by_id(ing_id_to_name, iid) for iid in ing_ids]
    print("Ingredients:", ingredients)

    # Adverse Effects: show at most ten names; IDs without a vocabulary
    # entry appear as None.
    effect_ids = label_id_to_effects.get(label_id, [])
    effect_names = [_lookup_by_id(meddra_id_to_name, eid) for eid in effect_ids]
    if len(effect_names) > 10:
        print("Adverse Effects:", effect_names[:10], f"...({len(effect_names)} total)")
    else:
        print("Adverse Effects:", effect_names[:10])

# Example usage:
get_info_by_label_id(2)  # Replace 2 with the label_id you want to look up



  df_adveff = pd.read_csv('product_adverse_effect.csv')



=== Product Label ID: 2 ===
Product Label Metadata:
{'source': 'JP', 'source_product_name': 'ピオグリタゾン錠15mg「武田テバ」 (後発品)\nピオグリタゾン錠30mg「武田テバ」 (後発品)', 'source_product_id': '00068619', 'source_label_url': 'https://www.kegg.jp/medicus-bin/japic_med?japic_code=00068619'}
RxNorm Product ID: 317573
RxNorm Product Name: pioglitazone 15 MG Oral Tablet
Ingredients: ['pioglitazone']
Adverse Effects: [None, None, 'Cardiac failure', 'Cardiac failure', 'Oedema', None, None, 'Cardiac failure', 'Shortness of breath', 'Palpitations'] ...(123 total)
