**Imports and Setup**

Install packages

In [3]:
!pip install -q torch torchvision transformers datasets pillow pandas scikit-learn tqdm matplotlib huggingface_hub

Imports

In [4]:
import os
import zipfile
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt
from google.colab import drive

Connetcting Dataset from drive

In [6]:
drive.mount('/content/drive')

#path of datasets
base_dir = "/content/drive/MyDrive/499A Datasets"

Mounted at /content/drive


Helper Function for safe extraction of Datasets

In [7]:
def safe_extract(zip_path, extract_to):
    """
    Safely extracts a single zip file with error handling.
    Works for datasets with normal folder structures (no nested zips).
    """
    if not os.path.exists(zip_path):
        print(f"File not found: {zip_path}")
        return False

    try:
        if not os.path.exists(extract_to):
            os.makedirs(extract_to, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)

        print(f"Extracted: {os.path.basename(zip_path)} → {extract_to}")
        return True

    except zipfile.BadZipFile:
        print(f"Error: Cannot extract {zip_path}. File may be corrupted.")
    except Exception as e:
        print(f"Unexpected error while extracting {zip_path}: {e}")

    return False

Optional: because of directory name cahnged

In [8]:
old_path = os.path.join(base_dir, "Dermnet (1)")
new_path = os.path.join(base_dir, "Dermnet")

# Check before renaming
if os.path.exists(old_path):
    if not os.path.exists(new_path):
        os.rename(old_path, new_path)
        print(f"Renamed '{old_path}' → '{new_path}'")
    else:
        print(f"Folder '{new_path}' already exists. Not renaming to avoid conflict.")
else:
    print(f"Folder not found: {old_path}")

Folder not found: /content/drive/MyDrive/499A Datasets/Dermnet (1)


Extracting zip files for preprocessing of datasets

In [11]:
skincon_zip = os.path.join(base_dir, "Skincon/skincon.zip")
dermnet_zip = os.path.join(base_dir, "Dermnet/dermnet.zip")
herb_zip    = os.path.join(base_dir, "Herb2.0/herb2.zip")

# Extraction folders
skincon_extract = os.path.join(base_dir, "Skincon/extracted")
dermnet_extract = os.path.join(base_dir, "Dermnet/extracted")
herb_extract    = os.path.join(base_dir, "Herb2.0/extracted")

# extraction
print("🔹 Starting extraction...\n")
safe_extract(skincon_zip, skincon_extract)
safe_extract(dermnet_zip, dermnet_extract)
safe_extract(herb_zip, herb_extract)

# Verify extracted folder structures
print("\nVerifying extracted datasets...")

for dataset_name, extract_path in [
    ("Skincon", skincon_extract),
    ("Dermnet", dermnet_extract),
    ("Herb2.0", herb_extract)
]:
    if os.path.exists(extract_path):
        print(f"\n{dataset_name} structure:")
        for root, dirs, _ in os.walk(extract_path):
            print(f"  {root}")
            for d in dirs[:5]:
                print(f"    {d}")
            break
    else:
        print(f"{dataset_name}: Extraction folder not found.")

🔹 Starting extraction...

Extracted: skincon.zip → /content/drive/MyDrive/499A Datasets/Skincon/extracted
Extracted: dermnet.zip → /content/drive/MyDrive/499A Datasets/Dermnet/extracted
Extracted: herb2.zip → /content/drive/MyDrive/499A Datasets/Herb2.0/extracted

Verifying extracted datasets...

Skincon structure:
  /content/drive/MyDrive/499A Datasets/Skincon/extracted

Dermnet structure:
  /content/drive/MyDrive/499A Datasets/Dermnet/extracted
    test
    train

Herb2.0 structure:
  /content/drive/MyDrive/499A Datasets/Herb2.0/extracted


# **Preprocessing**

**Dermnet Dataset**

In [12]:
def count_images_per_class(base_path):
    """
    Counts number of images per class inside train/ and test/ folders.
    """
    for subset in ['train', 'test']:
        subset_path = os.path.join(base_path, subset)
        if not os.path.exists(subset_path):
            print(f"Missing folder: {subset_path}")
            continue

        print(f"\nImage counts for {subset.upper()}:")
        total_images = 0
        for class_name in sorted(os.listdir(subset_path)):
            class_path = os.path.join(subset_path, class_name)
            if os.path.isdir(class_path):
                num_images = len([
                    f for f in os.listdir(class_path)
                    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
                ])
                print(f"  {class_name[:45]:45} → {num_images:5d} images")
                total_images += num_images
        print(f"  Total images in {subset}: {total_images:,}")

# Run for Dermnet
dermnet_base = "/content/drive/MyDrive/499A Datasets/Dermnet/extracted"
count_images_per_class(dermnet_base)



Image counts for TRAIN:
  Acne and Rosacea Photos                       →   840 images
  Actinic Keratosis Basal Cell Carcinoma and ot →  1149 images
  Atopic Dermatitis Photos                      →   489 images
  Bullous Disease Photos                        →   448 images
  Cellulitis Impetigo and other Bacterial Infec →   288 images
  Eczema Photos                                 →  1235 images
  Exanthems and Drug Eruptions                  →   404 images
  Hair Loss Photos Alopecia and other Hair Dise →   239 images
  Herpes HPV and other STDs Photos              →   405 images
  Light Diseases and Disorders of Pigmentation  →   568 images
  Lupus and other Connective Tissue diseases    →   420 images
  Melanoma Skin Cancer Nevi and Moles           →   463 images
  Nail Fungus and other Nail Disease            →  1040 images
  Poison Ivy Photos and other Contact Dermatiti →   260 images
  Psoriasis pictures Lichen Planus and related  →  1405 images
  Scabies Lyme Disease and oth

In [13]:
# Base path
dermnet_base = "/content/drive/MyDrive/499A Datasets/Dermnet/extracted"

# Output path
dermnet_preproc = "/content/drive/MyDrive/499A Datasets/Dermnet/preprocessed"
os.makedirs(dermnet_preproc, exist_ok=True)

Defining parameters

In [14]:
IMAGE_SIZE = (224, 224)
VALID_EXT = (".jpg", ".jpeg", ".png")

Image preprocessing function

In [15]:
def preprocess_images(src_root, dest_root):
    records = []
    os.makedirs(dest_root, exist_ok=True)

    for subset in ["train", "test"]:
        subset_src = os.path.join(src_root, subset)
        subset_dest = os.path.join(dest_root, subset)
        os.makedirs(subset_dest, exist_ok=True)

        for cls in tqdm(sorted(os.listdir(subset_src)), desc=f"Processing {subset}"):
            cls_src = os.path.join(subset_src, cls)
            if not os.path.isdir(cls_src):
                continue

            cls_name_clean = cls.strip().replace("/", "-").replace("\\", "-")
            cls_dest = os.path.join(subset_dest, cls_name_clean)
            os.makedirs(cls_dest, exist_ok=True)

            for fname in os.listdir(cls_src):
                if fname.lower().endswith(VALID_EXT):
                    src_path = os.path.join(cls_src, fname)
                    dest_path = os.path.join(cls_dest, fname)

                    try:
                        img = Image.open(src_path).convert("RGB")
                        img = img.resize(IMAGE_SIZE)
                        img.save(dest_path, "JPEG", quality=95)

                        records.append({
                            "subset": subset,
                            "class_name": cls_name_clean,
                            "image_path": dest_path
                        })
                    except Exception as e:
                        print(f"Skipped {src_path}: {e}")

    return pd.DataFrame(records)

Run and save metadata

In [16]:
df_dermnet = preprocess_images(dermnet_base, dermnet_preproc)
df_dermnet.to_csv(os.path.join(dermnet_preproc, "metadata.csv"), index=False)

print("\nDermNet preprocessing complete.")
print(df_dermnet.head())
print(f"Total images processed: {len(df_dermnet)}")

Processing train: 100%|██████████| 23/23 [10:23<00:00, 27.11s/it]
Processing test: 100%|██████████| 23/23 [02:02<00:00,  5.32s/it]



DermNet preprocessing complete.
  subset               class_name  \
0  train  Acne and Rosacea Photos   
1  train  Acne and Rosacea Photos   
2  train  Acne and Rosacea Photos   
3  train  Acne and Rosacea Photos   
4  train  Acne and Rosacea Photos   

                                          image_path  
0  /content/drive/MyDrive/499A Datasets/Dermnet/p...  
1  /content/drive/MyDrive/499A Datasets/Dermnet/p...  
2  /content/drive/MyDrive/499A Datasets/Dermnet/p...  
3  /content/drive/MyDrive/499A Datasets/Dermnet/p...  
4  /content/drive/MyDrive/499A Datasets/Dermnet/p...  
Total images processed: 19559


Load metadata and inspects class name

In [17]:
dermnet_meta_path = "/content/drive/MyDrive/499A Datasets/Dermnet/preprocessed/metadata.csv"
df = pd.read_csv(dermnet_meta_path)

print("Unique classes:", len(df['class_name'].unique()))
print(df['class_name'].unique()[:5])

Unique classes: 23
['Acne and Rosacea Photos'
 'Actinic Keratosis Basal Cell Carcinoma and other Malignant Lesions'
 'Atopic Dermatitis Photos' 'Bullous Disease Photos'
 'Cellulitis Impetigo and other Bacterial Infections']


Clean Class names

In [18]:
def clean_class_name(name):
    name = name.lower().replace("photos", "").replace("pictures", "")
    name = name.replace("images", "").replace("disease", "").replace("diseases", "")
    name = name.replace("and", "&").replace("  ", " ").strip()
    return name

Normalize each class name with consice medical phrase

In [19]:
df["class_name_clean"] = df["class_name"].apply(clean_class_name)

Creates descriptive text prompts

In [20]:
import random

templates = [
    "Image of human skin showing {}.",
    "Clinical photo depicting {}.",
    "Dermatological image illustrating {}.",
    "A patient presenting with {}.",
    "Close-up skin lesion associated with {}.",
    "Photo of dermatologic condition: {}.",
]

def generate_prompts(condition_name):
    # produce 2–3 varied captions per entry
    n = random.randint(2, 3)
    condition = condition_name
    return [t.format(condition) for t in random.sample(templates, n)]


Expand metadata with augmented text

In [21]:
rows = []
for _, row in df.iterrows():
    for caption in generate_prompts(row["class_name_clean"]):
        rows.append({
            "subset": row["subset"],
            "class_name": row["class_name"],
            "image_path": row["image_path"],
            "text_prompt": caption
        })

df_aug = pd.DataFrame(rows)
print("Generated augmented captions:", len(df_aug))
df_aug.head(3)


Generated augmented captions: 48956


Unnamed: 0,subset,class_name,image_path,text_prompt
0,train,Acne and Rosacea Photos,/content/drive/MyDrive/499A Datasets/Dermnet/p...,Photo of dermatologic condition: acne & rosacea.
1,train,Acne and Rosacea Photos,/content/drive/MyDrive/499A Datasets/Dermnet/p...,Clinical photo depicting acne & rosacea.
2,train,Acne and Rosacea Photos,/content/drive/MyDrive/499A Datasets/Dermnet/p...,Image of human skin showing acne & rosacea.


Saving updated metadata

In [22]:
aug_path = "/content/drive/MyDrive/499A Datasets/Dermnet/preprocessed/metadata_augmented.csv"
df_aug.to_csv(aug_path, index=False)
print(f"Augmented metadata saved to {aug_path}")


Augmented metadata saved to /content/drive/MyDrive/499A Datasets/Dermnet/preprocessed/metadata_augmented.csv


**Skincon Dataset**

In [23]:
!ls -R "/content/drive/MyDrive/499A Datasets/Skincon/extracted" | head -n 50

/content/drive/MyDrive/499A Datasets/Skincon/extracted:
fitzpatrick17k.csv
image (1).csv
image.csv


Dataset's attributes and entities

In [24]:
csv_path = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/fitzpatrick17k.csv"
df = pd.read_csv(csv_path)

print("Columns:", df.columns.tolist())
df.head(3)

Columns: ['md5hash', 'fitzpatrick_scale', 'fitzpatrick_centaur', 'label', 'nine_partition_label', 'three_partition_label', 'qc', 'url', 'url_alphanum']


Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum
0,5e82a45bc5d78bd24ae9202d194423f8,3,3,drug induced pigmentary changes,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmminoc...
1,fa2911a9b13b6f8af79cb700937cc14f,1,1,photodermatoses,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicpphoto...
2,d2bac3c9e4499032ca8e9b07c7d3bc40,2,3,dermatofibroma,benign dermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicdderma...


In [25]:
import requests
from io import BytesIO

In [26]:
# Base paths
skincon_raw_csv = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/fitzpatrick17k.csv"
skincon_preprocessed = "/content/drive/MyDrive/499A Datasets/Skincon/preprocessed"
os.makedirs(skincon_preprocessed, exist_ok=True)

# Load metadata
df = pd.read_csv(skincon_raw_csv)
print(f"Loaded {len(df)} SkinCon entries.")

# Clean NaN URLs or labels
df = df.dropna(subset=['url', 'label'])
print(f"Remaining valid entries: {len(df)}")

# Prepare folders
image_dir = os.path.join(skincon_preprocessed, "images")
os.makedirs(image_dir, exist_ok=True)

Loaded 16577 SkinCon entries.
Remaining valid entries: 16536


Image preprocessing funtion

In [27]:
def download_and_preprocess(row):
    url = row['url']
    label = row['label'].strip().replace(" ", "_").lower()
    label_dir = os.path.join(image_dir, label)
    os.makedirs(label_dir, exist_ok=True)

    image_name = f"{row['md5hash']}.jpg"
    image_path = os.path.join(label_dir, image_name)

    if os.path.exists(image_path):
        return image_path

    try:
        response = requests.get(url, timeout=8)
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img = img.resize((224, 224))
        img.save(image_path, "JPEG", quality=90)
        return image_path
    except Exception as e:
        return None

Download and preprocess all images

In [28]:
tqdm.pandas()
df["image_path"] = df.progress_apply(download_and_preprocess, axis=1)
df = df.dropna(subset=["image_path"])

print(f"Images successfully downloaded and resized: {len(df)}")

100%|██████████| 16536/16536 [2:39:07<00:00,  1.73it/s]


Images successfully downloaded and resized: 16518


Image and image(1)

In [37]:

csv1 = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/image.csv"
csv2 = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/image (1).csv"

# Read only first few rows
for path in [csv1, csv2]:
    print(f"\n{path}")
    try:
        df_temp = pd.read_csv(path)
        print("Columns:", df_temp.columns.tolist())
        print(df_temp.head(2))
    except Exception as e:
        print("Error reading:", e)


/content/drive/MyDrive/499A Datasets/Skincon/extracted/image.csv
Columns: ['Unnamed: 0', 'ImageID', 'Vesicle', 'Papule', 'Macule', 'Plaque', 'Abscess', 'Pustule', 'Bulla', 'Patch', 'Nodule', 'Ulcer', 'Crust', 'Erosion', 'Excoriation', 'Atrophy', 'Exudate', 'Purpura/Petechiae', 'Fissure', 'Induration', 'Xerosis', 'Telangiectasia', 'Scale', 'Scar', 'Friable', 'Sclerosis', 'Pedunculated', 'Exophytic/Fungating', 'Warty/Papillomatous', 'Dome-shaped', 'Flat topped', 'Brown(Hyperpigmentation)', 'Translucent', 'White(Hypopigmentation)', 'Purple', 'Yellow', 'Black', 'Erythema', 'Comedo', 'Lichenification', 'Blue', 'Umbilicated', 'Poikiloderma', 'Salmon', 'Wheal', 'Acuminate', 'Burrow', 'Gray', 'Pigmented', 'Cyst', 'Do not consider this image']
   Unnamed: 0                               ImageID  Vesicle  Papule  Macule  \
0           0  eb0cbb277ba6b206c5fafc66ab8c46f9.jpg        0       0       0   
1           1  bb3d08781eb23890a9909201deed8c85.jpg        0       0       0   

   Plaque  Ab

imports and setup

In [40]:
from torchvision import transforms

paths and metadata load

In [41]:
# Paths
fitzpatrick_csv_path = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/fitzpatrick17k.csv"
image_csv_path = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/image.csv"
image1_csv_path = "/content/drive/MyDrive/499A Datasets/Skincon/extracted/image (1).csv"
preprocessed_img_dir = "/content/drive/MyDrive/499A Datasets/Skincon/preprocessed/images"
output_preprocessed_csv = "/content/drive/MyDrive/499A Datasets/Skincon/preprocessed/skincon_preprocessed.csv"

# Load metadata csvs
df_meta = pd.read_csv(fitzpatrick_csv_path)
df_image = pd.read_csv(image_csv_path)
df_image1 = pd.read_csv(image1_csv_path)

In [42]:
# Create image_path column based on your downloaded images folder structure
def get_image_path(md5hash):
    # Images saved as md5hash.jpg in class folders (label normalized)
    label = df_meta.loc[df_meta['md5hash'] == md5hash, 'label'].values[0].strip().replace(" ", "_").lower()
    img_path = os.path.join(preprocessed_img_dir, label, f"{md5hash}.jpg")
    return img_path if os.path.exists(img_path) else None

print("Generating image paths...")
df_meta['image_path'] = df_meta['md5hash'].progress_apply(get_image_path)

# Filter out missing images
df_meta = df_meta.dropna(subset=['image_path']).reset_index(drop=True)

Generating image paths...


100%|██████████| 16577/16577 [00:54<00:00, 303.78it/s]


Prepare morphology from image and image(1)

In [43]:
# Merge the two morphology annotation files
df_morph = pd.concat([df_image, df_image1], ignore_index=True)

# Clean 'Unnamed: 0' columns if present
df_morph = df_morph.drop(columns=[col for col in df_morph.columns if "Unnamed" in col], errors='ignore')

# Normalize ImageID format (some have .jpg, some .png)
df_morph['ImageID'] = df_morph['ImageID'].str.lower().str.strip()

In [44]:
# Create a mapping from md5hash to filename
df_meta['filename'] = df_meta['md5hash'].str.lower() + ".jpg"

# Since morphology data has ImageID as filename, we join on filename
df_merged = pd.merge(df_meta, df_morph, left_on='filename', right_on='ImageID', how='left')

print(f"Merged morphology features: {df_merged.shape}")

# Fill missing morphology features with 0 (meaning no lesion features annotated)
morph_cols = df_morph.columns.drop('ImageID')
df_merged[morph_cols] = df_merged[morph_cols].fillna(0).astype(int)

Merged morphology features: (16518, 61)


In [45]:
# Generate descriptive text prompts for VLM models
def create_text_prompt(row):
    label = row['label'].strip().lower()
    morph_features = [col.replace('_', ' ') for col in morph_cols if row[col] == 1]

    # Combine label and morphology features if available
    if morph_features:
        morph_text = ", ".join(morph_features)
        prompt = f"Image of human skin showing {label} with {morph_text}"
    else:
        prompt = f"Image of human skin showing {label}"
    return prompt

print("Creating text prompts...")
df_merged['text_prompt'] = df_merged.progress_apply(create_text_prompt, axis=1)

Creating text prompts...


100%|██████████| 16518/16518 [00:01<00:00, 8369.97it/s]


Lightweight augmentation setup(apply only during training)

In [46]:
augmentation_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
])

Sample function to apply augmentation on image during training

In [47]:
def augment_image(image_path):
    img = Image.open(image_path).convert('RGB')
    img = img.resize((224, 224))
    img = augmentation_transforms(img)
    return img

Final metadata

In [48]:
df_merged.to_csv(output_preprocessed_csv, index=False)

print(f"SkinCon preprocessing complete. Output saved to:\n{output_preprocessed_csv}")
print(f"Total images processed: {len(df_merged)}")

SkinCon preprocessing complete. Output saved to:
/content/drive/MyDrive/499A Datasets/Skincon/preprocessed/skincon_preprocessed.csv
Total images processed: 16518


**Herb2.0**

In [65]:
base_dir = "/content/drive/MyDrive/499A Datasets"
herb_dir = os.path.join(base_dir, "Herb2.0/extracted")

In [70]:
def load_csv(path):
    return pd.read_csv(path, encoding="utf-8", on_bad_lines="skip", engine="python", delimiter="\t")

disease = load_csv(os.path.join(herb_dir, "_MConverter.eu_HERB_disease_info_v2 - Copy.csv"))
herb = load_csv(os.path.join(herb_dir, "_MConverter.eu_HERB_herb_info_v2.csv"))
ingredient = load_csv(os.path.join(herb_dir, "_MConverter.eu_HERB_ingredient_info_v2.csv"))

print("Shapes →", disease.shape, herb.shape, ingredient.shape)

Shapes → (30170, 18) (6892, 19) (44595, 24)


In [71]:
print("Disease CSV head:")
print(disease.head())

print("\nHerb CSV head:")
print(herb.head())

print("\nIngredient CSV head:")
print(ingredient.head())

Disease CSV head:
    Disease_id         Disease_name  \
0  HBDIS000001       Abdomen, Acute   
1  HBDIS000002     Abdominal Cramps   
2  HBDIS000003    Abdomen Distended   
3  HBDIS000004       Abdominal Mass   
4  HBDIS000005  Abdominal Neoplasms   

                                  Disease_alias_name DisGeNET_disease_type  \
0                      Acute Abdomen; Abdomen, Acute             phenotype   
1           Infantile Colic; Colic; Abdominal Cramps             phenotype   
2  Distended Abdomen; Belly Bloating; Bloating; A...             phenotype   
3                                     Abdominal Mass             phenotype   
4                                Abdominal Neoplasms                 group   

    UMLS_disease_type                                 MeSH_disease_class  \
0     Sign or Symptom        Pathological Conditions, Signs and Symptoms   
1     Sign or Symptom  Congenital, Hereditary, and Neonatal Diseases ...   
2             Finding                          Dig

Columns name

In [72]:
print("Disease columns:")
print(disease.columns.tolist())

print("\nHerb columns:")
print(herb.columns.tolist())

print("\nIngredient columns:")
print(ingredient.columns.tolist())


Disease columns:
['Disease_id', 'Disease_name', 'Disease_alias_name', 'DisGeNET_disease_type', 'UMLS_disease_type', 'MeSH_disease_class', 'HPO_disease_class', 'DO_disease_class', 'UMLS_disease_type_id', 'MeSH_disease_class_id', 'HPO_disease_class_id', 'DO_disease_class_id', 'DisGeNET_id', 'MeSH_id', 'HPO_id', 'DO_id', 'ICD10_id', 'OMIM_id']

Herb columns:
['Herb_id', 'Herb_pinyin_name', 'Herb_cn_name', 'Herb_alias_name', 'Herb_en_name', 'Herb_latin_name', 'Properties', 'Meridians', 'UsePart', 'Function', 'Indication', 'Toxicity', 'Clinical_manifestations', 'Therapeutic_en_class', 'Therapeutic_cn_class', 'SymMap_id', 'TCMID_id', 'TCMSP_id', 'TCM_ID_id']

Ingredient columns:
['Ingredient_id', 'Ingredient_name', 'Ingredient_alias_name', 'Molecular_formula', 'Canonical_smiles', 'Isomeric_smiles', 'InChI', 'InChIKey', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'MolLogP', 'NumRotatableBonds', 'Drug_likeness', 'OB_score', 'CAS_id', 'SymMap_id', 'TCMID_id', 'TCMSP_id', 'TCM_ID_id', 'PubChem_id', 

Select necessrary columns

In [73]:
disease = disease[[
    "Disease_id", "Disease_name", "Disease_alias_name",
    "UMLS_disease_type", "MeSH_disease_class"
]]

herb = herb[[
    "Herb_id", "Herb_en_name", "Herb_latin_name",
    "Properties", "Meridians", "UsePart",
    "Function", "Indication", "Toxicity"
]]

ingredient = ingredient[[
    "Ingredient_id", "Ingredient_name", "Ingredient_alias_name",
    "Molecular_formula", "Drug_likeness", "OB_score",
    "CAS_id", "PubChem_id"
]]

Basic Cleaning

In [74]:
for df in [disease, herb, ingredient]:
    df.drop_duplicates(inplace=True)
    df.replace("NA", np.nan, inplace=True)
    df.fillna("", inplace=True)

  df.fillna("", inplace=True)


Merging

In [75]:
merged = herb.merge(ingredient, how="outer", left_index=True, right_index=True, suffixes=("_herb", "_ing"))
merged = merged.merge(disease, how="outer", left_index=True, right_index=True)

print("Merged shape:", merged.shape)

Merged shape: (44595, 22)


A sigle text column for VLM

In [76]:
def join_text(row):
    fields = [
        row.get("Herb_en_name", ""), row.get("Herb_latin_name", ""),
        row.get("Function", ""), row.get("Indication", ""),
        row.get("Properties", ""), row.get("Meridians", ""),
        row.get("Ingredient_name", ""), row.get("Molecular_formula", ""),
        row.get("Disease_name", ""), row.get("UMLS_disease_type", ""),
        row.get("MeSH_disease_class", ""), row.get("Toxicity", "")
    ]
    return " | ".join([str(x).strip() for x in fields if str(x).strip()])

merged["text_for_vlm"] = merged.apply(join_text, axis=1)
merged = merged[merged["text_for_vlm"].str.len() > 0].reset_index(drop=True)

In [77]:
from pathlib import Path
pre_dir = os.path.join(base_dir, "Preprocessed")
Path(pre_dir).mkdir(parents=True, exist_ok=True)

csv_path = os.path.join(pre_dir, "herb2_final_clean.csv")
merged.to_csv(csv_path, index=False)
print("Cleaned HERB 2.0 saved to:", csv_path)

Cleaned HERB 2.0 saved to: /content/drive/MyDrive/499A Datasets/Preprocessed/herb2_final_clean.csv


In [78]:
print("Rows:", len(merged))
print("Unique herbs:", merged['Herb_en_name'].nunique())
print("Unique ingredients:", merged['Ingredient_name'].nunique())
print("Unique diseases:", merged['Disease_name'].nunique())

merged.head(5)

Rows: 44595
Unique herbs: 6255
Unique ingredients: 44541
Unique diseases: 30170


Unnamed: 0,Herb_id,Herb_en_name,Herb_latin_name,Properties,Meridians,UsePart,Function,Indication,Toxicity,Ingredient_id,...,Drug_likeness,OB_score,CAS_id,PubChem_id,Disease_id,Disease_name,Disease_alias_name,UMLS_disease_type,MeSH_disease_class,text_for_vlm
0,HERB000001,Abyssinia Coralbean,Erythrina Abyssinica,,,,,,,HBIN000001,...,0.247,,,13878128.0,HBDIS000001,"Abdomen, Acute","Acute Abdomen; Abdomen, Acute",Sign or Symptom,"Pathological Conditions, Signs and Symptoms",Abyssinia Coralbean | Erythrina Abyssinica | O...
1,HERB000002,Abyssinia Harrisonia,Harrisonia Abyssinica,,,,,,,HBIN000002,...,0.521,43.636,552-70-5,6602484.0,HBDIS000002,Abdominal Cramps,Infantile Colic; Colic; Abdominal Cramps,Sign or Symptom,"Congenital, Hereditary, and Neonatal Diseases ...",Abyssinia Harrisonia | Harrisonia Abyssinica |...
2,HERB000003,Algerian Cottonthistle,Onopordum Algeriense,,,,,,,HBIN000003,...,,,,,HBDIS000003,Abdomen Distended,Distended Abdomen; Belly Bloating; Bloating; A...,Finding,Digestive System Diseases,Algerian Cottonthistle | Onopordum Algeriense ...
3,HERB000004,Algerian Statice,Limonium Bonduellii,,,,,,,HBIN000004,...,,,,,HBDIS000004,Abdominal Mass,Abdominal Mass,Finding,Digestive System Diseases,Algerian Statice | Limonium Bonduellii | 0-eth...
4,HERB000005,Algerian Iris,Iris Unguicularis,,,,,,,HBIN000005,...,,,,,HBDIS000005,Abdominal Neoplasms,Abdominal Neoplasms,Neoplastic Process,Neoplasms,Algerian Iris | Iris Unguicularis | 0-methylac...
