In [1]:
import pandas as pd
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset, random_split
from collections import Counter
from torch import nn, optim
import torch

In [2]:
import os
from collections import Counter
from PIL import Image

root_dir = "/kaggle/input/images"

def count_images(directory):
    """Return how many image files live anywhere beneath ``directory``.

    The tree is walked recursively; a file counts as an image when its name
    ends in .png, .jpg or .jpeg (compared case-insensitively).
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return sum(
        1
        for _, _, filenames in os.walk(directory)
        for filename in filenames
        if filename.lower().endswith(image_suffixes)
    )

# Visible (non-hidden) entries directly under the dataset root.
sets = [entry for entry in os.listdir(root_dir) if not entry.startswith('.')]

for dr in sets:
    path = os.path.join(root_dir, dr)
    print(f"\n{dr}:")

    # One line per visible sub-folder of this set, with its recursive image count.
    for sd in os.listdir(path):
        sub_path = os.path.join(path, sd)
        if sd.startswith('.') or not os.path.isdir(sub_path):
            continue
        img_count = count_images(sub_path)
        print(f"   {sd}: {img_count} images")



First Set:
   100x Normal Oral Cavity Histopathological Images: 89 images
   100x OSCC Histopathological Images: 439 images

Second Set:
   400x OSCC Histopathological Images: 495 images
   400x Normal Oral Cavity Histopathological Images: 201 images


In [3]:
import torchvision.transforms as transforms

# Per-channel mean / std handed to Normalize by both pipelines.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Training pipeline: random 224-px crop, random horizontal flip and colour
# jitter, followed by tensor conversion and normalisation.
train_steps = [
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
train_transforms = transforms.Compose(train_steps)

# Validation pipeline: deterministic resize to 256 and 224-px centre crop,
# followed by the same tensor conversion and normalisation.
val_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_mean, std=norm_std),
]
val_transforms = transforms.Compose(val_steps)

Creating the training and validation datasets

In [4]:
class OralHistopathDataset(Dataset):
    """Image/label dataset built from the ``<set>/<magnification> <class>`` folders.

    Every image found under
    ``base_dir/<set>/<magnification> <class folder name>`` is recorded as a
    ``(path, label)`` pair, where label 0 is the "Normal Oral Cavity" folder
    and label 1 is the "OSCC" folder.

    Args:
        base_dir: Root directory that contains the 'First Set' and
            'Second Set' folders.
        magnifications: Magnification prefixes of the class folders to scan.
        transform: Optional callable applied to each loaded RGB PIL image.
    """

    # Folder-name suffixes; the position in this tuple is the integer label.
    CLASS_NAMES = (
        'Normal Oral Cavity Histopathological Images',
        'OSCC Histopathological Images',
    )
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, base_dir, magnifications = ('100x', '400x'), transform=None):
        self.base_dir = base_dir
        # Copy into a list so the (immutable) default is never shared or mutated.
        self.magnifications = list(magnifications)
        self.transform = transform
        self.samples = []
        self.sets = ['First Set', 'Second Set']

        # Collect all (image path, label) pairs.
        # Iterate self.sets rather than a notebook-level `sets` variable so the
        # class does not depend on another cell having run first.
        for s in self.sets:
            udir = os.path.join(base_dir, s)
            for mag in self.magnifications:
                for label, class_name in enumerate(self.CLASS_NAMES):
                    bdir = os.path.join(udir, f"{mag} {class_name}")

                    # Not every set contains every magnification, so a missing
                    # folder is reported and skipped rather than treated as fatal.
                    if not os.path.isdir(bdir):
                        print(f"Skipping missing folder: {bdir}")
                        continue

                    # Sorted so the sample order (and therefore any seeded
                    # split over indices) does not depend on listing order.
                    for fname in sorted(os.listdir(bdir)):
                        if fname.lower().endswith(self.IMAGE_EXTENSIONS):
                            self.samples.append((os.path.join(bdir, fname), label))

    def __len__(self):
        """Return the number of collected (path, label) samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(image, label)``.

        The image is opened as RGB and passed through ``self.transform`` when
        one is set.
        """
        path, label = self.samples[index]
        # The context manager closes the file handle; convert() returns a
        # fully loaded copy that stays valid afterwards.
        with Image.open(path) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

Loading the datasets

In [5]:
# Index every image under both sets with the default magnifications; no
# transform is attached at this point (transform defaults to None).
full_dataset = OralHistopathDataset(base_dir = '/kaggle/input/images')


File not found error
File not found error
File not found error
File not found error


In [6]:
# Total number of indexed images.
len(full_dataset)

1224

In [7]:
import copy

# 80/20 split over sample indices, seeded for reproducibility.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size

train_split, val_split = random_split(
    full_dataset,
    [train_size, val_size],
    generator = torch.Generator().manual_seed(42)
)

# Both subsets returned by random_split wrap the SAME dataset object, so
# assigning `.dataset.transform` twice would leave only the last transform in
# effect for both splits. Give each split its own shallow copy of the dataset
# (sharing the same `samples` list, so the indices stay valid) with its own
# transform instead.
train_source = copy.copy(full_dataset)
train_source.transform = train_transforms

val_source = copy.copy(full_dataset)
val_source.transform = val_transforms

train_dataset = torch.utils.data.Subset(train_source, train_split.indices)
val_dataset = torch.utils.data.Subset(val_source, val_split.indices)

In [8]:
# Settings common to both loaders; only the training loader shuffles.
shared_loader_args = {"batch_size": 32, "num_workers": 4}

train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)

In [9]:
!pip install transformers accelerate torchvision pillow

from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
import os
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on GPU; on a CPU-only kernel the model is loaded
# in float32, because float16 kernels on CPU are slow or unsupported for some ops.
blip2_dtype = torch.float16 if device == "cuda" else torch.float32

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=blip2_dtype
).to(device)

Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nv

2025-11-08 23:59:21.200084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762646361.574139      39 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762646361.646996      39 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

captions = []
batch_size = 4  # Adjust based on GPU memory
base_dir = '/kaggle/input/images'
magnifications = ['100x', '400x']
sets = ['First Set', 'Second Set']
progress_csv = "captions_progress.csv"

# (folder-name suffix, wording used in the prompt); the position in this list
# is the integer label stored with each caption.
class_info = [
    ('Normal Oral Cavity Histopathological Images', 'normal oral cavity tissue'),
    ('OSCC Histopathological Images', 'oral squamous cell carcinoma (OSCC)'),
]

for s in sets:
    udir = os.path.join(base_dir, s)
    for mag in magnifications:
        for label, (class_name, diagnosis) in enumerate(class_info):

            bdir = os.path.join(udir, f"{mag} {class_name}")
            if not os.path.exists(bdir):
                print(f"Skipping missing folder: {bdir}")
                continue

            # Sorted so an interrupted run can be compared with / resumed
            # against the progress CSV in a stable order.
            img_paths = sorted(
                os.path.join(bdir, f)
                for f in os.listdir(bdir)
                if f.lower().endswith(('.png', '.jpg', '.jpeg'))
            )

            # The prompt names the class in words; interpolating the integer
            # label would ask the model why the image is "classified as 0".
            prompt = (
                "Describe this histopathology image and explain why it is "
                f"classified as {diagnosis}."
            )

            for i in tqdm(range(0, len(img_paths), batch_size), desc=f"{mag} {class_name}"):
                batch_paths = img_paths[i:i+batch_size]

                # Close each file handle as soon as the RGB copy is in memory.
                images = []
                for p in batch_paths:
                    with Image.open(p) as img:
                        images.append(img.convert('RGB'))

                prompts = [prompt] * len(batch_paths)

                # Move to the model's device and cast floating-point inputs
                # (pixel values) to the model's dtype.
                inputs = processor(
                    images, text=prompts, return_tensors='pt', padding=True
                ).to(device, model.dtype)

                with torch.no_grad():
                    outputs = model.generate(**inputs, max_new_tokens=40)

                decoded = processor.batch_decode(outputs, skip_special_tokens=True)

                for path, caption in zip(batch_paths, decoded):
                    captions.append({
                        'image_path': path,
                        'label': label,
                        'generated_caption': caption
                    })
                    # Periodic checkpoint so a long run can be inspected or recovered.
                    if len(captions) % 100 == 0:
                        pd.DataFrame(captions).to_csv(progress_csv, index=False)

# Final flush: without it, captions after the last multiple of 100 are never saved.
if captions:
    pd.DataFrame(captions).to_csv(progress_csv, index=False)

100x Normal Oral Cavity Histopathological Images: 100%|██████████| 23/23 [1:10:36<00:00, 184.21s/it]
100x OSCC Histopathological Images:  59%|█████▉    | 65/110 [3:26:51<2:24:01, 192.03s/it]

In [None]:
# from transformers import AutoProcessor, AutoModelForImageTextToText  # or the correct class
# import torch

# model_name = "Qwen/Qwen2-VL-2B"

# processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
# model = AutoModelForImageTextToText.from_pretrained(model_name, trust_remote_code=True)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)


In [None]:
!pip install --upgrade transformers
from transformers import AutoProcessor, AutoModelForVision2Seq

# Checkpoint shared by the processor and the model.
# Note: `processor`, `model` and `device` are rebound here, replacing the
# objects of the same names created earlier in the notebook.
qwen_checkpoint = 'unsloth/Qwen2-VL-2B'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(qwen_checkpoint, trust_remote_code = True)
model = AutoModelForVision2Seq.from_pretrained(qwen_checkpoint, trust_remote_code = True)

# Last expression of the cell: moves the model and displays its repr.
model.to(device)

In [None]:
# !pip install unsloth

# import unsloth
# from unsloth import FastVisionModel

# model_name = 'unsloth/Qwen2-VL-2B'

# model, tokenizer = FastVisionModel.from_pretrained(
#     model_name = model_name,
#     load_in_4bit = False,
#     use_gradient_checkpointing = 'unsloth'
# )

In [None]:
# Display the device selected from torch.cuda.is_available() (cuda or cpu).
device

Optimizer, scheduler, and loss

In [None]:
# Per-class image counts (100x + 400x), matching the folder listing printed
# at the top of the notebook.
normal_count = 89 + 201
oscc_count = 439 + 495

# Inverse-frequency weights: total / (n_classes * class_count), so the rarer
# "normal" class contributes more per sample to the loss.
total = normal_count + oscc_count
weight_normal = total/(2*normal_count)
weight_oscc = total/(2*oscc_count)

# Index 0 = normal, index 1 = OSCC, the same order as the dataset labels.
# The weights are created on the accelerator when one is available: the model
# is moved there too, and CrossEntropyLoss requires its weight tensor to be on
# the same device as the logits.
class_weights = torch.tensor(
    [weight_normal, weight_oscc],
    dtype = torch.float,
    device = "cuda" if torch.cuda.is_available() else "cpu",
)

criterion = nn.CrossEntropyLoss(weight = class_weights)

In [None]:

# AdamW over every parameter of the currently loaded model.
optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=0.01,
)

# Step decay: the learning rate is multiplied by 0.1 every 3 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1,
)

In [None]:
# Print the module tree of the currently loaded `model` to inspect its layers.
print(model)