In [1]:
!pip install pytorch-lightning torch torchvision transformers numpy




In [2]:
!git clone https://github.com/ultralytics/yolov5

Cloning into 'yolov5'...
remote: Enumerating objects: 17129, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 17129 (delta 39), reused 21 (delta 16), pack-reused 17056 (from 3)[K
Receiving objects: 100% (17129/17129), 15.81 MiB | 35.04 MiB/s, done.
Resolving deltas: 100% (11741/11741), done.


In [3]:
%cd yolov5
!pip install -r /kaggle/working/yolov5/requirements.txt

/kaggle/working/yolov5
Collecting thop>=0.1.1 (from -r /kaggle/working/yolov5/requirements.txt (line 14))
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Collecting ultralytics>=8.2.34 (from -r /kaggle/working/yolov5/requirements.txt (line 18))
  Downloading ultralytics-8.3.59-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics>=8.2.34->-r /kaggle/working/yolov5/requirements.txt (line 18))
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl.metadata (9.4 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Downloading ultralytics-8.3.59-py3-none-any.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.8/906.8 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.3.59 ultralytics-thop-2.0

In [4]:
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import pytorch_lightning as pl

# Define paths
video_dir = "/kaggle/input/cholec-train-data/CholecT50/videos"
label_dir = "/kaggle/input/cholec-train-data/CholecT50/labels"

# Training and testing video sub-folder names
train_videos = ["VID01", "VID02", "VID04", "VID05", "VID06", "VID08", "VID10", "VID12", "VID13", "VID14"]
test_videos = ["VID92", "VID96", "VID103", "VID110", "VID111"]

def parse_annotations(label_path):
    """Parse JSON annotations and extract bounding boxes and triplet data."""
    with open(label_path, 'r') as file:
        data = json.load(file)
    annotations = data['annotations']

    parsed_data = []
    for frame_id, triplets in annotations.items():
        for triplet in triplets:
            parsed_data.append({
                'frame_id': int(frame_id),
                'triplet_id': triplet[0],
                'instrument_id': triplet[1],
                'verb_id': triplet[8],
                'target_id': triplet[9],
                'phase_id': triplet[14],
                'bbox': triplet[2:6]
            })
    return parsed_data

def create_dataset(video_list, video_dir, label_dir):
    """Create a DataFrame for the dataset."""
    data = []
    for video_id in tqdm(video_list, desc="Processing Videos"):
        video_path = Path(video_dir) / video_id
        label_path = Path(label_dir) / f"{video_id}.json"

        annotations = parse_annotations(label_path)
        for entry in annotations:
            frame_path = video_path / f"{entry['frame_id']}.png"
            data.append({
                'video_id': video_id,
                'frame_path': str(frame_path),
                'triplet_id': entry['triplet_id'],
                'instrument_id': entry['instrument_id'],
                'verb_id': entry['verb_id'],
                'target_id': entry['target_id'],
                'phase_id': entry['phase_id'],
                'bbox': entry['bbox']
            })
    return pd.DataFrame(data)

# Prepare Train and Test DataFrames
train_df = create_dataset(train_videos, video_dir, label_dir)
test_df = create_dataset(test_videos, video_dir, label_dir)

# Save DataFrames to CSV for verification
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

# Print DataFrame samples
print("Training Data Sample:")
print(train_df.head())
print("\nTesting Data Sample:")
print(test_df.head())


Processing Videos: 100%|██████████| 10/10 [00:00<00:00, 19.92it/s]
Processing Videos: 100%|██████████| 5/5 [00:00<00:00, 30.22it/s]


Training Data Sample:
  video_id                                         frame_path  triplet_id  \
0    VID01  /kaggle/input/cholec-train-data/CholecT50/vide...           7   
1    VID01  /kaggle/input/cholec-train-data/CholecT50/vide...           7   
2    VID01  /kaggle/input/cholec-train-data/CholecT50/vide...           7   
3    VID01  /kaggle/input/cholec-train-data/CholecT50/vide...           7   
4    VID01  /kaggle/input/cholec-train-data/CholecT50/vide...           7   

   instrument_id  verb_id  target_id  phase_id             bbox  
0              0        0        1.0         0  [1, -1, -1, -1]  
1              0        0        1.0         0  [1, -1, -1, -1]  
2              0        0        1.0         0  [1, -1, -1, -1]  
3              0        0        1.0         0  [1, -1, -1, -1]  
4              0        0        1.0         0  [1, -1, -1, -1]  

Testing Data Sample:
  video_id                                         frame_path  triplet_id  \
0    VID92  /kaggle/

In [6]:
train_df.iloc()[0][1]

  train_df.iloc()[0][1]


'/kaggle/input/cholec-train-data/CholecT50/videos/VID01/0.png'

In [7]:
# Load class names from class_list.txt
class_list_path = "/kaggle/input/finetuning/m2cai16-tool-locations/class_list.txt"
with open(class_list_path, "r") as file:
    classes = [line.strip() for line in file.readlines()]

# Print loaded class names
print("Loaded class names:")
print(classes)


Loaded class names:
['1 Grasper', '2 Bipolar', '3 Hook', '4 Scissors', '5 Clipper', '6 Irrigator', '7 SpecimenBag']


In [8]:
import xml.etree.ElementTree as ET
import os

annotations_dir = "/kaggle/input/finetuning/m2cai16-tool-locations/Annotations"
unique_classes = set()

for annotation_file in os.listdir(annotations_dir)[:10]:  # Check first 10 files
    annotation_path = os.path.join(annotations_dir, annotation_file)
    tree = ET.parse(annotation_path)
    root = tree.getroot()

    for obj in root.findall("object"):
        class_name = obj.find("name").text
        unique_classes.add(class_name)

print("Classes found in XML annotations:")
print(unique_classes)


Classes found in XML annotations:
{'Irrigator', 'Scissors', 'Clipper', 'Grasper', 'Bipolar', 'SpecimenBag'}


In [9]:
import os
import cv2
import xml.etree.ElementTree as ET
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Define paths
annotations_dir = "/kaggle/input/finetuning/m2cai16-tool-locations/Annotations"
images_dir = "/kaggle/input/finetuning/m2cai16-tool-locations/JPEGImages"
class_list_path = "/kaggle/input/finetuning/m2cai16-tool-locations/class_list.txt"
output_dir = "/kaggle/working/finetuning_yolo_dataset"

# Load and clean class names (strip numeric prefixes)
with open(class_list_path, "r") as file:
    classes = [line.strip().split(' ', 1)[1] for line in file.readlines()]

print("Processed class names from class_list.txt:")
print(classes)

# Validate class names against XML
xml_classes = set()
for xml_file in os.listdir(annotations_dir):
    tree = ET.parse(os.path.join(annotations_dir, xml_file))
    for obj in tree.getroot().findall("object"):
        xml_classes.add(obj.find("name").text)

print("Classes found in XML annotations:")
print(xml_classes)

# Check if all XML classes exist in class_list.txt
missing_classes = xml_classes - set(classes)
if missing_classes:
    print(f"Warning: Missing classes in class_list.txt: {missing_classes}")
else:
    print("All XML classes are present in class_list.txt.")

# Ensure output directories exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(f"{output_dir}/labels/train", exist_ok=True)
os.makedirs(f"{output_dir}/labels/val", exist_ok=True)
os.makedirs(f"{output_dir}/images/train", exist_ok=True)
os.makedirs(f"{output_dir}/images/val", exist_ok=True)

# Function to convert Pascal VOC annotations to YOLO format
def voc_to_yolo(annotation_path, image_width, image_height):
    tree = ET.parse(annotation_path)
    root = tree.getroot()
    yolo_annotations = []

    for obj in root.findall("object"):
        class_name = obj.find("name").text
        if class_name not in classes:
            continue  # Skip unknown classes

        class_id = classes.index(class_name)
        bbox = obj.find("bndbox")
        xmin = float(bbox.find("xmin").text)
        ymin = float(bbox.find("ymin").text)
        xmax = float(bbox.find("xmax").text)
        ymax = float(bbox.find("ymax").text)

        # Convert to YOLO format
        x_center = ((xmin + xmax) / 2) / image_width
        y_center = ((ymin + ymax) / 2) / image_height
        width = (xmax - xmin) / image_width
        height = (ymax - ymin) / image_height
        yolo_annotations.append(f"{class_id} {x_center} {y_center} {width} {height}")

    return yolo_annotations

# Split data into train and val sets
annotation_files = os.listdir(annotations_dir)
train_files, val_files = train_test_split(annotation_files, test_size=0.2, random_state=42)

# Process and save annotations in YOLO format
for split, files in [("train", train_files), ("val", val_files)]:
    for annotation_file in tqdm(files, desc=f"Processing {split} data"):
        annotation_path = os.path.join(annotations_dir, annotation_file)
        image_file = annotation_file.replace(".xml", ".jpg")
        image_path = os.path.join(images_dir, image_file)
        
        # Get image dimensions
        if not os.path.exists(image_path):
            print(f"Image not found: {image_path}")
            continue
        
        image = cv2.imread(image_path)
        height, width, _ = image.shape

        # Convert annotations
        yolo_annotations = voc_to_yolo(annotation_path, width, height)
        
        # Save annotations
        with open(f"{output_dir}/labels/{split}/{image_file.replace('.jpg', '.txt')}", "w") as file:
            file.write("\n".join(yolo_annotations))
        
        # Copy images to the corresponding directory
        os.system(f"cp {image_path} {output_dir}/images/{split}/")

# Save dataset.yaml for YOLOv5 training
dataset_yaml = f"""
train: {output_dir}/images/train
val: {output_dir}/images/val

nc: {len(classes)}
names: {classes}
"""
with open(f"{output_dir}/dataset.yaml", "w") as file:
    file.write(dataset_yaml)


Processed class names from class_list.txt:
['Grasper', 'Bipolar', 'Hook', 'Scissors', 'Clipper', 'Irrigator', 'SpecimenBag']
Classes found in XML annotations:
{'Irrigator', 'Scissors', 'Clipper', 'Hook', 'Grasper', 'Bipolar', 'SpecimenBag'}
All XML classes are present in class_list.txt.


Processing train data: 100%|██████████| 2248/2248 [00:38<00:00, 59.05it/s]
Processing val data: 100%|██████████| 563/563 [00:09<00:00, 61.99it/s]


In [10]:
import os

# Disable W&B visualization by setting the environment variable
os.environ["WANDB_MODE"] = "disabled"


In [13]:
# !rm /kaggle/working/finetuning_yolo_dataset/labels/train.cache


In [11]:
!python train.py --img 640 --batch 16 --epochs 25 --data /kaggle/working/finetuning_yolo_dataset/dataset.yaml --weights /kaggle/input/model-weight-1/best.pt --project /kaggle/working/runs --name finetune_yolov5


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
2025-01-12 17:36:18.429560: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-12 17:36:18.860403: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-12 17:36:18.983954: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mtrain: [0mweights=/kaggle/input/model-weight

In [6]:
import os
from tqdm import tqdm
import torch
from yolov5.utils.general import non_max_suppression
from yolov5.models.common import DetectMultiBackend
from yolov5.utils.dataloaders import LoadImages
from yolov5.utils.torch_utils import select_device

# Ensure all paths have the correct 6-digit format
train_df['frame_path'] = train_df['frame_path'].apply(
    lambda x: os.path.join(
        os.path.dirname(x),
        f"{int(os.path.basename(x).split('.')[0]):06d}.png"
    )
)

# Validate paths
print("Validating file paths...")
missing_files = train_df[~train_df['frame_path'].apply(os.path.exists)]
if not missing_files.empty:
    print(f"Warning: {len(missing_files)} files are missing.")
    print(missing_files.head())
else:
    print("All file paths are valid.")

# Set model path and device
model_path = "/kaggle/input/fintune-weight/finetuned.pt"
device = select_device('cuda' if torch.cuda.is_available() else 'cpu')

# Load YOLOv5 model
model = DetectMultiBackend(model_path, device=device)

# Function to get bounding boxes for instruments
def get_bounding_boxes(image_path):
    dataset = LoadImages(image_path)
    results = []
    
    for path, img, img0, vid_cap, _ in dataset:
        img = torch.from_numpy(img).to(device).float() / 255.0  # Normalize
        if len(img.shape) == 3:
            img = img.unsqueeze(0)  # Add batch dimension
        
        # Perform inference
        pred = model(img)
        pred = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45)
        
        for det in pred:
            if det is not None and len(det):
                for *box, conf, cls in det:
                    results.append(box)  # Add bounding box coordinates
    return results

# Apply YOLOv5 on train_df to get bounding boxes
bbox_list = []
for image_path in tqdm(train_df['frame_path'], desc="Processing Images"):
    bboxes = get_bounding_boxes(image_path)
    bbox_list.append(bboxes)

# Save bounding box coordinates in train_df
train_df['bbox'] = bbox_list

# Save train_df with bounding boxes
train_df.to_csv("/kaggle/working/train_df_with_bboxes.csv", index=False)

print("Bounding boxes added to train_df and saved as train_df_with_bboxes.csv.")


Validating file paths...


YOLOv5 🚀 v7.0-395-g6420a1db Python-3.10.12 torch-2.4.1+cu121 CPU



All file paths are valid.


Fusing layers... 
Model summary: 157 layers, 7029004 parameters, 0 gradients, 15.8 GFLOPs
Processing Images: 100%|██████████| 29253/29253 [57:33<00:00,  8.47it/s] 


Bounding boxes added to train_df and saved as train_df_with_bboxes.csv.


In [16]:
import os
import torch

# Force environment variable for GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [17]:
!nvidia-smi


Sun Jan 12 18:08:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             10W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00