In [1]:
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

# Installer les bibliothèques nécessaires pour Grounding DINO
!pip install transformers timm

# Importer les modules nécessaires
from transformers import DetrImageProcessor, DetrForObjectDetection
from transformers import CLIPProcessor, CLIPModel

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

# Load the object detector and its image processor.
# NOTE: despite the `dino_*` variable names, the checkpoint loaded here is
# DETR (facebook/detr-resnet-50), not Grounding DINO.
dino_model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
dino_model = dino_model.to(device)
dino_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50')

# Load CLIP and its processor, used to match images against a natural-language description
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

# Detect objects in a frame and keep the ones whose crop matches a text description
def process_frame(frame, dino_model, dino_processor, clip_model, clip_processor, description,
                  detection_threshold=0.5, similarity_threshold=0.9):
    """Detect objects in ``frame`` and keep those that match ``description``.

    The detector proposes boxes; each box is cropped from the image and scored
    against ``description`` with CLIP (cosine similarity of the normalised
    image and text embeddings).

    Args:
        frame: BGR image array as produced by OpenCV (it is converted to RGB here).
        dino_model: Object-detection model called as ``dino_model(**inputs)``.
        dino_processor: Image processor paired with ``dino_model``; must provide
            ``post_process_object_detection``.
        clip_model: Model providing ``get_text_features`` / ``get_image_features``.
        clip_processor: Processor paired with ``clip_model``.
        description: Natural-language description to match detections against.
        detection_threshold: Minimum detector confidence for a box to be considered.
        similarity_threshold: A box is kept only if its CLIP cosine similarity
            is strictly greater than this value.
            NOTE(review): raw CLIP image/text cosine similarities are usually
            well below 0.9, so the default (kept from the original code) may
            reject every detection -- confirm and tune for the data.

    Returns:
        A list of ``[x1, y1, x2, y2, score, 0]`` entries, where the coordinates
        are pixel positions in ``frame`` (clamped to the image), ``score`` is the
        CLIP similarity and the trailing ``0`` is a placeholder class id.
    """
    # OpenCV frames are BGR; the processors expect RGB images.
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    img_width, img_height = img.size

    # Run the detector on the full image.
    inputs = dino_processor(images=img, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = dino_model(**inputs)

    # The raw model boxes are normalised (center_x, center_y, width, height);
    # post-processing rescales them to absolute (x1, y1, x2, y2) pixel corners
    # for the given (height, width) and drops low-confidence queries.
    detections = dino_processor.post_process_object_detection(
        outputs,
        threshold=detection_threshold,
        target_sizes=torch.tensor([[img_height, img_width]]),
    )[0]

    # Crop every detection, clamping to the image and skipping empty regions.
    crops = []
    candidate_boxes = []
    for x1, y1, x2, y2 in detections["boxes"].tolist():
        left = min(max(int(round(x1)), 0), img_width)
        top = min(max(int(round(y1)), 0), img_height)
        right = min(max(int(round(x2)), 0), img_width)
        bottom = min(max(int(round(y2)), 0), img_height)
        if right <= left or bottom <= top:
            continue
        crops.append(img.crop((left, top, right, bottom)))
        candidate_boxes.append([left, top, right, bottom])

    # Nothing to score: avoid calling the CLIP processor with an empty batch.
    if not crops:
        return []

    # Embed the description and the crops using CLIP's own preprocessing.
    text_inputs = clip_processor(text=[description], return_tensors="pt").to(device)
    image_inputs = clip_processor(images=crops, return_tensors="pt").to(device)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)
        image_features = clip_model.get_image_features(pixel_values=image_inputs["pixel_values"])

    # Cosine similarity between each crop and the description: one score per crop.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    similarity = torch.matmul(image_features, text_features.T).squeeze(1)

    boxes = []
    for box, score in zip(candidate_boxes, similarity.tolist()):
        if score > similarity_threshold:
            boxes.append([*box, score, 0])  # 0 is a placeholder for the class id

    return boxes

# Draw the detected bounding boxes and their confidence captions onto a frame
def annotate_frame(frame, boxes, label='dolphin'):
    """Draw each detection's rectangle and caption on ``frame`` in place.

    Args:
        frame: Image array passed to the OpenCV drawing functions; it is
            modified in place.
        boxes: Iterable of ``[x1, y1, x2, y2, conf, cls]`` entries with
            coordinates in pixels. ``cls`` is not used.
        label: Text shown in front of the confidence value.

    Returns:
        The same ``frame`` object that was passed in.
    """
    color = (0, 255, 0)
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    thickness = 2

    for x1, y1, x2, y2, conf, _cls in boxes:
        left, top, right, bottom = int(x1), int(y1), int(x2), int(y2)

        # Bounding box
        cv2.rectangle(frame, (left, top), (right, bottom), color, thickness)

        # Caption: putText anchors the text at its bottom-left corner, so it
        # normally sits 10 px above the box. When that would push the text
        # above the top of the image, draw it just inside the box instead.
        caption = f'{label} {conf:.2f}'
        (_, text_height), _ = cv2.getTextSize(caption, font, font_scale, thickness)
        baseline_y = top - 10
        if baseline_y - text_height < 0:
            baseline_y = top + text_height + 10
        cv2.putText(frame, caption, (left, baseline_y), font, font_scale, color, thickness)

    return frame

# Input video and annotated output locations
video_path = '/kaggle/input/exp-01-jun-2024-1145-cam1-4-mp4/Exp_01_Jun_2024_1145_cam1-4.mp4'
output_path = '/kaggle/working/output_video.mp4'

# Natural-language description used to select detections
description = "a dolphin"

# Open the input video
cap = cv2.VideoCapture(video_path)

# Stop immediately if the video could not be opened, before creating any output
# file; otherwise the loop below would be skipped and success would be reported.
if not cap.isOpened():
    cap.release()
    raise IOError(f"Error opening video stream or file: {video_path}")

# Read the video properties needed to configure the writer
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the codec and create the VideoWriter
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

try:
    # Process the video frame by frame
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Split the frame into four quadrants (slices are views into `frame`)
        frame_height, frame_width = frame.shape[:2]
        half_height, half_width = frame_height // 2, frame_width // 2

        quadrants = [
            frame[0:half_height, 0:half_width],
            frame[0:half_height, half_width:frame_width],
            frame[half_height:frame_height, 0:half_width],
            frame[half_height:frame_height, half_width:frame_width]
        ]

        # Detect and annotate each quadrant independently
        for i, quadrant in enumerate(quadrants):
            # process_frame performs its own BGR -> RGB conversion, so it must
            # receive the raw BGR quadrant; converting here as well would swap
            # the channels back and feed the models wrong colours.
            boxes = process_frame(quadrant, dino_model, dino_processor, clip_model, clip_processor, description)
            quadrants[i] = annotate_frame(quadrant, boxes)

        # Reassemble the quadrants into a single frame
        top_row = np.hstack((quadrants[0], quadrants[1]))
        bottom_row = np.hstack((quadrants[2], quadrants[3]))
        combined_frame = np.vstack((top_row, bottom_row))

        # Append the annotated frame to the output video
        out.write(combined_frame)
finally:
    # Always release the capture and writer, even if processing fails midway
    cap.release()
    out.release()

print(f"Video processing complete. The output video is saved to {output_path}")




2024-06-08 12:43:26.331410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-08 12:43:26.331509: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-08 12:43:26.475347: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Video processing complete. The output video is saved to /kaggle/working/output_video.mp4
