In [1]:
# 1. Gerekli K√ºt√ºphaneleri Y√ºkle (TPU ortamƒ±nda bazen eksik olabilir)
!pip install transformers accelerate -q

import os
import cv2
import torch
import numpy as np
import requests
from PIL import Image
from google.colab import files
from sklearn.cluster import KMeans
from torchvision import transforms
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# --- TPU ƒ∞√áƒ∞N √ñZEL KISIM BA≈ûLANGICI ---
# CUDA yerine XLA k√ºt√ºphanesini kullanmaya √ßalƒ±≈üƒ±yoruz
try:
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
    print(f"‚úÖ TPU Ba≈üarƒ±yla Aktif Edildi! Cihaz: {device}")
except ImportError:
    print("‚ö†Ô∏è TPU k√ºt√ºphanesi (torch_xla) bulunamadƒ±. CPU kullanƒ±lƒ±yor.")
    device = "cpu"
# --- TPU ƒ∞√áƒ∞N √ñZEL KISIM Bƒ∞Tƒ∞≈ûƒ∞ ---

# 3. Video Y√ºkleme Kontrol√º
video_path = "/content/video.mp4"
if not os.path.exists(video_path):
    print("Video bulunamadƒ±, l√ºtfen y√ºkleyiniz:")
    uploaded = files.upload()

# --- MODEL Y√úKLEME ---
print("DINOv2 Modeli y√ºkleniyor...")
dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
dinov2.to(device)
dinov2.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
])

# --- Vƒ∞DEO OKUMA ---
cap = cv2.VideoCapture(video_path)
frames = []
frame_ids = []
frame_count = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # HIZLANDIRMA: Her 30. kareyi al
    if frame_count % 30 == 0:
        frames.append(frame)
        frame_ids.append(frame_count)
    frame_count += 1
cap.release()

if len(frames) == 0:
    raise ValueError("Hi√ß kare okunamadƒ±!")

# --- EMBEDDING √áIKARMA (TPU KULLANILACAK) ---
print(f"TPU √ºzerinde {len(frames)} kare i≈üleniyor...")
embeddings = []

for i, frame in enumerate(frames):
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Tens√∂r√º TPU cihazƒ±na g√∂nderiyoruz (.to(device))
    image_tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        embedding = dinov2(image_tensor)

        # TPU'dan sonucu geri alƒ±rken CPU'ya √ßekmemiz lazƒ±m (.cpu())
        embeddings.append(embedding.cpu().numpy().reshape(-1))

    if i % 10 == 0:
        # TPU i≈ülemlerini senkronize etmek bazen gerekebilir
        xm.mark_step()

embeddings = np.array(embeddings)

# --- K√úMELEME ---
NUM_CLUSTERS = min(5, len(frames))
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
labels = kmeans.fit_predict(embeddings)
keyframe_indices = []

for cluster_id in range(NUM_CLUSTERS):
    cluster_indices = np.where(labels == cluster_id)[0]
    cluster_embeddings = embeddings[cluster_indices]
    center = kmeans.cluster_centers_[cluster_id]
    distances = np.linalg.norm(cluster_embeddings - center, axis=1)
    best_local_idx = np.argmin(distances)
    keyframe_indices.append(cluster_indices[best_local_idx])

keyframe_indices = sorted(keyframe_indices)

# --- CAPTIONING ---
# Belleƒüi bo≈üalt
del dinov2
# TPU belleƒüini temizlemek biraz farklƒ±dƒ±r ama garbage collector yeterli olabilir
import gc
gc.collect()

print("BLIP-2 Modeli y√ºkleniyor...")
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Modeli TPU'ya atƒ±yoruz
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

print("\n--- SONU√áLAR ---")
for idx in keyframe_indices:
    frame = frames[idx]
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    inputs = processor(image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=30)

    caption = processor.decode(output[0], skip_special_tokens=True)
    print(f"Frame {frame_ids[idx]}: {caption}")

ModuleNotFoundError: No module named 'cv2'

In [9]:
!pip install opencv-python-headless

Collecting opencv-python-headless
  Downloading opencv_python_headless-4.13.0.90-cp37-abi3-manylinux_2_28_x86_64.whl.metadata (19 kB)
Downloading opencv_python_headless-4.13.0.90-cp37-abi3-manylinux_2_28_x86_64.whl (62.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m62.5/62.5 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.13.0.90


In [10]:
import cv2
import torch
import numpy as np
import os
from sklearn.cluster import KMeans
import gc
from PIL import Image
from torchvision import transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
from google.colab import files

# 1. Cihazƒ± Belirle (H100 de bir "cuda" cihazƒ±dƒ±r)
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Hangi GPU'nun √ßalƒ±≈ütƒ±ƒüƒ±nƒ± kontrol et
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    print(f"üöÄ KULLANILAN GPU: {gpu_name}")

    # H100 i√ßin Hƒ±zlandƒ±rma Ayarlarƒ±
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    print("‚úÖ H100 i√ßin TF32 (TensorFloat-32) modu a√ßƒ±ldƒ±!")
else:
    print("‚ö†Ô∏è Dƒ∞KKAT: GPU bulunamadƒ±! Ayarlardan GPU se√ßtiƒüine emin misin?")

# 3. Video Y√ºkleme Kontrol√º
video_path = "/content/video.mp4"
if not os.path.exists(video_path):
    print("Video bulunamadƒ±, l√ºtfen y√ºkleyiniz:")
    uploaded = files.upload()

# 4. Model Y√ºkleme (Small Model - Bellek Dostu)
print("DINOv2 (Small) Modeli Y√ºkleniyor...")
dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
dinov2.to(device)
dinov2.eval()

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
])

# 5. Video Okuma ve √ñrnekleme (Sampling)
cap = cv2.VideoCapture(video_path)
frames = []
frame_ids = []
frame_count = 0
SKIP_FRAMES = 30  # Her 30 karede bir √∂rnek al (Yakla≈üƒ±k 1 FPS)

print("Video okunuyor ve √∂rnekleniyor (Her 30. kare)...")
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    if frame_count % SKIP_FRAMES == 0:
        frames.append(frame)
        frame_ids.append(frame_count)
    frame_count += 1

cap.release()
print(f"Toplam Kare: {frame_count}, ƒ∞≈ülenecek Kare Sayƒ±sƒ±: {len(frames)}")

if len(frames) == 0:
    raise ValueError("Video okunamadƒ±!")

# 6. Embedding √áƒ±karma
embeddings = []
print("Embeddingler √ßƒ±karƒ±lƒ±yor...")

for i, frame in enumerate(frames):
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image_tensor = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        embedding = dinov2(image_tensor)
        # GPU sonucunu CPU'ya alƒ±p numpy array'e √ßeviriyoruz
        embeddings.append(embedding.cpu().numpy().reshape(-1))

embeddings = np.array(embeddings)
print(f"Embedding Shape: {embeddings.shape}")

# 7. K√ºmeleme (Keyframe Se√ßimi)
NUM_CLUSTERS = min(10, len(frames))
print(f"{NUM_CLUSTERS} adet anahtar kare se√ßiliyor...")

kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
labels = kmeans.fit_predict(embeddings)
keyframe_indices = []

for cluster_id in range(NUM_CLUSTERS):
    cluster_indices = np.where(labels == cluster_id)[0]
    cluster_embeddings = embeddings[cluster_indices]
    center = kmeans.cluster_centers_[cluster_id]
    distances = np.linalg.norm(cluster_embeddings - center, axis=1)
    best = cluster_indices[np.argmin(distances)]
    keyframe_indices.append(best)

keyframe_indices = sorted(keyframe_indices)

# 8. Bellek Temizliƒüi (Captioning √ñncesi)
del dinov2
del embeddings
gc.collect()
torch.cuda.empty_cache()  # GPU belleƒüini temizle

# 9. Captioning (Hafif Model ile)
print("Caption Modeli (BLIP-Large) Y√ºkleniyor...")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

print("\n--- SONU√áLAR ---")
for idx in keyframe_indices:
    frame = frames[idx]
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = processor(image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)

    caption = processor.decode(output[0], skip_special_tokens=True)
    print(f"Frame {frame_ids[idx]}: {caption}")



‚ö†Ô∏è Dƒ∞KKAT: GPU bulunamadƒ±! Ayarlardan GPU se√ßtiƒüine emin misin?
Video bulunamadƒ±, l√ºtfen y√ºkleyiniz:


Saving video.mp4 to video.mp4
DINOv2 (Small) Modeli Y√ºkleniyor...
Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip




Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84.2M/84.2M [00:00<00:00, 285MB/s]


Video okunuyor ve √∂rnekleniyor (Her 30. kare)...
Toplam Kare: 11087, ƒ∞≈ülenecek Kare Sayƒ±sƒ±: 370
Embeddingler √ßƒ±karƒ±lƒ±yor...
Embedding Shape: (370, 384)
10 adet anahtar kare se√ßiliyor...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Caption Modeli (BLIP-Large) Y√ºkleniyor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]


--- SONU√áLAR ---
Frame 480: arafed building with a sign that says surve grosy bye boy bab bab iridid
Frame 1410: there is a large green object on the ground next to a pile of trash
Frame 2370: there is a picture of a chair that has a yellow apple on it
Frame 2970: there is a bathroom with a toilet and a sink in it
Frame 3930: there is a television screen showing a television with a bunch of boxes
Frame 4440: a close up of a chair and a bag on the ground
Frame 4860: there is a man walking down a hallway with a dog
Frame 5700: a close up of a person standing on a stair case
Frame 9180: araf yassim hasi hasi tit haberde
Frame 10500: a close up of a hallway with a bed and a table


In [None]:
import cv2
import torch
import numpy as np
import os
from sklearn.cluster import KMeans
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from google.colab import files

# --- AYARLAR ---
SKIP_FRAMES = 30       # Her 30 karede bir i≈ülem yap (Yakla≈üƒ±k 1 FPS)
BLUR_THRESHOLD = 100.0 # Bulanƒ±klƒ±k e≈üiƒüi (D√º≈ü√ºk deƒüerler daha bulanƒ±k kabul edilir)
MIN_CLUSTERS = 5       # En az ka√ß keyframe olsun
MAX_CLUSTERS = 20      # En fazla ka√ß keyframe olsun

# 1. Cihazƒ± Belirle
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Hangi GPU'nun √ßalƒ±≈ütƒ±ƒüƒ±nƒ± kontrol et
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    print(f"üöÄ KULLANILAN GPU: {gpu_name}")

    # H100 vb. i√ßin Hƒ±zlandƒ±rma Ayarlarƒ±
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
else:
    print("‚ö†Ô∏è Dƒ∞KKAT: GPU bulunamadƒ±!")

# 3. Model Y√ºkleme (DINOv2 - Large)
print("DINOv2 (Large) Modeli Y√ºkleniyor...")
dinov2 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14')
dinov2.to(device)
dinov2.eval()

# 4. Video ƒ∞≈üleme
video_path = "/content/video.mp4"
if not os.path.exists(video_path):
    print("Video bulunamadƒ±, l√ºtfen y√ºkleyiniz:")
    uploaded = files.upload()

cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
if fps == 0: fps = 30 # Hata √∂nleyici

frames = []
frame_ids = []
frame_count = 0
valid_frame_count = 0

print("Video taranƒ±yor (√ñrnekleme + Bulanƒ±klƒ±k Kontrol√º)...")
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # 1. Adƒ±m: √ñrnekleme (Sadece belirli aralƒ±klarla kare al)
    if frame_count % SKIP_FRAMES == 0:
        # 2. Adƒ±m: Bulanƒ±klƒ±k Kontrol√º
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()

        if laplacian_var > BLUR_THRESHOLD:
            frames.append(frame)
            frame_ids.append(frame_count)
            valid_frame_count += 1

    frame_count += 1

cap.release()
print(f"Toplam Kare: {frame_count}")
print(f"Se√ßilen Net Kare Sayƒ±sƒ±: {len(frames)}")

if len(frames) == 0:
    raise ValueError("Hi√ß uygun kare bulunamadƒ±! E≈üik deƒüerini d√º≈ü√ºrmeyi deneyin.")

# 5. Embedding √áƒ±karma
embeddings = []
print("Embeddingler √ßƒ±karƒ±lƒ±yor...")

for frame in frames:
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # DINOv2 i√ßin resize
    image = image.resize((224, 224))
    image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0
    image_tensor = image_tensor.unsqueeze(0).to(device)

    with torch.no_grad():
        embedding = dinov2(image_tensor)

    embeddings.append(embedding.cpu().numpy()[0])

embeddings = np.array(embeddings)

# 6. Dinamik K√ºmeleme (Video s√ºresine g√∂re)
# √ñrnek: Her 10 saniye i√ßin 1 keyframe, ama min 5 max 20 arasƒ±nda tut.
duration_sec = frame_count / fps
dynamic_k = int(duration_sec / 10)
NUM_CLUSTERS = max(MIN_CLUSTERS, min(dynamic_k, MAX_CLUSTERS, len(frames)))

print(f"Video S√ºresi: {duration_sec:.1f}sn -> Hedeflenen Keyframe Sayƒ±sƒ±: {NUM_CLUSTERS}")

kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
labels = kmeans.fit_predict(embeddings)
keyframe_indices = []

for cluster_id in range(NUM_CLUSTERS):
    cluster_indices = np.where(labels == cluster_id)[0]
    cluster_embeddings = embeddings[cluster_indices]
    center = kmeans.cluster_centers_[cluster_id]
    distances = np.linalg.norm(cluster_embeddings - center, axis=1)
    best = cluster_indices[np.argmin(distances)]
    keyframe_indices.append(best)

keyframe_indices = sorted(keyframe_indices)

# 7. Captioning (BLIP-2)
print("BLIP-2 Modeli Y√ºkleniyor...")
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

print("\n--- SONU√áLAR ---")
for idx in keyframe_indices:
    frame = frames[idx]
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    inputs = processor(image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=30)

    caption = processor.decode(output[0], skip_special_tokens=True)
    print(f"Frame {frame_ids[idx]} ({frame_ids[idx]/fps:.1f}s): {caption}")

üöÄ KULLANILAN GPU: NVIDIA H100 80GB HBM3
DINOv2 (Large) Modeli Y√ºkleniyor...


Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


Video taranƒ±yor (√ñrnekleme + Bulanƒ±klƒ±k Kontrol√º)...
Toplam Kare: 11087
Se√ßilen Net Kare Sayƒ±sƒ±: 370
Embeddingler √ßƒ±karƒ±lƒ±yor...
Video S√ºresi: 443.5sn -> Hedeflenen Keyframe Sayƒ±sƒ±: 20
BLIP-2 Modeli Y√ºkleniyor...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


--- SONU√áLAR ---
Frame 240 (9.6s): turkey's president says he will not stop fighting in syria

Frame 390 (15.6s): syrias al-quds hospital in the city of aleppo was destroyed by the regime

Frame 1050 (42.0s): a news screen showing the aftermath of a bombing

Frame 1410 (56.4s): a television screen showing a picture of a gun and a bag

Frame 2370 (94.8s): a television screen shows a chair with a yellow object on it

Frame 2820 (112.8s): a television screen showing a bathroom with a toilet and trash

Frame 3060 (122.4s): a television screen showing a room with a chair and a table

Frame 3870 (154.8s): a television screen showing a room with boxes and other items

Frame 4200 (168.0s): a man is walking through a hallway with a television screen

Frame 6090 (243.6s): a tv screen showing a man walking down a hallway

Frame 6240 (249.6s): a man in a black hoodie is standing in a hallway

Frame 6720 (268.8s): a man in black jacket and hoodie is holding a gun

Frame 6960 (278.4s): a televisio