In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoImageProcessor, TimesformerForVideoClassification
import os
import cv2
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [20]:
# VIDEO_DIR = '/root/tatneft/datasets/violations_dataset/cuts1'
# LABELS_FILE = '/root/tatneft/datasets/violations_dataset/cuts1_train.txt'
# VAL_LABELS_FILE = '/root/tatneft/datasets/violations_dataset/cuts1_val.txt'


In [48]:
VIDEO_DIR = '/root/tatneft/datasets/violations_dataset/cuts1'
LABELS_FILE = '/root/tatneft/datasets/violations_dataset/cuts1_train.txt'
VAL_LABELS_FILE = '/root/tatneft/datasets/violations_dataset/cuts1_val.txt'
NUM_CLASSES = 12
BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 2e-5
FRAME_COUNT = 8
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [49]:
model_name = "facebook/timesformer-base-finetuned-k400"
processor = AutoImageProcessor.from_pretrained(model_name)
model = TimesformerForVideoClassification.from_pretrained(model_name)
model.classifier = nn.Linear(model.classifier.in_features, NUM_CLASSES)  # Изменяем последний слой под наше число классов
model.to(DEVICE)

  return torch.load(checkpoint_file, map_location="cpu")


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

In [51]:
def load_labels(label_file):
    """
    Load video file paths and corresponding labels from a text file.
    
    Args:
        label_file (str): Path to the file containing video paths and labels
        
    Returns:
        list: List of tuples containing (video_path, label)
    """
    labels = []
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                labels.append((parts[0], int(parts[1])))
    return labels

In [52]:
class VideoDataset(Dataset):
        """
        Custom PyTorch Dataset class for loading and processing video data.
    
        Args:
            video_dir (str): Directory containing video files
            label_file (str): File containing video labels
            frame_count (int): Number of frames to extract from each video
        """
    def __init__(self, video_dir, label_file, frame_count=8):
        self.video_dir = video_dir
        self.labels = load_labels(label_file)
        self.frame_count = frame_count

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Load and process a single video sample.
        
        Args:
            idx (int): Index of the sample to load
            
        Returns:
            tuple: (processed_frames, label) where:
                - processed_frames: Tensor of shape (num_frames, channels, height, width)
                - label: Integer class label
        """
        video_path, label = self.labels[idx]
        video_path = os.path.join(self.video_dir, video_path)
        cap = cv2.VideoCapture(video_path)
        frames = []

        if not cap.isOpened(): # Handle case where video cannot be opened
            print(f"Ошибка: Не удалось открыть видео {video_path}")
            return self.__getitem__((idx + 1) % len(self.labels))
         
        # Calculate frame sampling step
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(1, total_frames // self.frame_count) if total_frames > 0 else 1

        for i in range(self.frame_count):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (224, 224))  # Приводим к нужному размеру
            frames.append(frame)
        cap.release()

        # Handle case where no frames were read
        if len(frames) == 0:
            print(f"Ошибка: Видео {video_path} не содержит кадров")
            return self.__getitem__((idx + 1) % len(self.labels))

        # Pad with last frame if we didn't get enough frames
        while len(frames) < self.frame_count:
            frames.append(frames[-1])

        # Преобразуем кадры в формат, ожидаемый моделью (num_frames, channels, height, width)
        frames = np.array(frames)  # (num_frames, height, width, channels)
        frames = np.transpose(frames, (0, 3, 1, 2))  # (num_frames, channels, height, width)
        frames = torch.tensor(frames, dtype=torch.float32) / 255.0  # Нормализуем

        return frames, torch.tensor(label)

In [53]:
def collate_fn(batch):
    """
    Custom collate function to properly batch samples.
    
    Args:
        batch (list): List of samples from the dataset
        
    Returns:
        tuple: (batched_frames, batched_labels) where:
            - batched_frames: Tensor of shape (batch_size, num_frames, channels, height, width)
            - batched_labels: Tensor of shape (batch_size,)
    """
    # Собираем батч: (frames, labels)
    frames, labels = zip(*batch)
    
    # Преобразуем список тензоров (num_frames, channels, height, width) в один тензор
    frames = torch.stack(frames)  # (batch_size, num_frames, channels, height, width)
    
    # Меняем порядок осей для модели: (batch_size, num_frames, channels, height, width)
    # Это формат, который ожидает TimeSformer
    labels = torch.stack(labels)
    
    return frames, labels

In [54]:
train_dataset = VideoDataset(VIDEO_DIR, LABELS_FILE, frame_count=FRAME_COUNT)
val_dataset = VideoDataset(VIDEO_DIR, VAL_LABELS_FILE, frame_count=FRAME_COUNT)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Оптимизатор и функция потерь
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [55]:
def evaluate(model, dataloader):
    """
    Evaluate model performance with per-class metrics
    
    Args:
        model: The trained model
        dataloader: DataLoader for evaluation
        
    Returns:
        tuple: (val_loss, overall_metrics, class_metrics) where:
            - val_loss: average validation loss
            - overall_metrics: dictionary with overall metrics
            - class_metrics: dictionary with metrics per class
    """
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for videos, labels in dataloader:
            videos, labels = videos.to(DEVICE), labels.to(DEVICE)
            outputs = model(videos).logits
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()
            
            all_preds.extend(preds)
            all_labels.extend(labels)
    
    # Calculate overall metrics
    overall_metrics = {
        'loss': val_loss / len(dataloader),
        'accuracy': accuracy_score(all_labels, all_preds),
        'precision': precision_score(all_labels, all_preds, average='weighted', zero_division=0),
        'recall': recall_score(all_labels, all_preds, average='weighted', zero_division=0),
        'f1': f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    }
    
    # Calculate per-class metrics
    class_metrics = {}
    for class_id in range(NUM_CLASSES):
        # Create binary labels for this class
        binary_labels = (np.array(all_labels) == class_id).astype(int)
        binary_preds = (np.array(all_preds) == class_id).astype(int)
        
        # Skip if no true samples for this class
        if sum(binary_labels) == 0:
            continue
            
        class_metrics[class_id] = {
            'precision': precision_score(binary_labels, binary_preds, zero_division=0),
            'recall': recall_score(binary_labels, binary_preds, zero_division=0),
            'f1': f1_score(binary_labels, binary_preds, zero_division=0),
            'support': sum(binary_labels)  # Number of actual samples
        }
    
    return val_loss / len(dataloader), overall_metrics, class_metrics

In [56]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    # Training phase
    for videos, labels in train_loader:
        # ... (keep existing training code the same)
    
    # Evaluation phase
    val_loss, overall_metrics, class_metrics = evaluate(model, val_loader)
    
    # Print metrics
    print(f"Epoch [{epoch+1}/{EPOCHS}]")
    print(f"  Training Loss: {total_loss / len(train_loader):.4f}")
    print("\nOverall Validation Metrics:")
    print(f"  Loss: {val_loss:.4f}")
    print(f"  Accuracy: {overall_metrics['accuracy']:.4f}")
    print(f"  Precision: {overall_metrics['precision']:.4f}")
    print(f"  Recall: {overall_metrics['recall']:.4f}")
    print(f"  F1-score: {overall_metrics['f1']:.4f}")
    
    print("\nPer-Class Validation Metrics:")
    for class_id, metrics in class_metrics.items():
        print(f"  Class {class_id}:")
        print(f"    Precision: {metrics['precision']:.4f}")
        print(f"    Recall: {metrics['recall']:.4f}")
        print(f"    F1-score: {metrics['f1']:.4f}")
        print(f"    Support: {metrics['support']}")
    
    print("-" * 50)
    
    # Save model checkpoints
    if (epoch + 1) % 2 == 0:
        torch.save(model.state_dict(), f"timesformer_epoch_{epoch+1}.pth")

пошло
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло!!!
Эпоха [1/20]
  ▸ Потери на обучении: 1.9036
  ▸ Потери на валидации: 1.4612
  ▸ Accuracy: 0.6203
  ▸ Precision: 0.5163
  ▸ Recall: 0.6203
  ▸ F1-score: 0.5469
--------------------------------------------------
пошло
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло!!!
Эпоха [2/20]
  ▸ Потери на обучении: 0.9573
  ▸ Потери на валидации: 0.7944
  ▸ Accuracy: 0.7722
  ▸ Precision: 0.7401
  ▸ Recall: 0.7722
  ▸ F1-score: 0.7493
--------------------------------------------------
пошло
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20
пошло20


In [None]:
torch.save(model.state_dict(), "timesformer_finetuned.pth")

#inference

In [2]:
import torch
import cv2
import numpy as np
from torch.utils.data import Dataset
import os
from transformers import TimesformerForVideoClassification


In [4]:
MODEL_PATH = "timesformer_epoch_20.pth"  # Путь к сохраненной модели
VIDEO_DIR = "/path/to/your/videos"  # Директория с видео для inference
NUM_CLASSES = 19  # Должно совпадать с обучением
FRAME_COUNT = 8  # Должно совпадать с обучением
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    num_labels=NUM_CLASSES,
    ignore_mismatched_sizes=True
)
model.classifier = torch.nn.Linear(model.classifier.in_features, NUM_CLASSES)
model.load_state_dict(torch.load(MODEL_PATH))
model.to(DEVICE)
model.eval()

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([19, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([19]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(MODEL_PATH))


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

In [None]:
class InferenceVideoDataset(Dataset):

    def __init__(self, video_paths, frame_count=8):
        self.video_paths = video_paths
        self.frame_count = frame_count

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        cap = cv2.VideoCapture(video_path)
        frames = []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        step = max(1, total_frames // self.frame_count)

        for i in range(self.frame_count):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (224, 224))
            frames.append(frame)
        cap.release()

        # Если не хватает кадров, дублируем последний
        while len(frames) < self.frame_count:
            frames.append(frames[-1])

        # Преобразуем в формат для модели
        frames = np.array(frames)  # (num_frames, height, width, channels)
        frames = np.transpose(frames, (0, 3, 1, 2))  # (num_frames, channels, height, width)
        frames = torch.tensor(frames, dtype=torch.float32) / 255.0

        return frames, os.path.basename(video_path)


In [None]:
def predict_single_video(model, video_path):
    """
    Make prediction for a single video file.
    
    Args:
        model: Trained model
        video_path: Path to video file
        
    Returns:
        dict: Dictionary containing:
            - filename: Name of the video file
            - predicted_class: Predicted class index
            - confidence: Confidence score for prediction
            - probabilities: Array of probabilities for all classes
    """
    """Предсказание для одного видео"""
    dataset = InferenceVideoDataset([video_path], frame_count=FRAME_COUNT)
    frames, filename = dataset[0]
    
    with torch.no_grad():
        inputs = frames.unsqueeze(0).to(DEVICE)  # Добавляем batch dimension
        outputs = model(inputs).logits
        probs = torch.nn.functional.softmax(outputs, dim=1)
        pred_class = torch.argmax(probs).item()
        confidence = torch.max(probs).item()
    
    return {
        "filename": filename,
        "predicted_class": pred_class,
        "confidence": confidence,
        "probabilities": probs.cpu().numpy()[0]
    }


In [None]:
def predict_batch(model, video_dir):
    """
    Make predictions for all video files in a directory.
    
    Args:
        model: Trained model
        video_dir: Directory containing video files
        
    Returns:
        list: List of prediction dictionaries (same format as predict_single_video)
    """
    """Пакетное предсказание для всех видео в директории"""
    video_paths = [os.path.join(video_dir, f) for f in os.listdir(video_dir) 
                  if f.endswith(('.mp4', '.avi', '.mov'))]
    dataset = InferenceVideoDataset(video_paths, frame_count=FRAME_COUNT)
    
    results = []
    for frames, filename in dataset:
        with torch.no_grad():
            inputs = frames.unsqueeze(0).to(DEVICE)
            outputs = model(inputs).logits
            probs = torch.nn.functional.softmax(outputs, dim=1)
            pred_class = torch.argmax(probs).item()
            confidence = torch.max(probs).item()
        
        results.append({
            "filename": filename,
            "predicted_class": pred_class,
            "confidence": confidence,
            "probabilities": probs.cpu().numpy()[0]
        })
    
    return results


In [None]:
 # Вариант 1: Предсказание для одного видео
video_path = "/path/to/single/video.mp4"
result = predict_single_video(model, video_path)
print(f"Результат для {result['filename']}:")
print(f"Класс: {result['predicted_class']}, Уверенность: {result['confidence']:.2f}")
    

In [None]:
# Вариант 2: Пакетная обработка всех видео в директории
all_results = predict_batch(model, VIDEO_DIR)
for res in all_results:
    print(f"{res['filename']}: класс {res['predicted_class']} (уверенность: {res['confidence']:.2f})")