In [1]:
!pip install tf-keras
!pip install accelerate -U




[notice] A new release of pip is available: 23.2.1 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
print("hello")

hello


In [3]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

^C


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ViTModel, ViTFeatureExtractor, TrainingArguments, Trainer
from datasets import load_metric
import numpy as np
import cv2




In [13]:
class CustomImageDataset(Dataset):
    def __init__(self, images, labels, feature_extractor):
        self.images = images
        self.labels = labels
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        image_1 = self.images[idx][0]
        image_2 = self.images[idx][1]
        label = self.labels[idx]
        inputs_1 = self.feature_extractor(images=image_1, return_tensors="pt")
        inputs_2 = self.feature_extractor(images=image_2, return_tensors="pt")
        return {
            "pixel_values_1": inputs_1["pixel_values"].squeeze(), 
            "pixel_values_2": inputs_2["pixel_values"].squeeze(),# Remove batch dimension
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [14]:
def extract_frames(path, video_filename):
    
    # Используем OpenCV для чтения видео
    cap = cv2.VideoCapture(path + video_filename)

    # Получаем исходную частоту кадров видео
    original_fps = cap.get(cv2.CAP_PROP_FPS)

    frames = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Преобразуем кадр из BGR (OpenCV формат) в RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Изменяем размер кадра до 224x224 (или другого размера по вашему выбору)

        frame = cv2.resize(frame, (300, 300))
    
        # Проверяем, нужно ли сохранить текущий кадр
        if frame_count % int(original_fps ) == 0:
            frames.append(frame)  # Преобразуем в тензор и меняем порядок осей

        frame_count += 1

    cap.release()

    return np.stack(frames, axis=0)

In [15]:
import pandas as pd


def create_embedding_dataframe(video1_embeddings, video2_embeddings, interval1, interval2):
    start1, end1 = map(int, interval1.split('-'))
    start2, end2 = map(int, interval2.split('-'))

    if (end1 - start1) != (end2 - start2):
        raise ValueError("Интервалы должны быть одинаковой длины")

    frames1 = []
    frames2 = []
    is_match = []

    # Добавляем совпадающие пары из интервалов
    for i in range(end1 - start1):
        emb1 = video1_embeddings[start1 + i]
        emb2 = video2_embeddings[start2 + i]
        
        frames1.append(emb1)
        frames2.append(emb2)
        
        is_match.append(1)

    total_pairs = end1 - start1 + 1
    all_indices_video1 = set(range(len(video1_embeddings)))
    all_indices_video2 = set(range(len(video2_embeddings)))
    interval_indices_video1 = set(range(start1, end1 + 1))
    interval_indices_video2 = set(range(start2, end2 + 1))

    non_interval_indices_video1 = list(all_indices_video1 - interval_indices_video1)
    non_interval_indices_video2 = list(all_indices_video2 - interval_indices_video2)

    if len(non_interval_indices_video1) < total_pairs or len(non_interval_indices_video2) < total_pairs:
        raise ValueError("Недостаточно элементов вне интервала для создания случайных пар")

    random_pairs_added = 0
    while random_pairs_added < total_pairs:
        idx1 = np.random.choice(non_interval_indices_video1)
        idx2 = np.random.choice(non_interval_indices_video2)

        emb1 = video1_embeddings[idx1]
        emb2 = video2_embeddings[idx2]

        frames1.append(emb1)
        frames2.append(emb2)
        
        is_match.append(0)

        random_pairs_added += 1

    df_l = pd.DataFrame({
        'frames1': frames1,
        'frames2' : frames2,
        'is_match': is_match
    })

    df_l = df_l.sample(frac=1).reset_index(drop=True)

    return df_l

In [16]:
train_df = pd.DataFrame()
df = pd.read_csv("piracy_val.csv")
for _,row in df.iterrows():
    piracy_ = extract_frames('val/', row["ID_piracy"])
    license_ = extract_frames('index/', row["ID_license"])
    proxy_df = create_embedding_dataframe(piracy_, license_, row["segment"], row["segment.1"])
    train_df = pd.concat([train_df, proxy_df], ignore_index=True)

train_df.head(5)

Unnamed: 0,frames1,frames2,is_match
0,"[[[130, 156, 162], [132, 155, 162], [136, 157,...","[[[70, 48, 30], [77, 55, 37], [73, 51, 33], [7...",0
1,"[[[6, 0, 238], [6, 0, 238], [6, 0, 238], [6, 0...","[[[229, 227, 247], [229, 227, 247], [230, 228,...",0
2,"[[[131, 156, 167], [131, 156, 167], [131, 156,...","[[[170, 167, 154], [167, 168, 149], [166, 178,...",0
3,"[[[160, 140, 120], [160, 140, 120], [160, 140,...","[[[194, 194, 202], [194, 194, 202], [196, 196,...",1
4,"[[[129, 158, 174], [133, 158, 173], [135, 157,...","[[[119, 103, 116], [118, 101, 114], [115, 97, ...",1


In [17]:
def combine_columns_to_numpy(df_l, col1, col2):
    combined_list = []
    for _, row_l in df_l.iterrows():
        combined_array = np.stack((row_l[col1], row_l[col2]), axis=0)
        combined_list.append(combined_array)
    
    return np.array(combined_list)

In [18]:
from sklearn.model_selection import train_test_split

train_df.fillna(value=0, inplace=True)
y = train_df["is_match"].values
x = train_df[["frames1", "frames2"]]
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
images = np.array(combine_columns_to_numpy(x_train, "frames1", "frames2"), dtype=np.float32)


In [19]:
print(images.shape)

(2754, 2, 300, 300, 3)


In [20]:
# Пример данных
num_samples = 1000
num_classes = 2

# Загрузчик признаков
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# Создание датасета и загрузчика данных
dataset = CustomImageDataset(images, y_train, feature_extractor)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)




In [21]:
import torch.nn.functional as F
class ViTForImageClassification(torch.nn.Module):
    def __init__(self, num_classes=2):
        super(ViTForImageClassification, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.classifier = torch.nn.Linear(self.vit.config.hidden_size * 2, num_classes)

    def forward(self, pixel_values_1, pixel_values_2, labels=None):
        pixel_values_1 = pixel_values_1.to(device)
        pixel_values_2 = pixel_values_2.to(device)
        if labels is not None:
            labels = labels.to(device)
        outputs_1 = self.vit(pixel_values=pixel_values_1)
        outputs_2 = self.vit(pixel_values=pixel_values_2)
        
        # Extract the embeddings of the CLS token from both outputs
        cls_embedding_1 = outputs_1.last_hidden_state[:, 0, :]  # CLS token
        cls_embedding_2 = outputs_2.last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate the embeddings
        combined_embeddings = torch.cat((cls_embedding_1, cls_embedding_2), dim=1)
        
        # Pass the combined embeddings through the classifier
        logits = self.classifier(combined_embeddings)
        
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
            return loss, logits
        
        return logits.detach().cpu().numpy()

model = ViTForImageClassification()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define accuracy metric
metric = load_metric("accuracy", trust_remote_code=True)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    compute_metrics=compute_metrics,
)


cuda


In [22]:
trainer.train()


  context_layer = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
10,0.6916
20,0.6835
30,0.6931
40,0.6807
50,0.6694
60,0.6638
70,0.6671
80,0.6343
90,0.6311
100,0.5867


TrainOutput(global_step=519, training_loss=0.25449335842118787, metrics={'train_runtime': 256.238, 'train_samples_per_second': 32.243, 'train_steps_per_second': 2.025, 'total_flos': 0.0, 'train_loss': 0.25449335842118787, 'epoch': 3.0})

In [27]:
# Пример данных для тестирования
images_test = np.array(combine_columns_to_numpy(x_test, "frames1", "frames2"), dtype=np.float32)
test_dataset = CustomImageDataset(images_test, y_test, feature_extractor)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)


trainer.evaluate(test_dataset)


{'eval_loss': 0.05075128749012947,
 'eval_accuracy': 0.9869375907111756,
 'eval_runtime': 13.8442,
 'eval_samples_per_second': 49.768,
 'eval_steps_per_second': 3.178,
 'epoch': 3.0}

In [24]:
# model.save_pretrained('./saved_model')

AttributeError: 'ViTForImageClassification' object has no attribute 'save_pretrained'

In [25]:
torch.save(model.state_dict(), 'vit_weights.pth')

In [None]:
model = ViTForImageClassification()  
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()