## Fine-tuning de un Transformer ViViT para la clasificación binaria de videos pornográficos o no pornográficos utilizando LSPD: A Large-Scale Pornographic Dataset for Detection and Classification.

In [None]:
!pip install av datasets transformers tokenizers torch tensorboard

Collecting av
  Downloading av-14.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.w

In [None]:
import numpy as np
import os
import av
import torch
from transformers import VivitImageProcessor, VivitModel, VivitConfig, VivitForVideoClassification, Trainer, TrainingArguments#, EarlyStoppingCallback
from datasets import Dataset, load_from_disk, load_metric
from sklearn.metrics import accuracy_score

### COMPROBAR GPU

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Jun  9 04:39:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             44W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

### CONECTAR CON GOOGLE DRIVE Y COPIAR DATASET A COLAB

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable on Colab, which
# would explain why this import is not in the main imports cell - confirm.
from google.colab import drive
drive.mount('/content/drive')

### RECOLECCION DE DATOS (VIDEOS)

In [None]:
# Seed NumPy's global RNG so the random clip position drawn with
# np.random.randint in sample_frame_indices is repeatable.
np.random.seed(0)

In [None]:
# Number of frames to read from each video.
# NOTE(review): remove_bad_rows compares the frame axis against 32 and the
# google/vivit-b-16x2-kinetics400 checkpoint is presumably configured for
# 32-frame clips - confirm that 128 is really the intended value.
NO_OF_FRAMES = 128

# Sampling stride: one frame is kept out of every FRAME_SAMPLE_RATE frames.
FRAME_SAMPLE_RATE = 8

In [None]:
def read_video_pyav(container, indices):

    '''
    Decode the selected frames of a video with the PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode; the first and last entries
            are used as the lower and upper bounds of the decoding window.
    Returns:
        result (np.ndarray): Decoded RGB frames stacked as (num_frames, height, width, 3).
    Raises:
        ValueError: If none of the requested frames could be decoded.
    '''

    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]

    # Set membership is O(1); testing `i in indices` against an array or list
    # rescans every requested index for each decoded frame.
    wanted = {int(idx) for idx in indices}

    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            # Everything requested has already been seen; stop decoding early.
            break
        if i >= start_index and i in wanted:
            frames.append(frame.to_ndarray(format="rgb24"))

    if not frames:
        # np.stack would raise a ValueError here as well, with a less helpful message.
        raise ValueError("None of the requested frame indices could be decoded from the video")

    return np.stack(frames)

In [None]:
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):

    '''
    Sample frame indices from a randomly placed window of a video.
    Args:
        clip_len (`int`): Number of frame indices to return.
        frame_sample_rate (`int`): Stride between sampled frames (every n-th frame).
        seg_len (`int`): Exclusive upper bound for the end of the sampling window.
    Returns:
        indices (`np.ndarray`): `clip_len` int64 frame indices in ascending order.
    Raises:
        ValueError: If `seg_len` is not larger than `clip_len * frame_sample_rate`.
    '''

    # Span, in source frames, covered by the sampled clip.
    converted_len = int(clip_len * frame_sample_rate)

    if seg_len <= converted_len:
        # np.random.randint would raise ValueError("low >= high") anyway;
        # fail with a message that names the actual problem.
        raise ValueError(
            f"seg_len ({seg_len}) must be greater than "
            f"clip_len * frame_sample_rate ({converted_len})"
        )

    # Pick where the window ends at random, then derive where it starts.
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len

    # Evenly spaced positions in the window, clipped so the last one stays below end_idx.
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

    return indices

In [None]:
def frames_convert_and_create_dataset_dictionary(video_dir, format='.mp4'):

    '''
    Build the list of decoded videos and their labels from a directory.

    The label is inferred from the file path: paths containing 'non-porn' get
    label 1, other paths containing 'porn' get label 0. Files matching neither
    pattern are skipped, as are videos whose reported frame count is not larger
    than NO_OF_FRAMES * FRAME_SAMPLE_RATE.
    Args:
        video_dir (`str`): Directory that holds all the videos.
        format (`str`): File-name suffix used to select the video files.
    Returns:
        result (`List[Dict]`): One `{'video': np.ndarray, 'labels': int}` entry per usable video.
    '''

    all_videos = []

    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # random frame sampling reproducible from run to run.
    video_files = [
        os.path.join(video_dir, f)
        for f in sorted(os.listdir(video_dir))
        if f.endswith(format)
    ]

    for file in video_files:
        lowered = file.lower()

        # 'non-porn' has to be tested first because it contains 'porn' as a
        # substring; in the opposite order every file ends up with label 0.
        if 'non-porn' in lowered:
            label = 1
        elif 'porn' in lowered:
            label = 0
        else:
            # No label can be inferred; skip rather than reuse a stale label.
            print(f"Omitiendo '{file}': no se pudo inferir la etiqueta a partir del nombre.")
            continue

        # The context manager closes the container even if decoding fails.
        with av.open(file) as container:
            total_frames = container.streams.video[0].frames

            if total_frames > (NO_OF_FRAMES * FRAME_SAMPLE_RATE):
                # sample_frame_indices names its first parameter `clip_len`.
                indices = sample_frame_indices(clip_len=NO_OF_FRAMES, frame_sample_rate=FRAME_SAMPLE_RATE, seg_len=total_frames)
                video = read_video_pyav(container=container, indices=indices)
                all_videos.append({'video': video, 'labels': label})

    return all_videos

### PREPROCESAMIENTO DE VIDEOS

In [None]:
# Directory whose video files pre_process scans to build the dataset.
# NOTE(review): this is an absolute path at the filesystem root, not under the
# Drive mount point /content/drive - confirm the dataset is really copied here.
train_videos_path = "/LSPD/videos/"

In [None]:
def process_example(example, processor):

    '''
    Run one video through the ViViT image processor and attach its label.
    Args:
        example (`Dict{video, labels}`): Dictionary holding the video frames and its label.
        processor (`VivitImageProcessor`): Image processor of the ViViT model.
    Returns:
        result: The processor output (requested as PyTorch tensors) with the
            example's label stored under the 'labels' key.
    '''

    # Split the video array along its first axis into a list of frames.
    frames = [frame for frame in np.array(example['video'])]
    encoded = processor(frames, return_tensors='pt')
    encoded['labels'] = example['labels']
    return encoded


def create_vivit_dataset(list_of_dict, processor):

    '''
    Convert every video of the list into the format expected by ViViT.
    Args:
        list_of_dict (`List[Dict{video, labels}]`): List of dictionaries with videos and their labels.
        processor (`VivitImageProcessor`): Image processor of the ViViT model.
    Returns:
        result (`List`): One processed entry per input dictionary, in the same order.
    '''

    # process_example needs the processor as its second argument; a bare
    # map(process_example, list_of_dict) calls it with one argument and fails.
    return [process_example(example, processor) for example in list_of_dict]

In [None]:
def remove_bad_rows(dataset, min_frames=32):

    '''
    Drop the rows whose processed video has fewer than `min_frames` frames.
    Args:
        dataset: Row-iterable dataset exposing `select(indices)`; each row has a
            `pixel_values` entry whose axis 1 is measured (assumed to be the
            frame axis, with the batch axis first - TODO confirm).
        min_frames (`int`): Minimum size required along axis 1. Defaults to 32,
            the threshold this function previously hard-coded.
    Returns:
        result: `dataset.select(...)` restricted to the rows that pass the check.
    '''

    # len(x[0]) is the size of axis 1 and avoids building a full tensor per row.
    indices_to_keep = [
        i
        for i, row in enumerate(dataset)
        if len(row['pixel_values'][0]) >= min_frames
    ]

    return dataset.select(indices_to_keep)

In [None]:
def pre_process(path, processor):

    '''
    Build a Hugging Face dataset of processed videos and labels from a video directory.
    Args:
        path (`str`): Directory that contains the video files.
        processor (`VivitImageProcessor`): Image processor of the ViViT model.
    Returns:
        result (`datasets.Dataset`): Dataset with the processed `pixel_values`
            and a class-encoded `labels` column, without the rejected rows.
    '''

    print("\nCreando lista de diccionarios...\n")
    list_of_dictionaries = frames_convert_and_create_dataset_dictionary(video_dir=path)

    print("\nPasando por VivitImageProcessor...\n")
    dataset = create_vivit_dataset(list_of_dictionaries, processor)

    print("\nConversión al Dataset de Hugging Face...\n")
    dataset_hf = Dataset.from_list(dataset)

    print("\nAgregar etiquetas de codificación de clases...\n")
    dataset_hf = dataset_hf.class_encode_column("labels")

    print("\nEncontrar y eliminar filas defectuosas...\n")
    dataset_hf = remove_bad_rows(dataset_hf)

    print("\nProcesando pixeles...\n")
    # Kept on the CPU: the mapped value is written back into the dataset, so
    # moving it to the GPU first only adds a transfer (and a dependency on the
    # global `device`). squeeze() removes the size-1 axes, as before.
    dataset_hf = dataset_hf.map(lambda x: {'pixel_values': torch.tensor(x['pixel_values']).squeeze()})

    print("\nTerminado.\n")

    return dataset_hf

In [None]:
# Load the image processor published with the google/vivit-b-16x2-kinetics400 checkpoint.
image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

**PARTICION DE LOS DATOS PARA ENTRENAMIENTO Y PRUEBAS**

In [None]:
# Decode, label and preprocess every video, then hold out 20% for evaluation.
# The fixed seed makes the train/test partition reproducible across runs.
train_eval_dataset = pre_process(train_videos_path, image_processor)
train_eval_dataset = train_eval_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Persist the result of the train/test split under ./processed_datasets/train.
train_eval_dataset.save_to_disk("./processed_datasets/train")

In [None]:
# Reload the saved "train" and "test" splits from disk.
train_dataset = load_from_disk("./processed_datasets/train/train")
eval_dataset = load_from_disk("./processed_datasets/train/test")

### AJUSTE DE HIPERPARAMETROS

In [None]:
# Training configuration: evaluate, log and checkpoint once per epoch, and
# reload the checkpoint with the lowest eval_loss when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the Trainer cell already uses the matching `processing_class` API.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="epoch",
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    weight_decay=0.01,
    learning_rate=1e-5,
    metric_for_best_model="eval_loss",
    optim="adamw_torch",
    # Lower eval_loss is better.
    greater_is_better=False,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Previously passed twice (2 and 3), which is a SyntaxError ("keyword
    # argument repeated"); the later value is kept.
    save_total_limit=3,
    seed=42,
)

**CONFIGURAR MÉTRICA DE EVALUACIÓN - EXACTITUD (ACCURACY)**

In [None]:
def compute_metrics(pred):
    '''
    Compute the accuracy of a set of predictions.
    Args:
        pred: Pair `(logits, labels)`; the predicted class of each example is
            the argmax of its logits over the last axis.
    Returns:
        result (`Dict`): `{"accuracy": <accuracy score>}`.
    '''
    scores, references = pred
    predicted_classes = torch.tensor(scores).argmax(dim=-1)
    return {"accuracy": accuracy_score(references, predicted_classes)}

In [None]:
# Load the Kinetics-400 ViViT checkpoint but give it a freshly initialised
# 2-way classification head: the task is binary, while the checkpoint's own
# head predicts 400 classes.
# ignore_mismatched_sizes=True lets from_pretrained skip the checkpoint's
# classifier weights, whose shape no longer matches, instead of raising.
# Label ids follow frames_convert_and_create_dataset_dictionary
# (0 = porn, 1 = non-porn); this assumes class encoding keeps those ids - TODO confirm.
model = VivitForVideoClassification.from_pretrained(
    "google/vivit-b-16x2-kinetics400",
    num_labels=2,
    id2label={0: "porn", 1: "non-porn"},
    label2id={"porn": 0, "non-porn": 1},
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/18.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/356M [00:00<?, ?B/s]

In [None]:
# Display the loaded model's configuration as the cell output.
model.config

VivitConfig {
  "architectures": [
    "ViViTForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu_fast",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_32",
    "33": "LABEL_33",
    "34": "LABEL_34",
    "35": "LABEL_35",
    "36": "LABEL_3

In [None]:
# AdamW over all model parameters, handed explicitly to the Trainer.
# NOTE(review): lr here is 5e-05 while training_args sets learning_rate=1e-5;
# since this optimizer is passed to the Trainer, the value in training_args is
# presumably not the one applied - confirm which learning rate is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-05, betas=(0.9, 0.999), eps=1e-08)

In [None]:
# Assemble the Trainer from the model, the training configuration, both
# dataset splits, the image processor and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    processing_class=image_processor,
    # Custom optimizer with no scheduler supplied (None); presumably the
    # Trainer then builds its default scheduler - confirm.
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics
)

**SCRIPT PARA ASEGURARSE DE QUE EL ENTRENAMIENTO NO SEA INTERRUMPIDO POR INACTIVIDAD**

```javascript
// Logs a message and clicks the "#connect" element found inside the shadow
// root of the toolbar's <colab-connect-button>.
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
// Call ConnectButton every 60000 ms (once per minute).
setInterval(ConnectButton, 60000);
```

### AFINAR Y OBTENER RESULTADOS

In [None]:
# Run the fine-tuning; the returned object carries the metrics that are
# logged and saved afterwards (train_results.metrics).
train_results = trainer.train()

### GUARDAR MODELO

In [None]:
# Save the fine-tuned model, then log and save the training metrics and the trainer state.
# NOTE(review): the folder name mentions "deception_detection", which does not
# match this notebook's pornography-classification task - presumably a
# leftover from another project; confirm before renaming.
trainer.save_model("./vivit_finetuned_deception_detection")
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

### DATASET

Phan, D.-D., Nguyen, T.-T., Nguyen, Q.-H., Tran, H., Nguyen, K.-N.-K., & Vu, D.-L. (2022). LSPD: A Large-Scale Pornographic Dataset for Detection and Classification. International Journal of Intelligent Engineering and Systems, 15(1). https://doi.org/10.22266/ijies2022.0228.19