In [1]:
# !pip install transformers
# !pip install av
# !pip install matplotlib 
!pip install -U scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp39-cp39-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp39-cp39-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.6.0


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import tqdm
import av
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim

In [2]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# os.mkdir('results')

torch.manual_seed(0)
np.random.seed(0)

In [3]:
# Download UCF-101 dataset and labels
# Download data
# !curl -L -o Diving48_rgb.tar.gz https://nextcloud.nrp-nautilus.io/s/eqKMRFHqNCrP77L/download/Diving48_rgb.tar.gz
# !unrar x UCF101.rar
# !rm UCF101.rar

In [4]:
# Download train & test split
# !curl -L https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip -O UCF101TrainTestSplits-RecognitionTask.zip
# !unzip -q UCF101TrainTestSplits-RecognitionTask.zip
# !rm UCF101TrainTestSplits-RecognitionTask.zip

In [5]:
# !copy /b ./ucfTrainTestlist/testlist01.txt + ./ucfTrainTestlist/testlist02.txt + ./ucfTrainTestlist/testlist03.txt ./ucfTrainTestlist/testlist.txt
# !copy /b ./ucfTrainTestlist/trainlist01.txt + ./ucfTrainTestlist/trainlist02.txt + ./ucfTrainTestlist/trainlist03.txt ./ucfTrainTestlist/trainlist.txt

In [6]:
# UCF_CLASSES = ['ApplyEyeMakeup','ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking', 'Billiards', 'BlowDryHair', 'BlowingCandles', 'BodyWeightSquats', 'Bowling', 'BoxingPunchingBag', 'BoxingSpeedBag', 'BreastStroke', 'BrushingTeeth', 'CleanAndJerk', 'CliffDiving', 'CricketBowling', 'CricketShot', 'CuttingInKitchen', 'Diving', 'Drumming', 'Fencing', 'FieldHockeyPenalty', 'FloorGymnastics', 'FrisbeeCatch', 'FrontCrawl', 'GolfSwing', 'Haircut', 'Hammering', 'HammerThrow', 'HandstandPushups', 'HandstandWalking', 'HeadMassage', 'HighJump', 'HorseRace', 'HorseRiding', 'HulaHoop', 'IceDancing', 'JavelinThrow', 'JugglingBalls', 'JumpingJack', 'JumpRope', 'Kayaking', 'Knitting', 'LongJump', 'Lunges', 'MilitaryParade', 'Mixing', 'MoppingFloor', 'Nunchucks', 'ParallelBars', 'PizzaTossing', 'PlayingCello', 'PlayingDaf', 'PlayingDhol', 'PlayingFlute', 'PlayingGuitar', 'PlayingPiano', 'PlayingSitar', 'PlayingTabla', 'PlayingViolin', 'PoleVault', 'PommelHorse', 'PullUps', 'Punch', 'PushUps', 'Rafting', 'RockClimbingIndoor', 'RopeClimbing', 'Rowing', 'SalsaSpin', 'ShavingBeard', 'Shotput', 'SkateBoarding', 'Skiing', 'Skijet', 'SkyDiving', 'SoccerJuggling', 'SoccerPenalty', 'StillRings', 'SumoWrestling', 'Surfing', 'Swing', 'TableTennisShot', 'TaiChi', 'TennisSwing', 'ThrowDiscus', 'TrampolineJumping', 'Typing', 'UnevenBars', 'VolleyballSpiking', 'WalkingWithDog', 'WallPushups', 'WritingOnBoard', 'YoYo']

In [7]:
# ucf_train_df = pd.read_csv('ucfTrainTestlist/trainlist.txt', sep=' ', header=None)
# ucf_train_df.columns = ['id', 'label']

# ucf_valid_df = ucf_train_df.sample(frac=0.2)
# ucf_valid_df['id'] = ucf_valid_df['id'].apply(lambda x: f"./UCF-101/UCF-101/{x}")

# ucf_train_df = ucf_train_df.drop(ucf_valid_df.index)
# ucf_train_df['id'] = ucf_train_df['id'].apply(lambda x: f"./UCF-101/UCF-101/{x}")
# ucf_train_df['label'] = ucf_train_df['label'].apply(lambda x: x-1)
# ucf_valid_df['label'] = ucf_valid_df['label'].apply(lambda x: x-1)

# print(ucf_train_df.head())
# print("Number of rows : ", ucf_train_df.shape[0])

In [8]:
diving_train_df = pd.read_csv('./diving48_train_list_videos.txt', sep=' ', header=None)
diving_train_df.columns = ['id', 'label']

diving_train_df['id'] = diving_train_df['id'].apply(lambda x: f".\\diving48\\diving48\\{x}")

diving_train_df, diving_valid_df = train_test_split(diving_train_df, test_size=0.2, random_state=42)

print(diving_train_df.head())
print("Number of rows : ", diving_train_df.shape[0])

                                                 id  label
14311  .\diving48\diving48\19/siEI_jaSmd4_00081.mp4     34
9870   .\diving48\diving48\22/Bb0ZiYVNtDs_00199.mp4     22
553    .\diving48\diving48\19/VNvb5oLOpLg_00570.mp4      2
8116   .\diving48\diving48\21/sk8TafuB3lU_01024.mp4     12
7248   .\diving48\diving48\15/sk8TafuB3lU_00109.mp4     15
Number of rows :  12021


In [9]:
print(diving_valid_df.head())
print("Number of rows : ", diving_valid_df.shape[0])

                                                 id  label
2919   .\diving48\diving48\35/nOlRwoxsDJ0_00761.mp4     26
7414    .\diving48\diving48\1/sk8TafuB3lU_00295.mp4     17
3344   .\diving48\diving48\34/3N1kUtqJ25A_00004.mp4     34
11010  .\diving48\diving48\46/9jZYYtzYqwE_00021.mp4     46
8582   .\diving48\diving48\35/xbQCwTHcGN8_00007.mp4     35
Number of rows :  3006


In [10]:
diving_test_df = pd.read_csv('./diving48_val_list_videos.txt', sep=' ', header=None)
diving_test_df.columns = ['id', 'label']

diving_test_df['id'] = diving_test_df['id'].apply(lambda x: f"./diving48/diving48/{x}")

print(diving_test_df.head())
print("Number of rows : ", diving_test_df.shape[0])

                                             id  label
0  ./diving48/diving48/26/rRw7peH60Yw_00000.mp4     26
1  ./diving48/diving48/33/rRw7peH60Yw_00001.mp4     33
2  ./diving48/diving48/27/rRw7peH60Yw_00002.mp4     27
3  ./diving48/diving48/33/rRw7peH60Yw_00003.mp4     33
4  ./diving48/diving48/26/rRw7peH60Yw_00004.mp4     26
Number of rows :  1970


In [11]:
# ucf_class_df = pd.read_csv('ucfTrainTestlist/classInd.txt', sep=' ', header=None)
# ucf_class_df.columns = ['label', 'label_name']

# ucf_test_df = pd.read_csv('ucfTrainTestlist/testlist.txt', sep=' ', header=None)
# ucf_test_df.columns = ['id']
# ucf_test_df['label'] = ucf_test_df['id'].str.split('/').str[0]

# label_mapping = dict(zip(ucf_class_df['label_name'], ucf_class_df['label']))
# ucf_test_df['label'] = ucf_test_df['label'].map(label_mapping)

# ucf_test_label_df = ucf_test_df[['id', 'label']]
# ucf_test_df = ucf_test_df.drop(columns=['label'])

# ucf_test_label_df['label'] = ucf_test_label_df['label'].apply(lambda x: x-1)
# ucf_test_label_df['id'] = ucf_test_label_df['id'].apply(lambda x: f"./UCF-101/UCF-101/{x}")

# # N_CALL_UCF = ucf_test_df['label'].nunique()

# print(ucf_test_label_df.head())
# print("Number of rows : ", ucf_test_label_df.shape[0])

In [12]:
def read_video_pyav(container, indices):
    '''
    ...     Decode the video with PyAV decoder.
    ...     Args:
    ...         container (`av.container.input.InputContainer`): PyAV container.
    ...         indices (`List[int]`): List of frame indices to decode.
    ...     Returns:
    ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    ...     '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)

    if len(frames) == 0 :
        pass

    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [13]:
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)

    while converted_len >= seg_len:
        # You could either adjust clip_len or frame_sample_rate, or both
        # For example, reduce clip_len to fit the available frames:
        frame_sample_rate = seg_len // clip_len
        # Recalculate converted_len based on the adjusted clip_len
        converted_len = clip_len * frame_sample_rate


        if converted_len == seg_len:
            frame_sample_rate -= 1
            converted_len = clip_len * frame_sample_rate

    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

In [14]:
def format_video(video_path):
    container = av.open(video_path)
    seg_len = int(container.streams.video[0].frames)
    indices = sample_frame_indices(clip_len=8, frame_sample_rate=4, seg_len=seg_len)
    video = read_video_pyav(container, indices)
    return video

In [15]:
class CustomImageDataset(Dataset):
    def __init__(self, df):
        """
        Args:
            image_data (list or np.array): Preprocessed image data, should be in shape (num_samples, height, width, channels).
            labels (list or np.array): Labels corresponding to the images.
        """
        self.df = df

    def __len__(self):
        # Return the total number of samples
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Retrieve the image and label at index `idx`
        row = self.df.iloc[idx]
        image = row['id']
        label = int(row['label'])

        # If your image needs to be converted to a torch tensor
        # image = torch.tensor(image, dtype=torch.float32)  # Adjust dtype if necessary

        # Depending on your label format, convert the label
        label = torch.tensor(label, dtype=torch.long)  # Assuming it's a classification problem

        return image, label

In [16]:
# data = [ucf_train_df, ucf_valid_df, ucf_test_label_df]

data = [diving_train_df, diving_valid_df, diving_test_df]

train_dataset = CustomImageDataset(data[0])
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

valisation_dataset = CustomImageDataset(data[1])
val_loader = DataLoader(valisation_dataset, batch_size=1, shuffle=True)

evaluation_dataset = CustomImageDataset(data[2])
evaluation_loader = DataLoader(evaluation_dataset, batch_size=1, shuffle=True)

In [17]:
def evaluation_run(model, image_processor, criterion, evaluation_set):
    model.eval()

    with torch.no_grad():
        running_loss = 0.0
        correct = 0
        total = 0

        for vid_id, labels in tqdm.tqdm(evaluation_set):
            vid_id = vid_id[0]

            try:
                images = format_video(vid_id)

                images = torch.tensor(images, dtype=torch.float32)

                images = torch.squeeze(images)
                inputs = image_processor(list(images), return_tensors="pt")
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)


                outputs = model(**inputs)
                logits = outputs['logits']

            except Exception as e:
                total += 1 
                continue


            running_loss += criterion(logits, labels)
            _, predicted = torch.max(logits, 1)
            total += 1
            correct += (predicted == labels).sum().item()

    validation_loss = running_loss / len(evaluation_set)
    accuracy = (100 * correct) / total
    print(f"Evaluation : Loss: {validation_loss:.4f}, Accuracy: {accuracy:.2f}%")
    return validation_loss

In [18]:
BATCH_SIZE = 16

def validation_run(model, image_processor, criterion, validation_set):
    model.eval()

    with torch.no_grad():
        running_loss = 0.0
        correct = 0
        total = 0

        for vid_id, labels in tqdm.tqdm(validation_set):
            vid_id = vid_id[0]

            try:
                images = format_video(vid_id)

                images = torch.tensor(images, dtype=torch.float32)

                images = torch.squeeze(images)
                inputs = image_processor(list(images), return_tensors="pt")
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)


                outputs = model(**inputs)
                logits = outputs['logits']
            except Exception as e:
                continue

            running_loss += criterion(logits, labels)
            _, predicted = torch.max(logits, 1)
            total += 1
            correct += (predicted == labels).sum().item()

    validation_loss = running_loss / len(validation_set)
    accuracy = (100 * correct) / total
    print(f"Validation : Loss: {validation_loss:.4f}, Accuracy: {accuracy:.2f}%")
    return validation_loss

def train_model(model, image_processor, training_dataloader, criterion, optimizer, num_epochs=10, validation_dataloader=val_loader):
    validation_loss = []
    training_loss = []

    for epoch in range(num_epochs):
        i = 1
        running_loss = 0.0
        correct = 0
        total = 0

        loss = 0
        model.train(True)

        for vid_id, labels in tqdm.tqdm(training_dataloader):
            vid_id = vid_id[0]

            try:
                images = format_video(vid_id)

                images = torch.tensor(images, dtype=torch.float32)
                images = torch.squeeze(images)

                inputs = image_processor(list(images), return_tensors="pt")
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)

                outputs = model(**inputs)
                logits = outputs['logits']
            except Exception as e:
                total += 1
                torch.cuda.empty_cache()
                # print(e)
                continue
            
            # Calculer la perte
            loss = criterion(logits, labels) / BATCH_SIZE
            loss.backward()

            # Rétropropagation de la perte
            if (i+1) % BATCH_SIZE == 0:
              # Mettre à jour les paramètres du modèle
              optimizer.step()
              optimizer.zero_grad()

            i += 1

            # Calcul des statistiques
            running_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            total += 1
            try:
                correct += (predicted == labels).sum().item()
            except Exception as e:
                continue
        
        optimizer.step()
        optimizer.zero_grad()
        
        # Affichage des statistiques après chaque époque
        epoch_loss = running_loss / len(training_dataloader)
        accuracy = (100 * correct) / total
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%")
        val_loss = validation_run(model, image_processor, criterion, validation_dataloader)

        validation_loss.append(val_loss)
        training_loss.append(epoch_loss)

        torch.save(model.state_dict(), f"./results/training_{TRAINING}/weights_epoch%d.pt"%epoch)

    return training_loss, validation_loss

In [19]:
import sys
sys.modules.pop('implementations.llora_timesformer', None)

In [21]:
from transformers import AutoImageProcessor,TimesformerForVideoClassification

# import importlib
# impl_mod = importlib.import_module('implementations.prefix_timesformer')
# importlib.reload(impl_mod)
# TimesformerForVideoClassification = impl_mod.TimesformerForVideoClassification

TRAINING = 'DIVING_CLASSIFIER' # Modify this value each run
# os.mkdir(f"./results/training_{TRAINING}")

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400", num_labels=48, ignore_mismatched_sizes=True)
model.train(True)
model.to(DEVICE)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([48, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([48]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

In [22]:
for name, param in model.named_parameters():
    # if 'prefix' not in name and 'classifier' not in name:
    if 'classifier' not in name:
        param.requires_grad = False
    else :
        print(name)

classifier.weight
classifier.bias


In [23]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])

print(params)

36912


In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Set the scheduler to decay the LR by 10x at epochs 11 and 14
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[], gamma=0.1)

In [25]:
validation_loss, training_loss = train_model(model, image_processor, train_loader, criterion, optimizer, num_epochs=5)

100%|██████████| 12021/12021 [25:58<00:00,  7.71it/s]


Epoch [1/5], Loss: 0.2286, Accuracy: 18.58%


 74%|███████▍  | 2222/3006 [04:45<01:40,  7.81it/s]It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
100%|██████████| 3006/3006 [06:26<00:00,  7.77it/s]


Validation : Loss: 3.6006, Accuracy: 19.96%


100%|██████████| 12021/12021 [32:25<00:00,  6.18it/s]


Epoch [2/5], Loss: 0.2061, Accuracy: 24.95%


100%|██████████| 3006/3006 [08:40<00:00,  5.78it/s]


Validation : Loss: 3.2832, Accuracy: 27.18%


100%|██████████| 12021/12021 [28:27<00:00,  7.04it/s]


Epoch [3/5], Loss: 0.1975, Accuracy: 28.35%


100%|██████████| 3006/3006 [07:12<00:00,  6.95it/s]


Validation : Loss: 3.7449, Accuracy: 26.11%


100%|██████████| 12021/12021 [29:59<00:00,  6.68it/s] 


Epoch [4/5], Loss: 0.1969, Accuracy: 30.10%


100%|██████████| 3006/3006 [07:11<00:00,  6.97it/s]


Validation : Loss: 3.5003, Accuracy: 27.15%


100%|██████████| 12021/12021 [30:25<00:00,  6.58it/s]  


Epoch [5/5], Loss: 0.1912, Accuracy: 31.88%


100%|██████████| 3006/3006 [07:07<00:00,  7.03it/s]


Validation : Loss: 3.4961, Accuracy: 29.08%


In [26]:
evaluation_loss = evaluation_run(model, image_processor, criterion, evaluation_loader)

100%|██████████| 1970/1970 [04:41<00:00,  7.00it/s]

Evaluation : Loss: 4.6691, Accuracy: 16.50%



