Split des vidéo en set train, test et val

In [12]:
import os
import glob
import random
import subprocess
from pathlib import Path

############################################
# PARAMÈTRES
############################################

input_points = "segments_points"
input_nonpoints = "segments_temps_hors_jeu"

output_points = "frames_points"
output_nonpoints = "frames_nonpoints"

# ratios de split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

############################################
# FONCTION D'EXTRACTION DE FRAMES
############################################

def extract_frames(video_path, output_folder):
    """
    Extrait les frames d'une vidéo dans un sous-dossier dédié.
    Les images seront nommées frame_00001.jpg, frame_00002.jpg…
    """
    os.makedirs(output_folder, exist_ok=True)

    cmd = [
        "ffmpeg",
        "-i", video_path,
        "-qscale:v", "2",
        os.path.join(output_folder, "frame_%05d.jpg"),
        "-y"
    ]

    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)


############################################
# RÉCUPÉRATION DES VIDÉOS
############################################

def list_videos(folder):
    return sorted(glob.glob(os.path.join(folder, "*.mp4")))

videos_points = list_videos(input_points)
videos_nonpoints = list_videos(input_nonpoints)

print(f"{len(videos_points)} vidéos points détectées.")
print(f"{len(videos_nonpoints)} vidéos non-points détectées.")


############################################
# SHUFFLE ET SPLIT
############################################

def split_list(lst, train_ratio, val_ratio):
    random.shuffle(lst)
    n = len(lst)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    train = lst[:n_train]
    val = lst[n_train:n_train+n_val]
    test = lst[n_train+n_val:]
    return train, val, test

points_train, points_val, points_test = split_list(videos_points, train_ratio, val_ratio)
nonpoints_train, nonpoints_val, nonpoints_test = split_list(videos_nonpoints, train_ratio, val_ratio)

############################################
# EXTRACTION DES FRAMES
############################################

def process_split(videos, output_root, class_label):
    csv_entries = []

    for video in videos:
        name = Path(video).stem  # exemple point_001
        output_folder = os.path.join(output_root, name)
        extract_frames(video, output_folder)

        csv_entries.append((output_folder, class_label))

    return csv_entries

print("Extraction frames pour TRAIN…")
train_entries = []
train_entries += process_split(points_train, output_points, 1)
train_entries += process_split(nonpoints_train, output_nonpoints, 0)

print("Extraction frames pour VAL…")
val_entries = []
val_entries += process_split(points_val, output_points, 1)
val_entries += process_split(nonpoints_val, output_nonpoints, 0)

print("Extraction frames pour TEST…")
test_entries = []
test_entries += process_split(points_test, output_points, 1)
test_entries += process_split(nonpoints_test, output_nonpoints, 0)


############################################
# GÉNÉRATION DES CSV
############################################

def write_csv(filename, entries):
    with open(filename, "w") as f:
        for folder, label in entries:
            f.write(f"{folder},{label}\n")

write_csv("train.csv", train_entries)
write_csv("val.csv", val_entries)
write_csv("test.csv", test_entries)

print("CSV générés : train.csv, val.csv, test.csv")


46 vidéos points détectées.
46 vidéos non-points détectées.
Extraction frames pour TRAIN…
Extraction frames pour VAL…
Extraction frames pour TEST…
CSV générés : train.csv, val.csv, test.csv


# Entrainement du modèle (CNN + LSTM)

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [2]:
##########################################
# 1) DATASET POUR LES SÉQUENCES DE FRAMES
##########################################

class FrameSequenceDataset(Dataset):
    """
    Dataset pour charger les frames pré-extraites d'une vidéo.
    Échantillonnage possible pour limiter le nombre de frames.
    """
    def __init__(self, csv_path, transform=None, frame_skip=2):
        self.items = []
        self.transform = transform
        self.frame_skip = frame_skip  # toutes les 'frame_skip' frames

        with open(csv_path, "r") as f:
            for line in f:
                folder, label = line.strip().split(",")
                self.items.append((folder, int(label)))

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        folder, label = self.items[idx]

        frame_files = sorted([f for f in os.listdir(folder) if f.lower().endswith(".jpg")])
        frame_files = frame_files[::self.frame_skip]  # échantillonnage

        frames = []
        for f in frame_files:
            img = Image.open(os.path.join(folder, f)).convert("RGB")
            if self.transform:
                img = self.transform(img)
            frames.append(img)

        video_tensor = torch.stack(frames)  # (T, 3, H, W)
        return video_tensor, label

##########################################
# 2) COLLATE FN POUR PAD LES SÉQUENCES
##########################################

def collate_fn(batch):
    sequences = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch], dtype=torch.long)
    lengths = torch.tensor([seq.shape[0] for seq in sequences])
    padded = pad_sequence(sequences, batch_first=True)
    return padded, lengths, labels

##########################################
# 3) MODELE CNN + LSTM
##########################################

class PointDetectorCNNLSTM(nn.Module):
    def __init__(self, hidden_size=256, num_layers=2, num_classes=2):
        super().__init__()
        resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        modules = list(resnet.children())[:-1]  # remove last fc
        self.cnn = nn.Sequential(*modules)
        self.feature_dim = 512

        self.lstm = nn.LSTM(
            input_size=self.feature_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )

        self.fc = nn.Linear(hidden_size*2, num_classes)

    def forward(self, x, lengths):
        B, T, C, H, W = x.shape
        x = x.view(B*T, C, H, W)
        feats = self.cnn(x)                 # (B*T, 512, 1, 1)
        feats = feats.view(B, T, self.feature_dim)

        packed = pack_padded_sequence(feats, lengths.cpu(), batch_first=True, enforce_sorted=False)
        output, _ = self.lstm(packed)
        output, _ = pad_packed_sequence(output, batch_first=True)

        # dernière frame utile pour chaque séquence
        last_outputs = torch.stack([output[i, length-1, :] for i, length in enumerate(lengths)])
        return self.fc(last_outputs)

##########################################
# 4) HYPERPARAMS + DATALOADERS
##########################################

transform = transforms.Compose([
    transforms.Resize((112,112)),  # réduit la mémoire
    transforms.ToTensor()
])

train_dataset = FrameSequenceDataset("train.csv", transform, frame_skip=2)
test_dataset  = FrameSequenceDataset("test.csv", transform, frame_skip=2)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)


In [3]:
##########################################
# 5) GPU SETUP
##########################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = PointDetectorCNNLSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

##########################################
# 6) TRAIN + EVAL FUNCTIONS
##########################################

def train_one_epoch(epoch):
    model.train()
    total_loss = 0
    for videos, lengths, labels in train_loader:
        videos = videos.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        preds = model(videos, lengths)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # libération mémoire GPU
        del videos, labels, preds
        torch.cuda.empty_cache()

    print(f"Epoch {epoch} - Train loss: {total_loss/len(train_loader):.4f}")

def evaluate():
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for videos, lengths, labels in test_loader:
            videos = videos.to(device)
            labels = labels.to(device)

            preds = model(videos, lengths)
            predicted = preds.argmax(dim=1)

            correct += (predicted == labels).sum().item()
            total += labels.size(0)

            del videos, labels, preds
            torch.cuda.empty_cache()

    print(f"Test accuracy: {100 * correct / total:.2f}%")

##########################################
# 7) TRAINING LOOP
##########################################

for epoch in range(10):
    train_one_epoch(epoch)
    evaluate()

torch.save(model.state_dict(), "point_detector_lstm_gpu_optimized.pt")
print("Modèle sauvegardé : point_detector_lstm_gpu_optimized.pt")

Using device: cuda
Epoch 0 - Train loss: 0.3302
Test accuracy: 68.75%
Epoch 1 - Train loss: 0.0245
Test accuracy: 100.00%
Epoch 2 - Train loss: 0.0688
Test accuracy: 87.50%
Epoch 3 - Train loss: 0.0786
Test accuracy: 81.25%
Epoch 4 - Train loss: 0.0049
Test accuracy: 87.50%
Epoch 5 - Train loss: 0.0019
Test accuracy: 87.50%
Epoch 6 - Train loss: 0.0014
Test accuracy: 87.50%
Epoch 7 - Train loss: 0.0010
Test accuracy: 87.50%
Epoch 8 - Train loss: 0.0008
Test accuracy: 87.50%
Epoch 9 - Train loss: 0.0007
Test accuracy: 87.50%
Modèle sauvegardé : point_detector_lstm_gpu_optimized.pt


# Application du modèle sur une nouvelle vidéo

Chargement du modèle

In [1]:
import torch
import torch.nn as nn
import torchvision.models as models


In [2]:
class PointDetectorResNetBiLSTM(nn.Module):
    def __init__(self, num_classes=2, hidden_size=256, num_layers=2):
        super().__init__()

        # 1) Charger ResNet18 ou ResNet34 (les deux sont compatibles avec les clés que tu montres)
        resnet = models.resnet18(weights=None)

        # 2) On enlève la dernière FC, on garde tout le CNN
        self.cnn = nn.Sequential(
            resnet.conv1,
            resnet.bn1,
            resnet.relu,
            resnet.maxpool,
            resnet.layer1,   # cnn.4.x dans tes clés
            resnet.layer2,   # cnn.5.x
            resnet.layer3,   # cnn.6.x
            resnet.layer4    # cnn.7.x
        )

        # 3) Adaptive pooling → vecteur 512
        self.pool = nn.AdaptiveAvgPool2d((1,1))

        # 4) BiLSTM
        self.lstm = nn.LSTM(
            input_size=512,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,    # ← obligatoire (clé reverse détectée)
            batch_first=True
        )

        # 5) Fully Connected  
        # bidirectional => 2 * hidden_size
        self.fc = nn.Linear(hidden_size * 2, num_classes)


    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B*T, C, H, W)

        # CNN
        f = self.cnn(x)
        f = self.pool(f)
        f = f.view(B, T, -1)   # → (B,T,512)

        # LSTM
        out, _ = self.lstm(f)
        out = out[:, -1]       # dernière frame
        out = self.fc(out)
        return out

In [4]:
model = PointDetectorResNetBiLSTM()
state = torch.load("point_detector_lstm_gpu_optimized.pt", map_location="cuda",weights_only=True)
model.load_state_dict(state)
model = model.to("cuda")
model.eval()
print("OK : modèle chargé correctement")


OK : modèle chargé correctement


Inférence

In [1]:
import cv2
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image


In [2]:
# Modèle CNN + LSTM pour la détection de points
class CNN_LSTM(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        # CNN = ResNet18
        base = models.resnet18(weights=None)
        self.cnn = nn.Sequential(*list(base.children())[:-1])
        cnn_feature_dim = 512

        # LSTM bidirectionnel, conforme au checkpoint
        self.lstm = nn.LSTM(
            input_size=cnn_feature_dim,
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

        self.fc = nn.Linear(512, num_classes)  # 256*2 pour bidirectional

    def forward(self, x):
        B, T, C, H, W = x.shape
        x = x.view(B*T, C, H, W)
        features = self.cnn(x)
        features = features.view(B, T, -1)
        lstm_out, _ = self.lstm(features)
        out = self.fc(lstm_out[:, -1, :])
        return out


In [3]:
# FONCTION D'EXTRACTION DE FRAMES AVEC TRANSFERT GPU
def extract_frames_gpu(video_path, device="cuda", target_size=(224,224), max_frames=None):
    """
    Extrait toutes les frames et les transfère sur le GPU.
    """
    transform = transforms.Compose([
        transforms.Resize(target_size),
        transforms.ToTensor()
    ])
    
    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frame_tensor = transform(frame)
        frames.append(frame_tensor)
        count += 1
        if max_frames and count >= max_frames:
            break

    cap.release()
    frames_gpu = torch.stack(frames).to(device)
    return frames_gpu, len(frames)


In [4]:
# Détection des segments dans les prédictions
def detect_segments(predictions):
    """
    Détecte les segments consécutifs où prediction=1.
    """
    segments = []
    in_segment = False
    start = None

    for i, p in enumerate(predictions):
        if p == 1 and not in_segment:
            in_segment = True
            start = i
        elif p == 0 and in_segment:
            in_segment = False
            end = i - 1
            segments.append((start, end))

    if in_segment:
        segments.append((start, len(predictions)-1))

    return segments


In [11]:
def run_inference_streaming(
    video_path,
    model_path,
    out_txt_path="new_points_frames.txt",
    threshold=0.5,
    chunk_size=200
):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using:", device)

    # Charger modèle
    model = CNN_LSTM().to(device)
    state = torch.load(model_path, map_location=device)
    model.load_state_dict(state)
    model.eval()

    # Préparation
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise RuntimeError("Impossible d’ouvrir la vidéo.")

    total_predictions = []  # toutes les classes frame par frame
    frame_idx = 0
    batch_frames = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Prépare la frame
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil = Image.fromarray(frame)
        tensor = transform(pil)
        batch_frames.append(tensor)

        # Quand chunk plein, on infère
        if len(batch_frames) == chunk_size:
            frames_gpu = torch.stack(batch_frames).unsqueeze(0).to(device)
            with torch.no_grad():
                logits = model(frames_gpu)
                prob = torch.softmax(logits, dim=1)[0, 1].cpu().numpy()
            preds = (prob >= threshold).astype(int)

            total_predictions.extend(preds.tolist())
            batch_frames = []  # Vide → libère RAM/Vram

        frame_idx += 1

    # Traiter les dernières frames restantes
    if len(batch_frames) > 0:
        frames_gpu = torch.stack(batch_frames).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = model(frames_gpu)
            prob = torch.softmax(logits, dim=1)[0, 1].cpu().numpy()
        preds = (prob >= threshold).astype(int)
        total_predictions.extend(preds.tolist())

    cap.release()
    print("Total frames traitées :", len(total_predictions))

    # Détection segments
    detected_segments = []
    in_seg = False
    start = None

    for i, p in enumerate(total_predictions):
        if p == 1 and not in_seg:
            in_seg = True
            start = i
        elif p == 0 and in_seg:
            in_seg = False
            detected_segments.append((start, i - 1))

    if in_seg:
        detected_segments.append((start, len(total_predictions) - 1))

    # Sauvegarde fichier txt
    with open(out_txt_path, "w") as f:
        for s, e in detected_segments:
            f.write(f"{s}-{e}\n")

    print(f"{len(detected_segments)} segments sauvegardés dans {out_txt_path}")
    return detected_segments


# Exécution de l'inférence
segments = run_inference_streaming(
    video_path="new_video.mp4",
    model_path="point_detector_lstm_gpu_optimized.pt",
    out_txt_path="new_points_frames.txt",
    threshold=0.5,
    chunk_size=200
)

segments


Using: cuda


  state = torch.load(model_path, map_location=device)


TypeError: 'int' object is not iterable

Using: cuda


  state = torch.load(model_path, map_location=device)


TypeError: 'int' object is not iterable