<a href="https://www.kaggle.com/code/abdullah892/balance-fall-assessment-withoutpose-using-videomae?scriptVersionId=270191682" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Notebook shell escapes: install the packages that the cells below import.
!pip install torch torchvision torchaudio  # If not installed; use --extra-index-url https://download.pytorch.org/whl/cu121 for CUDA
# transformers / datasets / av / evaluate are all imported by later cells.
!pip install transformers datasets av evaluate

In [None]:
import os
# Presumably read by TensorFlow's native logging if TensorFlow gets imported
# indirectly; set before the heavy imports in the following cells.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0 = all logs, 1 = info, 2 = warnings, 3 = errors only
import warnings
# NOTE(review): this silences every Python warning for the whole session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')  # Suppress Python warnings

In [None]:
import os
import av
import numpy as np
from pathlib import Path
from transformers import AutoImageProcessor, VideoMAEForVideoClassification, TrainingArguments, Trainer
from datasets import Dataset, Features, ClassLabel, Array4D
import evaluate
import torch
from tqdm import tqdm  # For progress bars if needed

# Seed NumPy's global RNG so np.random draws made after this cell are repeatable.
np.random.seed(0)

In [None]:
import os
from pathlib import Path

# Walk the dataset root and print a truncated listing of every directory, so the
# folder layout can be checked by eye before the loading cell hard-codes a path.
data_dir = Path('/kaggle/input/multiple-cameras-fall-dataset/')
for root, dirs, files in os.walk(data_dir):
    leading_dirs = dirs[:5]    # first few subdirectories only
    leading_files = files[:5]  # first few files only
    print(
        f"Root: {root}",
        f"  Dirs: {leading_dirs}...",
        f"  Files: {leading_files}...",
        "---",
        sep="\n",
    )

In [None]:
import av
import numpy as np
from pathlib import Path
from transformers import AutoImageProcessor
from datasets import Dataset, Features, Value, ClassLabel, Array4D

# Re-seed NumPy's global RNG: sample_frame_indices() draws the clip position
# with np.random.randint, so this makes the sampled clips repeatable.
np.random.seed(0)

# Frame sampling functions
def read_video_pyav(container, indices):
    """Decode the frames at ``indices`` and return them stacked in that order.

    The result always has exactly ``len(indices)`` frames:

    * a repeated index yields a repeated frame (the sampler produces repeats
      for videos shorter than the clip length);
    * an index that the stream never reaches (for example because the frame
      count in the header overstates the real length) is filled with the last
      frame that was decoded.

    Args:
        container: Open video container exposing ``seek(0)`` and
            ``decode(video=0)``, e.g. the object returned by ``av.open``.
        indices: Non-empty sequence of integer frame positions (0-based).

    Returns:
        ``np.ndarray`` whose first axis has length ``len(indices)``; each entry
        is what ``frame.to_ndarray(format="rgb24")`` returns for that frame.

    Raises:
        ValueError: If ``indices`` is empty or no requested frame was decoded.
    """
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")

    needed = set(wanted)      # O(1) membership test per decoded frame
    last_needed = max(wanted)

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_needed:
            break
        if i in needed:
            decoded[i] = frame.to_ndarray(format="rgb24")

    if not decoded:
        raise ValueError("none of the requested frames could be decoded")

    # Frames are decoded sequentially, so a missing index can only lie past the
    # end of the stream; reuse the last available frame to keep the clip length.
    fallback = decoded[max(decoded)]
    return np.stack([decoded.get(i, fallback) for i in wanted])

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Choose ``clip_len`` frame indices from a video of ``seg_len`` frames.

    When the video is longer than ``clip_len * frame_sample_rate`` frames, a
    window of that length is placed at a random position (drawn from NumPy's
    global RNG) and ``clip_len`` indices are spread evenly across it. Otherwise
    the indices are spread evenly over the whole video, which repeats frames
    when the video has fewer than ``clip_len`` frames.

    Args:
        clip_len: Number of indices to return.
        frame_sample_rate: Stride, in frames, between sampled indices inside
            the random window.
        seg_len: Total number of frames in the video; must be positive.

    Returns:
        ``np.ndarray`` of dtype ``int64`` with ``clip_len`` non-decreasing
        indices, each in ``[0, seg_len - 1]``.

    Raises:
        ValueError: If ``seg_len`` is not positive.
    """
    if seg_len <= 0:
        raise ValueError(f"seg_len must be positive, got {seg_len}")

    converted_len = int(clip_len * frame_sample_rate)
    # `<=` (not `<`): when the video is exactly as long as the window there is
    # no room to move it, and np.random.randint(n, n) would raise.
    if seg_len <= converted_len:
        indices = np.linspace(0, seg_len - 1, num=clip_len).astype(np.int64)
    else:
        end_idx = np.random.randint(converted_len, seg_len)
        start_idx = end_idx - converted_len
        indices = np.linspace(start_idx, end_idx, num=clip_len)
        # end_idx is exclusive, so pull the final sample back inside the window.
        indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# Preprocessor
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# Dataset loading
data_dir = Path('/kaggle/input/multiple-cameras-fall-dataset/dataset/dataset/')

# Scenario folders numbered up to this value are labelled as falls (label 1);
# higher-numbered folders are labelled as no-fall (label 0).
LAST_FALL_SCENARIO = 22

# Collect paths and labels
video_paths = []
labels = []

print("Collecting video paths...")
for folder in sorted(data_dir.iterdir()):
    folder_name = folder.name.lower()
    if not (folder.is_dir() and 'chute' in folder_name):
        continue
    try:
        scenario_num = int(folder_name.replace('chute', ''))
    except ValueError:
        # A folder that merely contains "chute" should not abort the whole scan.
        print(f"Skipping folder without a scenario number: {folder.name}")
        continue
    label = 1 if scenario_num <= LAST_FALL_SCENARIO else 0
    # glob() order depends on the filesystem; sort it so the sample order, and
    # with it the seeded frame sampling and train/test split, is reproducible.
    for video in sorted(folder.glob('cam*.avi')):
        video_paths.append(str(video))
        labels.append(label)

print(f"Total videos: {len(video_paths)} (Falls: {sum(labels)}, No-falls: {len(labels) - sum(labels)})")

# Process all videos
# Each video is reduced to one clip of CLIP_LEN frames and run through the image
# processor. A video that fails at any step is reported and skipped.
# NOTE(review): with frame_sample_rate=1 the clip is CLIP_LEN consecutive frames
# at a random position; nothing here checks that this window contains the
# event the video is labelled with — confirm this is intended.
print("\nProcessing all videos (this may take 5-15 minutes)...")
CLIP_LEN = 16
processed_videos = []
processed_labels = []
failed_count = 0

for idx, (video_path, label) in enumerate(zip(video_paths, labels)):
    if idx % 10 == 0:
        print(f"Progress: {idx}/{len(video_paths)} videos...")

    try:
        # `with` closes the container even when decoding raises; the previous
        # explicit close() was skipped on every failure path.
        with av.open(video_path) as container:
            video_stream = container.streams.video[0]
            total_frames = video_stream.frames

            # A reported frame count of 0 is handled by decoding the stream
            # once just to count its frames.
            if total_frames == 0:
                total_frames = sum(1 for _ in container.decode(video=0))
                container.seek(0)

            indices = sample_frame_indices(clip_len=CLIP_LEN, frame_sample_rate=1, seg_len=total_frames)
            video = read_video_pyav(container, indices)

        # A short clip would make the stacked array ragged further down, so
        # treat it as a per-video failure instead.
        if len(video) != CLIP_LEN:
            raise ValueError(f"decoded {len(video)} frames, expected {CLIP_LEN}")

        inputs = image_processor(list(video), return_tensors="pt")
        pixel_values = inputs['pixel_values'].squeeze(0).numpy()
    except Exception as e:
        # Deliberately broad: one unreadable file must not stop the batch.
        print(f"Failed to process {video_path}: {e}")
        failed_count += 1
    else:
        processed_videos.append(pixel_values)
        processed_labels.append(label)

print(f"\n✓ Processing complete!")
print(f"Successfully processed: {len(processed_videos)}/{len(video_paths)}")
print(f"Failed: {failed_count}")

# Convert to numpy array first
print("\nConverting to numpy arrays...")
if len(processed_videos) == 0:
    # Fail here with a clear message rather than with an IndexError on dataset[0].
    raise RuntimeError("No videos were processed successfully; cannot build a dataset.")
processed_videos = np.array(processed_videos)
processed_labels = np.array(processed_labels)

print(f"Videos array shape: {processed_videos.shape}")
print(f"Labels array shape: {processed_labels.shape}")

# Create dataset with processed videos
print("\nCreating HuggingFace dataset...")
dataset = Dataset.from_dict({
    "pixel_values": processed_videos,
    "label": processed_labels
})

# Cast to proper feature types
dataset = dataset.cast_column("label", ClassLabel(num_classes=2, names=['no_fall', 'fall']))

print(f"\n✓ Dataset created successfully!")
print(f"Columns: {dataset.column_names}")
print(f"Size: {len(dataset)}")
print(f"Sample pixel_values shape: {np.array(dataset[0]['pixel_values']).shape}")
print(f"Sample label: {dataset[0]['label']}")

# Split dataset
# Stratify on the label so both classes appear in train and test in the same
# proportion; a purely random split can leave the smaller class out of the test set.
# NOTE(review): the split is per video, so different camera files from the same
# scenario folder can end up on both sides — confirm this is acceptable.
print("\nSplitting dataset...")
try:
    dataset = dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="label")
except (TypeError, ValueError) as err:
    # Stratification is rejected when a class is too small (ValueError) or the
    # installed datasets version lacks the argument (TypeError).
    print(f"Stratified split unavailable ({err}); falling back to a random split.")
    dataset = dataset.train_test_split(test_size=0.2, seed=42)

print(f"\n{'='*60}")
print(f"PREPROCESSING COMPLETE!")
print(f"{'='*60}")
print(f"Train size: {len(dataset['train'])}")
print(f"Test size: {len(dataset['test'])}")
print(f"Columns: {dataset['train'].column_names}")
print(f"Train pixel_values shape: {np.array(dataset['train'][0]['pixel_values']).shape}")
print(f"{'='*60}\n")
print("✓ Ready for training!\n")

In [None]:
import os
import torch
import transformers
from transformers import VideoMAEForVideoClassification, TrainingArguments, Trainer
import evaluate

# Disable wandb
os.environ.update({"WANDB_MODE": "disabled", "WANDB_DISABLED": "true"})

# Check environment: report the accelerator and library version in use.
cuda_ready = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ready else "None"
print("=" * 60)
print("TRAINING SETUP")
print("=" * 60)
print("CUDA Available:", cuda_ready)
print("GPU Device:", gpu_name)
print("Transformers Version:", transformers.__version__)

# Verify dataset is ready: sizes and columns of the splits built earlier.
train_split = dataset['train']
print("\nDataset verification:")
print("Train size:", len(train_split))
print("Test size:", len(dataset['test']))
print("Train columns:", train_split.column_names)

# Set format to PyTorch so indexing returns tensors for the two model columns.
dataset.set_format(type='torch', columns=['pixel_values', 'label'])

first_train_row = dataset['train'][0]
print("Sample pixel_values shape:", first_train_row['pixel_values'].shape)
print("Sample label:", first_train_row['label'])
print("=" * 60 + "\n")

# Load model
# The checkpoint is loaded with a two-class head; ignore_mismatched_sizes lets
# from_pretrained accept weights whose shapes differ from the checkpoint
# (presumably the original classification head — confirm).
print("Loading VideoMAE model...")
id_to_name = {0: "no_fall", 1: "fall"}
name_to_id = {class_name: class_id for class_id, class_name in id_to_name.items()}
model = VideoMAEForVideoClassification.from_pretrained(
    "MCG-NJU/videomae-base-finetuned-kinetics",
    num_labels=len(id_to_name),
    id2label=id_to_name,
    label2id=name_to_id,
    ignore_mismatched_sizes=True,
)

# Move to GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(f"✓ Model loaded and moved to {device}\n")

# Class weights for imbalance: weight each class inversely to its frequency in
# the training split, using total / (num_classes * class_count). A fixed
# [1.0, 10.0] boosted class 1 regardless of which class is actually rarer.
train_label_ids = [int(y) for y in dataset['train']['label']]
# max(..., 1) avoids a division by zero if a class is absent from the split.
class_counts = [max(train_label_ids.count(class_id), 1) for class_id in (0, 1)]
class_weights = torch.tensor(
    [len(train_label_ids) / (len(class_counts) * count) for count in class_counts],
    dtype=torch.float32,
).to(device)
print(f"Class weights: {class_weights.cpu().numpy()}")

# Custom Trainer for class weights
class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy over the logits, weighted by the global ``class_weights``.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        targets = inputs.pop("labels")
        model_outputs = model(**inputs)
        weighted_ce = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = weighted_ce(model_outputs.logits, targets)
        if return_outputs:
            return loss, model_outputs
        return loss

# Data collator to rename 'label' to 'labels'
def collate_fn(batch):
    """Merge per-sample dicts into one batch dict for the model.

    Args:
        batch: Sequence of dicts, each with a ``'pixel_values'`` tensor and a
            ``'label'`` value.

    Returns:
        Dict with the stacked ``'pixel_values'`` and the labels gathered into
        one tensor under ``'labels'`` (the key name the trainer's loss reads).
    """
    clips = []
    targets = []
    for sample in batch:
        clips.append(sample['pixel_values'])
        targets.append(sample['label'])
    return {
        'pixel_values': torch.stack(clips),
        'labels': torch.tensor(targets),
    }

# Training arguments
# Collected in a dict first so the whole configuration reads as one table.
training_config = {
    "output_dir": "/kaggle/working/videomae-fall-detection",
    "logging_dir": "/kaggle/working/logs",
    "report_to": "none",
    # Schedule and optimisation
    "num_train_epochs": 10,
    "learning_rate": 5e-5,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    # Batching: 2 samples per step, gradients accumulated over 2 steps
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 2,
    "dataloader_num_workers": 0,
    # Mixed precision only when a CUDA device is present
    "fp16": torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch; reload the best-accuracy checkpoint at the end
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    "save_total_limit": 2,  # Keep only 2 best checkpoints
    # Logging cadence
    "logging_steps": 5,
    "logging_first_step": True,
}
training_args = TrainingArguments(**training_config)

print("\nTraining configuration:")
for caption, setting in (
    ("Epochs", training_args.num_train_epochs),
    ("Batch size", training_args.per_device_train_batch_size),
    ("Learning rate", training_args.learning_rate),
    ("FP16", training_args.fp16),
    ("Gradient accumulation", training_args.gradient_accumulation_steps),
):
    print(f"  {caption}: {setting}")

# Metrics
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions.
    """
    scores = torch.tensor(eval_pred.predictions)
    predicted_ids = scores.argmax(dim=-1)
    return accuracy.compute(predictions=predicted_ids, references=eval_pred.label_ids)

# Clear previous outputs
print("\nCleaning previous outputs...")
# Notebook shell escapes: empty the directories that TrainingArguments uses as
# output_dir and logging_dir, so files from an earlier run are not mixed in.
!rm -rf /kaggle/working/videomae-fall-detection/*
!rm -rf /kaggle/working/logs/*

# Check GPU memory
print("\nGPU Status:")
# Prints used and total GPU memory in CSV form.
# NOTE(review): presumably fails on a CPU-only session — confirm.
!nvidia-smi --query-gpu=memory.used,memory.total --format=csv

# Initialize Trainer
print("\nInitializing Trainer...")
import inspect

# Newer transformers releases take the processor as `processing_class` and
# deprecate `tokenizer`; choose whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(CustomTrainer.__init__).parameters
_processor_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
    # Passing the processor lets the Trainer save it alongside checkpoints.
    **{_processor_kwarg: image_processor},
)

print("✓ Trainer initialized\n")

# Train
import traceback

rule = "=" * 60
print(rule)
print("STARTING TRAINING")
print(rule)

# Any failure is reported with its traceback and then re-raised, so the cell
# still stops on error.
try:
    trainer.train()
    print("\n" + rule)
    print("✓ TRAINING COMPLETED SUCCESSFULLY!")
    print(rule)
except Exception as e:
    print("\n" + rule)
    print("✗ TRAINING FAILED")
    print(rule)
    print(f"Error: {e}")
    traceback.print_exc()
    raise

# Save final model together with the image processor in one directory.
export_dir = "/kaggle/working/videomae-fall-detection"
print("\nSaving model...")
trainer.save_model(export_dir)
image_processor.save_pretrained(export_dir)

print("\nSaved files:", os.listdir(export_dir))

# Final evaluation: print every metric returned by the trainer.
print("\n" + rule)
print("FINAL EVALUATION")
print(rule)
eval_results = trainer.evaluate()
for metric_name, metric_value in eval_results.items():
    print(f"{metric_name}: {metric_value}")
print(rule)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

print("Evaluating on entire test set...")

# Get all predictions
all_predictions = []  # predicted class id per test sample
all_labels = []       # reference class id per test sample (plain ints)
all_probs = []        # softmax probability of class 1 ("fall") per sample

# Set inference mode explicitly so this cell does not depend on which trainer
# call ran last (training leaves the model in train mode).
model.eval()

# Iterate rows directly: indexing dataset['test'][idx] twice per sample
# materialised the full pixel tensor twice.
for sample in dataset['test']:
    pixel_values = sample['pixel_values']
    # Store a Python int rather than a 0-d tensor for the metric functions.
    true_label = int(sample['label'])

    # Predict: add a batch axis and move the clip to the model's device.
    inputs = {'pixel_values': pixel_values.unsqueeze(0).to(device)}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()

    all_predictions.append(predicted_class)
    all_labels.append(true_label)
    all_probs.append(probabilities[0][1].item())  # Fall probability

# Classification report
# Plain ints keep the metric functions independent of how the labels were
# collected (0-d tensors or Python ints).
report_true = [int(y) for y in all_labels]
report_pred = [int(y) for y in all_predictions]
# Fix the label set: without it, a test split holding a single class makes
# target_names mismatch and shrinks the confusion matrix to 1x1.
CLASS_IDS = [0, 1]

print("\n" + "="*60)
print("CLASSIFICATION REPORT")
print("="*60)
print(classification_report(report_true, report_pred,
                            labels=CLASS_IDS,
                            target_names=['no_fall', 'fall'],
                            zero_division=0))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(report_true, report_pred, labels=CLASS_IDS)
print("\nConfusion Matrix:")
print(cm)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Fall', 'Fall'],
            yticklabels=['No Fall', 'Fall'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('/kaggle/working/confusion_matrix.png', dpi=300, bbox_inches='tight')
print("✓ Confusion matrix saved to /kaggle/working/confusion_matrix.png")
plt.show()

# Calculate additional metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Stored as `test_accuracy`, not `accuracy`: the global `accuracy` is the
# evaluate metric that compute_metrics() calls, and rebinding it to a float
# would break any later trainer.evaluate() call.
test_accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print("\n" + "="*60)
print("DETAILED METRICS")
print("="*60)
print(f"Accuracy:  {test_accuracy:.2%}")
print(f"Precision: {precision:.2%} (of predicted falls, how many are correct)")
print(f"Recall:    {recall:.2%} (of actual falls, how many detected)")
print(f"F1-Score:  {f1:.2%}")
print("="*60)