In [13]:
# Step 1: Install Required Libraries (Run in Colab)
!pip install torch transformers huggingface_hub opencv-python numpy ipywidgets scikit-learn

# Step 2: Import Libraries and Set Up Configurations
import torch
import torch.nn as nn
import numpy as np
import cv2
from transformers import AutoImageProcessor, AutoModelForImageClassification
from IPython.display import display
import ipywidgets as widgets
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import logging

# Initialize logging to ensure real-time display in Colab
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

# Configuration class for model and data parameters
class Config:
    FRAME_SIZE = (224, 224)  # Target frame size for CNN
    NUM_FRAMES = 32  # Fixed number of frames for temporal consistency
    CNN_FEATURE_DIM = 512  # Output dimension after CNN processing
    LSTM_HIDDEN_DIM = 256  # Hidden dimension for LSTM
    NUM_CLASSES = 2  # Binary classification (0 or 1)
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Step 3: Diffusion Model for Feature Extraction with Reconstruction Error Detection
class DiffusionFeatureExtractor:
    def __init__(self, model_name="google/vit-base-patch16-224"):
        try:
            self.feature_extractor = AutoImageProcessor.from_pretrained(model_name, use_fast=True)
            self.model = AutoModelForImageClassification.from_pretrained(model_name).to(Config.DEVICE)
            logging.info("Diffusion model and feature extractor loaded successfully.")
        except Exception as e:
            logging.error("Error loading model:", e)
            raise

    def extract_features_with_reconstruction_error(self, frames):
        inputs = self.feature_extractor(images=frames, return_tensors="pt", do_rescale=False).to(Config.DEVICE)

        # Extract original frame features
        with torch.no_grad():
            original_features = self.model(**inputs).logits

        # Perturb frames and reprocess to simulate reconstruction
        reconstructed_frames = frames * 0.95
        reconstructed_inputs = self.feature_extractor(images=reconstructed_frames, return_tensors="pt", do_rescale=False).to(Config.DEVICE)

        with torch.no_grad():
            reconstructed_features = self.model(**reconstructed_inputs).logits
        reconstruction_error = (original_features - reconstructed_features).abs()

        # Flatten and interpolate directly to target dimensions
        reconstruction_error = reconstruction_error.flatten(start_dim=1)  # Flatten each frame
        reconstruction_error = reconstruction_error.view(Config.NUM_FRAMES, 1, -1, 1)  # Reshape for interpolation
        reconstruction_error = F.interpolate(reconstruction_error, size=(32, 32), mode='bilinear', align_corners=False)

        return reconstruction_error

# Step 4: Video Preprocessing Function
def video_to_frames(video_path, frame_size=(224, 224), num_frames=32):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame_count < num_frames:
        logging.warning(f"Video only has {frame_count} frames, expected {num_frames}.")

    while cap.isOpened() and len(frames) < num_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, frame_size)
        frames.append(frame)
    cap.release()
    frames = np.array(frames) / 255.0
    logging.info(f"Processed {len(frames)} frames from the video.")
    return frames

# Step 5: CNN+LSTM Model for Classification
class CNNLSTMModel(nn.Module):
    def __init__(self):
        super(CNNLSTMModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(32 * 8 * 8, Config.CNN_FEATURE_DIM)
        self.lstm = nn.LSTM(Config.CNN_FEATURE_DIM, Config.LSTM_HIDDEN_DIM, batch_first=True)
        self.classifier = nn.Linear(Config.LSTM_HIDDEN_DIM, Config.NUM_CLASSES)  # Binary output

    def forward(self, x):
        batch_size, num_frames, channels, height, width = x.size()
        cnn_out = []
        for t in range(num_frames):
            frame_features = x[:, t]
            cnn_out.append(self.fc(self.flatten(self.cnn(frame_features))))
        cnn_out = torch.stack(cnn_out, dim=1)
        lstm_out, _ = self.lstm(cnn_out)
        final_output = self.classifier(lstm_out[:, -1, :])
        return final_output

# Step 6: Define Evaluation Metrics
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)  # Handle undefined precision gracefully
    recall = recall_score(y_true, y_pred, zero_division=0)  # Handle undefined recall gracefully
    f1 = f1_score(y_true, y_pred, zero_division=0)  # Handle undefined F1 score gracefully
    logging.info(f"Metrics - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
    return accuracy, precision, recall, f1

# Function to calculate reconstruction error (optional)
def calculate_reconstruction_error(original_features, reconstructed_features):
    error = (original_features - reconstructed_features).abs().mean().item()
    return error

# Step 7: Pipeline Function for End-to-End Inference and Evaluation
def run_pipeline(video_path, y_true):
    frames = video_to_frames(video_path)
    if len(frames) == 0:
        logging.error("Error: No frames were extracted from the video.")
        return

    diffusion_model = DiffusionFeatureExtractor()
    reconstruction_error_features = diffusion_model.extract_features_with_reconstruction_error(frames)
    reconstruction_error_features = reconstruction_error_features.unsqueeze(0).to(Config.DEVICE)

    cnn_lstm_model = CNNLSTMModel().to(Config.DEVICE)
    cnn_lstm_model.eval()
    with torch.no_grad():
        prediction = cnn_lstm_model(reconstruction_error_features)
    predicted_class = torch.argmax(prediction, dim=1).item()

    accuracy, precision, recall, f1 = calculate_metrics([y_true], [predicted_class])
    logging.info(f"Binary Classification Result: {predicted_class}")

    original_features = diffusion_model.model(**diffusion_model.feature_extractor(images=frames, return_tensors="pt", do_rescale=False)).logits
    reconstructed_frames = frames * 0.95
    reconstructed_features = diffusion_model.model(**diffusion_model.feature_extractor(images=reconstructed_frames, return_tensors="pt", do_rescale=False)).logits
    reconstruction_error = calculate_reconstruction_error(original_features, reconstructed_features)
    logging.info(f"Reconstruction Error (Anomaly Detection): {reconstruction_error:.4f}")

    return predicted_class, accuracy, precision, recall, f1, reconstruction_error

# Step 8: Set Up Video Upload Button in Colab and Display Prediction
def upload_and_predict(y_true):
    upload_button = widgets.FileUpload(accept=".mp4", multiple=False)
    display(upload_button)

    def on_upload_change(change):
        for name, file_info in upload_button.value.items():
            video_path = '/content/' + name
            with open(video_path, 'wb') as f:
                f.write(file_info['content'])

            logging.info("Running pipeline on uploaded video...")
            predicted_class, accuracy, precision, recall, f1, reconstruction_error = run_pipeline(video_path, y_true)
            if predicted_class is not None:
                logging.info(f"Final Prediction: {predicted_class} | Metrics - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")
                logging.info(f"Reconstruction Error (Anomaly Detection): {reconstruction_error:.4f}")

    upload_button.observe(on_upload_change, names='value')

# Run this function with the true label (e.g., 0 or 1) for the uploaded video
upload_and_predict(y_true=1)  # Example true label for the video




FileUpload(value={}, accept='.mp4', description='Upload')

2024-10-31 04:26:12,320 - INFO - Running pipeline on uploaded video...
2024-10-31 04:26:12,365 - INFO - Processed 32 frames from the video.
2024-10-31 04:26:12,669 - INFO - Diffusion model and feature extractor loaded successfully.
2024-10-31 04:26:51,827 - INFO - Metrics - Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00
2024-10-31 04:26:51,829 - INFO - Binary Classification Result: 1
2024-10-31 04:27:33,288 - INFO - Reconstruction Error (Anomaly Detection): 0.0250
2024-10-31 04:27:33,602 - INFO - Final Prediction: 1 | Metrics - Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00
2024-10-31 04:27:33,604 - INFO - Reconstruction Error (Anomaly Detection): 0.0250


In [None]:
from sagemaker.pytorch import PyTorchModel

model = PyTorchModel(
    model_data='s3://your-model-path/model.tar.gz',
    role='<your-sagemaker-role>',
    entry_point='inference.py',  # Inference script name
    framework_version='1.8',
    py_version='py3',
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge'
)


In [None]:
import boto3

runtime = boto3.client('runtime.sagemaker')
response = runtime.invoke_endpoint(
    EndpointName='<your-endpoint-name>',
    ContentType='application/json',
    Body=json.dumps({'video_uri': 's3://your-bucket/video.mp4'})
)
