# De-identification Pipeline (Colab Version)

This notebook runs the full media de-identification pipeline based on your `pipeline_batch.py` script.

**Features:**
1. Downloads video from YouTube.
2. Extracts audio for transcription.
3. Blurs faces in the video (Optimized for Colab/CPU).
4. Transcribes audio using Azure Speech-to-Text.
5. Redacts PHI from the transcript using Azure Health De-identification (the `redact_phi` call is commented out in the final cell; uncomment it to enable redaction).
6. Recombines the face-blurred video with the extracted audio track (the audio itself is not modified).

**Setup:**
1. Set your Azure secrets in the Configuration cell.
2. (Optional) A GPU runtime (Runtime > Change runtime type > T4 GPU) is not required: face detection in this notebook runs on the CPU through OpenCV.

In [None]:
# Install the third-party packages imported by the next cell (Azure SDKs, yt-dlp, OpenCV, NumPy, requests).
# ffmpeg is invoked through subprocess in later cells and is NOT installed by this command.
!pip install azure-cognitiveservices-speech azure-health-deidentification azure-identity azure-storage-blob yt-dlp opencv-python-headless numpy requests

In [None]:
import os
import time
import json
import requests
import yt_dlp
import cv2
import subprocess
import numpy as np
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
from azure.health.deidentification import DeidentificationClient
from azure.identity import DefaultAzureCredential
from datetime import datetime, timedelta, timezone

# --- CONFIGURATION ---
# TODO: Replace these with your actual keys
YOUTUBE_URL = "https://www.youtube.com/watch?v=IrCmLrUXdmo"
SPEECH_KEY = "YOUR_SPEECH_KEY"
SPEECH_REGION = "eastus"
DEID_SERVICE_ENDPOINT = "YOUR_DEID_ENDPOINT"
STORAGE_CONN_STR = "YOUR_STORAGE_CONNECTION_STRING"

# Names of the Azure settings that still hold their shipped placeholder value.
# All three are checked (not only SPEECH_KEY) so a half-configured notebook is
# reported here instead of failing later inside an Azure call.
UNSET_SETTINGS = [
    name
    for name, value, placeholder in (
        ("SPEECH_KEY", SPEECH_KEY, "YOUR_SPEECH_KEY"),
        ("DEID_SERVICE_ENDPOINT", DEID_SERVICE_ENDPOINT, "YOUR_DEID_ENDPOINT"),
        ("STORAGE_CONN_STR", STORAGE_CONN_STR, "YOUR_STORAGE_CONNECTION_STRING"),
    )
    if value == placeholder
]

if UNSET_SETTINGS:
    print("WARNING: You need to set your Azure API keys in this cell!")
    print(f"Still using placeholder values: {', '.join(UNSET_SETTINGS)}")

In [None]:
def download_video_and_extract_audio(youtube_url, output_filename="colab_media"):
    """Download a video with yt-dlp and extract its audio track to a WAV file.

    Args:
        youtube_url: URL handed to yt-dlp.
        output_filename: Base name (without extension) of the files written to
            the current working directory.

    Returns:
        ``(video_path, audio_path)`` on success, or ``(None, None)`` on any
        failure (the error is printed rather than raised).
    """
    print(f"Downloading video from {youtube_url}...")
    ydl_opts = {
        'format': 'best[ext=mp4]/best',
        'outtmpl': output_filename + '.%(ext)s',
        'quiet': True,
    }
    video_path = f"{output_filename}.mp4"
    audio_path = f"{output_filename}.wav"

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])

        if not os.path.exists(video_path):
            # The format selector can fall back to a non-mp4 container, so look
            # for "<output_filename>.<ext>". Skip our own WAV output and
            # yt-dlp's temporary/partial files, which are not playable videos.
            skipped_suffixes = (".wav", ".part", ".ytdl")
            candidates = sorted(
                name
                for name in os.listdir(".")
                if name.startswith(output_filename + ".")
                and not name.endswith(skipped_suffixes)
            )
            if not candidates:
                raise FileNotFoundError(
                    f"No downloaded video found for '{output_filename}'"
                )
            video_path = candidates[0]

        print(f"Video downloaded: {video_path}")

        print("Extracting audio...")
        # 16 kHz, mono, 16-bit PCM WAV. stderr is captured (not discarded) so a
        # failure can be reported with ffmpeg's own diagnostics.
        result = subprocess.run(
            [
                'ffmpeg', '-y', '-i', video_path,
                '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1',
                audio_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
        if result.returncode != 0:
            stderr_tail = "\n".join(result.stderr.strip().splitlines()[-5:])
            raise RuntimeError(
                f"ffmpeg audio extraction failed (exit code {result.returncode}):\n{stderr_tail}"
            )

        return video_path, audio_path
    except Exception as e:
        # Boundary handler: callers test the returned paths for None instead of
        # catching exceptions, so every failure is reported and mapped to that.
        print(f"Error: {e}")
        return None, None

def blur_faces(video_path, output_path, scale_factor=0.5):
    """Blur detected faces in a video and write the result to ``output_path``.

    Faces are detected with OpenCV's frontal-face Haar cascade on a downscaled
    grayscale copy of each frame; the matching region of the full-size frame is
    then Gaussian-blurred. Frames (or faces) the detector misses are written
    unchanged.

    Args:
        video_path: Path of the input video.
        output_path: Path of the video file to write (``mp4v`` codec).
        scale_factor: Resize factor applied before detection (default 0.5).
            Must be greater than 0.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If ``scale_factor`` is not positive.
        RuntimeError: If the face detector, the input video or the output
            writer cannot be opened.
    """
    if not scale_factor > 0:
        raise ValueError(f"scale_factor must be > 0, got {scale_factor!r}")

    print(f"Blurring faces in {video_path}...")

    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    if face_cascade.empty():
        raise RuntimeError("Could not load the Haar cascade face detector")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise RuntimeError(f"Could not open video: {video_path}")

    out = None
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise RuntimeError(f"Could not open video writer for: {output_path}")

        frame_count = 0
        start_time = time.time()

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Detect on a smaller image to speed up CPU processing.
            small_frame = cv2.resize(frame, (0, 0), fx=scale_factor, fy=scale_factor)
            gray = cv2.cvtColor(small_frame, cv2.COLOR_BGR2GRAY)

            faces = face_cascade.detectMultiScale(gray, 1.1, 5, minSize=(30, 30))

            for (x, y, w, h) in faces:
                # Scale coordinates back to the original frame size.
                x = int(x / scale_factor)
                y = int(y / scale_factor)
                w = int(w / scale_factor)
                h = int(h / scale_factor)

                # Clamp the box to the frame so the slice never overruns it.
                y_end = min(height, y + h)
                x_end = min(width, x + w)

                if y < y_end and x < x_end:
                    roi = frame[y:y_end, x:x_end]
                    roi = cv2.GaussianBlur(roi, (99, 99), 30)
                    frame[y:y_end, x:x_end] = roi

            out.write(frame)

            frame_count += 1
            if frame_count % 500 == 0:
                elapsed = time.time() - start_time
                fps_proc = frame_count / elapsed if elapsed > 0 else 0.0
                if total_frames > 0 and fps_proc > 0:
                    eta_min = max(total_frames - frame_count, 0) / fps_proc / 60
                    print(f"Processed {frame_count}/{total_frames} frames. Speed: {fps_proc:.2f} fps. ETA: {eta_min:.1f} min")
                else:
                    # Frame count is not reported for this input; an ETA
                    # would be meaningless (negative), so omit it.
                    print(f"Processed {frame_count} frames. Speed: {fps_proc:.2f} fps.")
    finally:
        # Release the handles even when a frame fails mid-way, so the output
        # file is finalized and the input is not left open.
        cap.release()
        if out is not None:
            out.release()

    print(f"Blurring complete: {output_path}")
    return output_path

def upload_to_blob(file_path, connection_string, container_name="colab-uploads"):
    """Upload a local file to Azure Blob Storage and return a read-only SAS URL.

    The container is created when it does not exist, and an existing blob with
    the same base name is overwritten. The SAS token grants read access and
    expires 24 hours after creation.

    Args:
        file_path: Local file to upload; its base name becomes the blob name.
        connection_string: Storage account connection string.
        container_name: Target container.

    Returns:
        The blob URL with the SAS token appended, or ``None`` if anything
        fails (the error is printed).
    """
    try:
        service = BlobServiceClient.from_connection_string(connection_string)
        container = service.get_container_client(container_name)
        if not container.exists():
            container.create_container()

        target_name = os.path.basename(file_path)
        blob = container.get_blob_client(target_name)
        with open(file_path, "rb") as stream:
            blob.upload_blob(stream, overwrite=True)

        expires_at = datetime.now(timezone.utc) + timedelta(hours=24)
        token = generate_blob_sas(
            account_name=blob.account_name,
            container_name=container_name,
            blob_name=target_name,
            # Reads the account key from the client's credential.
            account_key=service.credential.account_key,
            permission=BlobSasPermissions(read=True),
            expiry=expires_at,
        )
        return f"{blob.url}?{token}"
    except Exception as err:
        print(f"Blob upload error: {err}")
        return None

def submit_transcription_job(audio_url, speech_key, speech_region):
    """Submit a batch transcription job to the Azure Speech-to-Text v3.1 REST API.

    Args:
        audio_url: URL of the audio file, sent as the single ``contentUrls`` entry.
        speech_key: Subscription key sent in ``Ocp-Apim-Subscription-Key``.
        speech_region: Region used to build the API host name.

    Returns:
        The job's ``self`` URL when the service answers 201, otherwise ``None``
        (the failure is printed).
    """
    api_url = f"https://{speech_region}.api.cognitive.microsoft.com/speechtotext/v3.1/transcriptions"
    headers = {"Ocp-Apim-Subscription-Key": speech_key, "Content-Type": "application/json"}
    payload = {
        "displayName": f"colab_transcription_{int(time.time())}",
        "description": "De-identification POC Colab",
        "locale": "en-US",
        "contentUrls": [audio_url],
        "properties": {"wordLevelTimestampsEnabled": False, "punctuationMode": "DictatedAndAutomatic"}
    }
    try:
        # requests never times out by default; bound the call so a stalled
        # connection cannot hang the notebook cell.
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)
    except requests.RequestException as e:
        # Report network failures the same way as a rejected submission.
        print(f"Transcription submission failed: {e}")
        return None

    if response.status_code == 201:
        return response.json()["self"]
    print(f"Transcription submission failed: {response.text}")
    return None

def wait_for_transcript(job_url, speech_key, poll_interval=10, max_wait=None):
    """Poll a transcription job until it finishes and return its transcript text.

    Args:
        job_url: The job's ``self`` URL.
        speech_key: Subscription key sent in ``Ocp-Apim-Subscription-Key``.
        poll_interval: Seconds to sleep between status checks (default 10).
        max_wait: Optional upper bound, in seconds, on the total time spent
            polling. ``None`` (default) waits indefinitely.

    Returns:
        The ``display`` texts of ``combinedRecognizedPhrases`` joined with
        spaces, or ``None`` when the job fails, times out, has no
        transcription file, or a request/parse error occurs (errors are
        printed).
    """
    headers = {"Ocp-Apim-Subscription-Key": speech_key}
    request_timeout = 30  # seconds per HTTP call; requests has no default timeout
    deadline = None if max_wait is None else time.monotonic() + max_wait

    try:
        while True:
            response = requests.get(job_url, headers=headers, timeout=request_timeout)
            # An HTTP error body has no "status" field; surface the HTTP error
            # itself rather than failing with a KeyError.
            response.raise_for_status()
            job = response.json()
            status = job["status"]
            if status == "Succeeded":
                break
            if status == "Failed":
                print("Transcription job failed.")
                return None
            if deadline is not None and time.monotonic() >= deadline:
                print(f"Transcription job still '{status}' after {max_wait} seconds; giving up.")
                return None
            time.sleep(poll_interval)

        results_url = job["links"]["files"]
        listing = requests.get(results_url, headers=headers, timeout=request_timeout)
        listing.raise_for_status()
        for f in listing.json()["values"]:
            if f["kind"] == "Transcription":
                # The content URL is fetched without the subscription header,
                # exactly as the file listing provides it.
                content = requests.get(f["links"]["contentUrl"], timeout=request_timeout)
                content.raise_for_status()
                data = content.json()
                return " ".join(p["display"] for p in data["combinedRecognizedPhrases"])
        return None
    except (requests.RequestException, KeyError, ValueError) as e:
        print(f"Transcription polling error: {e!r}")
        return None

def redact_phi(text, endpoint):
    """De-identify ``text`` with the Azure Health De-identification service.

    The text is sent in chunks of at most 5000 characters. Chunks are cut just
    after whitespace where possible, so a word or name is not split across two
    requests.

    Args:
        text: Transcript to de-identify. Empty or ``None`` input is returned
            unchanged without contacting the service.
        endpoint: De-identification service endpoint URL.

    Returns:
        The concatenated ``output_text`` of every chunk.

        NOTE(review): on ANY error the ORIGINAL, un-redacted ``text`` is
        returned after printing the error (existing behavior, kept for
        callers). Callers must not treat the return value as guaranteed to be
        PHI-free.
    """
    if not text:
        return text
    try:
        credential = DefaultAzureCredential()
        client = DeidentificationClient(endpoint=endpoint, credential=credential)
        # Chunking for API limits
        chunk_size = 5000
        redacted_chunks = []
        for chunk in _split_on_whitespace(text, chunk_size):
            response = client.deidentify_text(body={"inputText": chunk})
            redacted_chunks.append(response.output_text)
        return "".join(redacted_chunks)
    except Exception as e:
        print(f"Redaction error/skipped: {e}")
        return text


def _split_on_whitespace(text, limit):
    """Split ``text`` into pieces of at most ``limit`` characters.

    Each piece ends just after the last space or newline inside its window
    when there is one; otherwise it is cut hard at ``limit``. Concatenating
    the pieces reproduces ``text`` exactly.
    """
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total:
            cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
            # Only honor a break that leaves a non-trivial piece; otherwise
            # fall back to the hard cut so the loop always advances.
            if cut > start:
                end = cut + 1
        pieces.append(text[start:end])
        start = end
    return pieces

In [None]:
# --- RUN PIPELINE ---
# Runs the steps in order: download -> face blur -> (optional) cloud
# transcription -> merge of the blurred video with the extracted audio.
print("=== Step 1: Download ===")
video_path, audio_path = download_video_and_extract_audio(YOUTUBE_URL)

if video_path:
    print("\n=== Step 2: Face Blurring (Optimized) ===")
    blurred_output = "colab_blurred_silent.mp4"
    blur_faces(video_path, blurred_output)

    print("\n=== Step 3: Transcription & De-ID ===")
    if SPEECH_KEY != "YOUR_SPEECH_KEY":
        sas_url = upload_to_blob(audio_path, STORAGE_CONN_STR)
        if sas_url:
            job = submit_transcription_job(sas_url, SPEECH_KEY, SPEECH_REGION)
            if job:
                transcript = wait_for_transcript(job, SPEECH_KEY)
                # wait_for_transcript returns None on failure; slicing None
                # would raise TypeError and abort the rest of the cell.
                if transcript is not None:
                    print(f"Transcript preview: {transcript[:100]}...")
                    # redacted = redact_phi(transcript, DEID_SERVICE_ENDPOINT)
                    # print(f"Redacted: {redacted[:100]}")
                else:
                    print("Transcription did not produce a transcript.")
    else:
        print("Skipping Cloud steps (Keys not set).")

    print("\n=== Step 4: Final Merge ===")
    final_output = "colab_final_deidentified.mp4"
    # NOTE(review): the audio merged here is the extracted audio file as-is;
    # transcript redaction above does not alter the audio track.
    merge = subprocess.run([
        'ffmpeg', '-y', '-i', blurred_output, '-i', audio_path,
        '-c:v', 'copy', '-c:a', 'aac', final_output
    ])
    if merge.returncode != 0:
        # Do not announce success or offer a download for a failed merge.
        print(f"Final merge failed (ffmpeg exit code {merge.returncode}).")
    else:
        print(f"DONE! Output: {final_output}")

        # Helper to download file in Colab
        try:
            from google.colab import files
        except ImportError:
            print("google.colab is not available; skipping browser download.")
        else:
            files.download(final_output)