# CrowdFace Demo: Neural-Adaptive Crowd Segmentation with Contextual Pixel-Space Advertisement Integration

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BlackBoyZeus/CrowdFace/blob/main/CrowdFace_Demo.ipynb)

This notebook demonstrates the CrowdFace system, which combines state-of-the-art segmentation models with contextual advertisement placement.

## Setup and Dependencies

First, let's install the required dependencies.

In [None]:
!pip install torch torchvision opencv-python transformers diffusers accelerate safetensors huggingface_hub matplotlib

In [None]:
# Fetch the helper repositories and import the libraries used by the rest of
# the notebook.
import os
import subprocess
import sys

# Clone necessary repositories.
# A checkout that already exists is reused, so re-running this cell does not
# fail with "destination path already exists".
for _repo_url in (
    "https://github.com/BlackBoyZeus/CrowdFace.git",
    "https://github.com/PeterL1n/RobustVideoMatting.git",
):
    # "https://.../Name.git" -> "Name", the directory `git clone` creates.
    _repo_dir = os.path.splitext(os.path.basename(_repo_url))[0]

    if not os.path.isdir(_repo_dir):
        # Best effort: a failed clone only warns, it does not stop the
        # notebook (the RVM loading cell already falls back to a placeholder
        # when its import fails).
        try:
            _clone = subprocess.run(["git", "clone", _repo_url], check=False)
            if _clone.returncode != 0:
                print(f"Warning: could not clone {_repo_url} "
                      f"(git exited with {_clone.returncode})")
        except OSError as exc:
            # Raised when the `git` executable itself cannot be started.
            print(f"Warning: could not run git to clone {_repo_url}: {exc}")

    # Add to Python path (once, even if the cell is executed repeatedly).
    if _repo_dir not in sys.path:
        sys.path.append(_repo_dir)

# Import basic libraries
import torch
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# Set up Hugging Face token for model access.
#
# Lookup order: Colab secrets -> HUGGINGFACE_TOKEN environment variable ->
# interactive prompt. The result is stored in HUGGINGFACE_TOKEN (None when
# nothing was provided).
import os
from getpass import getpass

HUGGINGFACE_TOKEN = None

# 1) Colab secrets (only available when running inside Google Colab).
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    try:
        # Single lookup; an empty value is treated as "not set".
        HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN') or None
    except Exception as exc:
        # NOTE(review): presumably raised when the secret is missing or the
        # notebook was not granted access to it - confirm against Colab docs.
        print(f"Could not read token from Colab secrets: {exc}")
    if HUGGINGFACE_TOKEN:
        print("Using token from Colab secrets")

# 2) Environment variable. Checked whenever step 1 produced nothing, not only
#    when the Colab import failed.
if not HUGGINGFACE_TOKEN:
    HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN') or None
    if HUGGINGFACE_TOKEN:
        print("Using token from environment variables")

# 3) If no token is found, prompt the user. getpass() keeps the secret out of
#    the cell output, which would otherwise be saved with the notebook.
if not HUGGINGFACE_TOKEN:
    print("No Hugging Face token found. Please enter your token below.")
    HUGGINGFACE_TOKEN = getpass("Enter your Hugging Face token: ").strip() or None

In [None]:
# Define utility functions for the demo
def load_video(video_path, max_frames=100):
    """Read the leading frames of a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Maximum number of frames to read. ``None`` reads until
            the capture stops returning frames.

    Returns:
        A list of frames exactly as returned by ``cap.read()``. The list is
        empty when the capture cannot be opened or yields no frames.
    """
    limit = float("inf") if max_frames is None else max_frames
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened() and len(frames) < limit:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        # Release the capture even if reading raised.
        cap.release()

    print(f"Loaded {len(frames)} frames from {video_path}")
    return frames


def create_sample_ad():
    """Create a sample advertisement image.

    Returns:
        A 300x500 four-channel ``uint8`` array filled with 255, with the
        text "SAMPLE AD" drawn on it.
    """
    ad_img = np.full((300, 500, 4), 255, dtype=np.uint8)
    # Add some text
    cv2.putText(ad_img, "SAMPLE AD", (50, 150), cv2.FONT_HERSHEY_SIMPLEX, 2,
                (0, 0, 255, 255), 5)
    return ad_img


def display_frames(frames, num_frames=5):
    """Display an evenly spaced sample of video frames in one row.

    Args:
        frames: Sequence of frames; each is converted with
            ``cv2.COLOR_BGR2RGB`` before being shown.
        num_frames: Desired number of panels. Capped at ``len(frames)`` so a
            short clip is not padded with repeats of its last frame.

    Returns:
        None. Prints "No frames to display" when there is nothing to show.
    """
    # len() rather than truthiness so an ndarray of frames is accepted too.
    panel_count = min(num_frames, len(frames))
    if panel_count < 1:
        print("No frames to display")
        return

    step = len(frames) // panel_count  # >= 1 because panel_count <= len(frames)

    # squeeze=False always yields a 2-D array of axes, so a single panel can
    # be indexed the same way as several.
    fig, axes = plt.subplots(1, panel_count, figsize=(20, 4), squeeze=False)

    for panel, ax in enumerate(axes[0]):
        idx = panel * step
        ax.imshow(cv2.cvtColor(frames[idx], cv2.COLOR_BGR2RGB))
        ax.set_title(f"Frame {idx}")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Load SAM2 model for segmentation
from transformers import SamModel, SamProcessor

print("Loading SAM2 model...")
model_id = "facebook/sam2"
# NOTE(review): SamModel / SamProcessor are loaded with the id "facebook/sam2".
# Confirm this checkpoint exists and matches these classes; if loading fails,
# the handler below sets both objects to None.

# Any failure while loading is reported and replaced by placeholders so the
# rest of the notebook can still run.
try:
    sam_processor = SamProcessor.from_pretrained(
        model_id,
        token=HUGGINGFACE_TOKEN,
    )
    sam_model = SamModel.from_pretrained(
        model_id,
        token=HUGGINGFACE_TOKEN,
    )

    # Prefer the GPU when one is visible to torch.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sam_model = sam_model.to(device)
    print(f"SAM2 model loaded successfully on {device}")
except Exception as e:
    print(f"Error loading SAM2 model: {e}")
    print("Using a placeholder for demonstration purposes")
    sam_model = None
    sam_processor = None

In [None]:
# Load RVM model for video matting.
#
# On success `rvm_model` is a MattingNetwork in eval mode on `device`; on any
# failure (missing repository, download error, bad weights) it is None.
RVM_WEIGHTS_PATH = 'rvm_mobilenetv3.pth'
RVM_WEIGHTS_URL = (
    'https://github.com/PeterL1n/RobustVideoMatting/releases/download/'
    'v1.0.0/rvm_mobilenetv3.pth'
)

try:
    # `model` is expected to come from the RobustVideoMatting checkout that
    # was added to sys.path earlier in the notebook.
    from model import MattingNetwork

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load RVM model
    rvm_model = MattingNetwork('mobilenetv3').eval().to(device)

    # Download RVM weights if needed. torch.hub is used instead of a `!wget`
    # shell magic so the cell does not depend on IPython or on a wget binary;
    # a failed download raises and is handled by the except branch below.
    if not os.path.exists(RVM_WEIGHTS_PATH):
        torch.hub.download_url_to_file(RVM_WEIGHTS_URL, RVM_WEIGHTS_PATH)

    # Load weights
    rvm_model.load_state_dict(torch.load(RVM_WEIGHTS_PATH, map_location=device))
    print("RVM model loaded successfully")
except Exception as e:
    print(f"Error loading RVM model: {e}")
    print("Using a placeholder for demonstration purposes")
    rvm_model = None

In [None]:
# Define a simplified CrowdFace pipeline for demonstration
class SimpleCrowdFacePipeline:
    """Segment a person in each frame and overlay an ad image beside them.

    The pipeline works without a segmentation model: when ``sam_model`` or
    ``sam_processor`` is ``None`` a centered ellipse is used as the mask.
    ``rvm_model`` is stored but not read by any method of this class.
    """

    def __init__(self, sam_model, sam_processor, rvm_model=None):
        """Store the models and pick "cuda" when available, else "cpu"."""
        self.sam_model = sam_model
        self.sam_processor = sam_processor
        self.rvm_model = rvm_model
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def segment_people(self, frame):
        """Return a single-channel uint8 mask (0 or 255) for ``frame``.

        Args:
            frame: Image array; a 3-channel array is treated as BGR and
                converted to RGB before being given to the processor.

        Returns:
            An array of shape ``frame.shape[:2]``. Without a model this is a
            filled ellipse centered in the frame; with a model it is the
            predicted mask covering the most pixels (all zeros if none does).
        """
        height, width = frame.shape[:2]

        if self.sam_model is None or self.sam_processor is None:
            # Create a simple placeholder mask for demonstration: an ellipse
            # standing in for a "person".
            mask = np.zeros((height, width), dtype=np.uint8)
            cv2.ellipse(mask,
                        (width // 2, height // 2),
                        (width // 4, height // 2),
                        0, 0, 360, 255, -1)
            return mask

        # Convert frame to RGB if it's in BGR format. The ndim check keeps a
        # 2-D (grayscale) frame from failing on shape[2].
        if isinstance(frame, np.ndarray) and frame.ndim == 3 and frame.shape[2] == 3:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        else:
            rgb_frame = frame

        # Process the image with SAM
        inputs = self.sam_processor(rgb_frame, return_tensors="pt").to(self.device)

        # Generate automatic mask predictions
        with torch.no_grad():
            outputs = self.sam_model(**inputs)

        # Get the predicted masks, resized back to the original image size
        masks = self.sam_processor.image_processor.post_process_masks(
            outputs.pred_masks.cpu(),
            inputs["original_sizes"].cpu(),
            inputs["reshaped_input_sizes"].cpu(),
        )

        combined_mask = np.zeros((height, width), dtype=np.uint8)
        if len(masks) == 0:
            return combined_mask

        # NOTE(review): the per-image entry is presumably shaped
        # (prompts, masks_per_prompt, H, W) - confirm against the
        # transformers SAM docs. Flattening every leading axis makes each
        # candidate a 2-D mask whatever the leading layout is; iterating the
        # first axis only would hand a stacked, non-2-D array to the contour
        # search.
        stack = np.asarray(masks[0])
        if stack.ndim < 2 or stack.size == 0:
            return combined_mask
        stack = stack.reshape(-1, stack.shape[-2], stack.shape[-1])

        # Take the largest mask as a person (simplified approach)
        areas = stack.reshape(stack.shape[0], -1).sum(axis=1)
        best = int(areas.argmax())
        if areas[best] > 0:
            combined_mask = stack[best].astype(np.uint8) * 255

        return combined_mask

    def find_ad_placement(self, frame, mask):
        """Pick an ``(x, y)`` top-left position for the ad.

        Args:
            frame: Frame whose size bounds the placement.
            mask: Single-channel mask; values above 128 count as foreground.

        Returns:
            A position 20 px to the right of the largest foreground contour's
            bounding box, kept at least 100 px from the right edge and never
            negative; or a center-right default when there is no contour.
        """
        frame_h, frame_w = frame.shape[:2]
        binary_mask = (mask > 128).astype(np.uint8)

        # [-2] selects the contour list on both OpenCV 3 (3 return values)
        # and OpenCV 4 (2 return values).
        contours = cv2.findContours(
            binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]

        if len(contours) == 0:
            # Default to center-right if no contours found
            return (frame_w * 3 // 4, frame_h // 2)

        largest_contour = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest_contour)

        # Default placement to the right of the person; max(0, ...) guards
        # frames narrower than the 100 px margin.
        ad_x = max(0, min(x + w + 20, frame_w - 100))
        ad_y = y

        return (ad_x, ad_y)

    @staticmethod
    def _as_bgr_array(ad_image):
        """Return ``ad_image`` as a BGR or BGRA array with a channel axis."""
        if isinstance(ad_image, np.ndarray):
            # Arrays are assumed to already use OpenCV channel order.
            arr = ad_image
        elif isinstance(ad_image, Image.Image):
            arr = np.array(ad_image)
            # PIL uses RGB(A) order; swap to OpenCV's BGR(A).
            if arr.ndim == 3 and arr.shape[2] == 3:
                arr = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
            elif arr.ndim == 3 and arr.shape[2] == 4:
                arr = cv2.cvtColor(arr, cv2.COLOR_RGBA2BGRA)
        else:
            arr = np.asarray(ad_image)

        if arr.ndim == 2:
            # Grayscale input: expand so channel indexing below is valid.
            arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2BGR)
        return arr

    def place_ad(self, frame, ad_image, position, scale=0.3):
        """Overlay ``ad_image`` on a copy of ``frame``.

        Args:
            frame: Target frame; it is not modified.
            ad_image: Ad as an array (BGR/BGRA/grayscale) or a PIL image.
            position: Requested ``(x, y)`` of the ad's top-left corner. It is
                clamped so the ad lies fully inside the frame.
            scale: Ad height as a fraction of the frame height; the width
                follows the ad's aspect ratio. The ad is shrunk further if it
                would still exceed the frame.

        Returns:
            A new frame with the ad alpha-blended (4-channel ad) or pasted
            (otherwise). If the target size is below one pixel the copy is
            returned unchanged.
        """
        ad_image = self._as_bgr_array(ad_image)
        frame_h, frame_w = frame.shape[:2]

        # Target ad size
        ad_height = int(frame_h * scale)
        ad_width = int(ad_image.shape[1] * (ad_height / ad_image.shape[0]))

        # Keep the ad inside the frame, preserving its aspect ratio.
        if ad_height > frame_h:
            ad_width = int(ad_width * frame_h / ad_height)
            ad_height = frame_h
        if ad_width > frame_w:
            ad_height = int(ad_height * frame_w / ad_width)
            ad_width = frame_w

        # Create a copy of the frame
        result = frame.copy()
        if ad_width < 1 or ad_height < 1:
            return result

        # Resize ad image (skipped when it already has the target size)
        if ad_image.shape[:2] != (ad_height, ad_width):
            ad_resized = cv2.resize(ad_image, (ad_width, ad_height))
        else:
            ad_resized = ad_image

        # Clamp on both sides: a negative start index would wrap around in
        # the slice below and produce a region of the wrong shape.
        x, y = position
        x = min(max(int(x), 0), frame_w - ad_width)
        y = min(max(int(y), 0), frame_h - ad_height)

        # View into `result`; writing to it updates the output frame.
        roi = result[y:y + ad_height, x:x + ad_width]

        if ad_resized.shape[2] == 4:
            # Alpha in [0, 1], kept 3-D so it broadcasts over the channels.
            alpha = ad_resized[:, :, 3:4] / 255.0
            color = ad_resized[:, :, :3]
            blended = (1.0 - alpha) * roi + alpha * color
            roi[...] = blended.astype(result.dtype)
        else:
            # Simple overlay without alpha blending
            roi[...] = ad_resized

        return result

    def process_video(self, frames, ad_image, output_path=None,
                      display_results=True, fps=30):
        """Place the ad on every frame, optionally showing and saving them.

        Args:
            frames: Sequence of frames.
            ad_image: Ad passed on to ``place_ad``.
            output_path: When truthy, the result is written there with the
                "mp4v" codec.
            display_results: When true, a sample is shown via
                ``display_frames``.
            fps: Frame rate of the written video.

        Returns:
            The list of processed frames (empty for empty input).
        """
        # len() rather than truthiness so an ndarray of frames is accepted.
        if len(frames) == 0:
            print("No frames to process")
            return []

        results = []
        for i, frame in enumerate(tqdm(frames, desc="Processing frames")):
            # Every 10 frames, re-detect people and ad placement
            if i % 10 == 0:
                mask = self.segment_people(frame)
                ad_position = self.find_ad_placement(frame, mask)

            # Place the ad
            results.append(self.place_ad(frame, ad_image, ad_position))

        # Display results
        if display_results:
            display_frames(results)

        # Save video if output path is provided
        if output_path:
            height, width = results[0].shape[:2]
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
            try:
                if out.isOpened():
                    for frame in results:
                        out.write(frame)
                    print(f"Video saved to {output_path}")
                else:
                    print(f"Could not open {output_path} for writing")
            finally:
                # Release the writer even if a write raised.
                out.release()

        return results

In [None]:
# Initialize the CrowdFace pipeline.
# Either model may be None at this point if its loading cell fell back to a
# placeholder.
pipeline = SimpleCrowdFacePipeline(sam_model, sam_processor, rvm_model=rvm_model)
print("CrowdFace pipeline initialized")

In [None]:
# Download a sample video if needed, then load and preview its first frames.
import os
import urllib.request

sample_video_path = "sample_video.mp4"
sample_video_url = "https://pixabay.com/videos/download/video-41758_source.mp4?attachment"

# A zero-byte file (e.g. left behind by an earlier failed download) is not
# treated as an existing video.
if os.path.exists(sample_video_path) and os.path.getsize(sample_video_path) > 0:
    print(f"Using existing sample video: {sample_video_path}")
else:
    try:
        urllib.request.urlretrieve(sample_video_url, sample_video_path)
        print(f"Downloaded sample video to {sample_video_path}")
    except OSError as exc:
        # urllib's URLError / HTTPError derive from OSError. Remove any
        # partial file so the next run retries instead of reusing it.
        if os.path.exists(sample_video_path):
            os.remove(sample_video_path)
        print(f"Could not download sample video: {exc}")
        print(f"Place a video at {sample_video_path} and re-run this cell.")

# Load video frames
frames = load_video(sample_video_path, max_frames=50)  # Limit to 50 frames for faster processing

# Display sample frames
display_frames(frames)

In [None]:
# Create or upload an ad image.
#
# In Colab the user is asked to upload a file; everywhere else, or when the
# upload is cancelled or cannot be decoded, a generated sample ad is used.
ad_image = None
try:
    from google.colab import files
    print("Upload an ad image:")
    uploaded = files.upload()
    # An empty mapping (nothing uploaded) yields None here.
    ad_path = next(iter(uploaded), None)
    if ad_path is not None:
        # cv2.imread signals failure by returning None rather than raising.
        ad_image = cv2.imread(ad_path, cv2.IMREAD_UNCHANGED)
        if ad_image is None:
            print(f"Could not read {ad_path} as an image")
        else:
            print(f"Uploaded ad image: {ad_path}")
except ImportError:
    # Not running in Colab: fall through to the sample ad.
    pass
except Exception as exc:
    print(f"Ad upload failed: {exc}")

if ad_image is None:
    # Create a sample ad
    ad_image = create_sample_ad()
    cv2.imwrite("sample_ad.png", ad_image)
    print("Created a sample ad image")
elif ad_image.ndim == 2:
    # Grayscale files load as 2-D arrays; expand to BGR so the channel
    # checks below (and later compositing) can index shape[2].
    ad_image = cv2.cvtColor(ad_image, cv2.COLOR_GRAY2BGR)

# Display the ad image
plt.figure(figsize=(5, 3))
if ad_image.shape[2] == 4:  # With alpha channel
    plt.imshow(cv2.cvtColor(ad_image, cv2.COLOR_BGRA2RGBA))
else:
    plt.imshow(cv2.cvtColor(ad_image, cv2.COLOR_BGR2RGB))
plt.title("Advertisement Image")
plt.axis('off')
plt.show()

In [None]:
# Process the video with ad placement: run the pipeline over the loaded
# frames, show a sample of the result and write it to `output_path`.
output_path = "output_video.mp4"
processed_frames = pipeline.process_video(
    frames, ad_image, output_path=output_path, display_results=True
)

In [None]:
# Display the output video inline. The Video object must stay the last
# expression of the cell so the notebook renders it.
from IPython.display import Video

Video(output_path)

In [None]:
# Download the processed video.
# In Colab this triggers a browser download; when google.colab cannot be
# imported, only the local path is reported.
try:
    from google.colab import files

    files.download(output_path)
    print(f"Download initiated for {output_path}")
except ImportError:
    print(f"Video saved locally at {output_path}")

## Conclusion

In this notebook, we've demonstrated a simplified version of the CrowdFace system, which:

1. Uses SAM2 for crowd segmentation (or a placeholder if the model isn't available)
2. Finds optimal ad placement locations based on the segmentation
3. Places ads with proper alpha blending
4. Processes videos frame by frame

The full CrowdFace system includes additional components:

- RVM (Robust Video Matting) for high-quality alpha matte generation
- BAGEL (ByteDance Ad Generation and Embedding Library) for intelligent scene understanding and ad placement
- More sophisticated ad optimization based on scene context

This demo provides a practical implementation that works even without access to all the advanced models, making it easy to test and experiment with the core functionality.