In [2]:
# Install required packages
!pip install -q transformers accelerate qwen-vl-utils[decord] torch
!pip install -q flash-attn --no-build-isolation

print("✅ Installation complete!")

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
✅ Installation complete!


In [3]:
import torch
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

GPU Available: True
GPU Name: Tesla T4
GPU Memory: 14.74 GB


In [4]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

print("📥 Loading Qwen2.5-VL-3B model...")
print("This will take 3-5 minutes on first run (downloading ~6GB)")

# Load model
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Load processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

print("✅ Model loaded successfully!")
print(f"Model is on device: {model.device}")

`torch_dtype` is deprecated! Use `dtype` instead!


📥 Loading Qwen2.5-VL-3B model...
This will take 3-5 minutes on first run (downloading ~6GB)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

✅ Model loaded successfully!
Model is on device: cuda:0


In [7]:
# Download a better sample video (MP4 format)
!wget -q https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/360/Big_Buck_Bunny_360_10s_1MB.mp4 -O sample_video.mp4

print("✅ Sample video downloaded: sample_video.mp4")

# Check the file
import os
file_size = os.path.getsize("sample_video.mp4") / (1024 * 1024)
print(f"File size: {file_size:.2f} MB")

✅ Sample video downloaded: sample_video.mp4
File size: 0.95 MB


In [8]:
# Analyze the video with corrected path
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "sample_video.mp4",  # Changed: removed "file://"
            },
            {
                "type": "text",
                "text": "Analyze this video in detail. Describe: 1) What activity is happening? 2) What is the overall mood/emotion? 3) What is the setting (indoor/outdoor)? 4) What kind of background music would fit this video?"
            },
        ],
    }
]

# Prepare inputs
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Generate response
print("🎥 Analyzing video...")
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

print("\n" + "="*60)
print("📊 VIDEO ANALYSIS RESULTS:")
print("="*60)
print(output_text[0])
print("="*60)

🎥 Analyzing video...

📊 VIDEO ANALYSIS RESULTS:
In this video, we see a whimsical and enchanting scene set in a lush, vibrant forest. The central focus is on a large, animated tree that has sprouted a small, round house-like structure at its base. This house is nestled within a grassy mound, surrounded by various plants and flowers, creating a sense of magic and wonder.

The tree's trunk is thick and sturdy, with long, thin branches that extend outward, adding to its majestic appearance. The house itself is simple yet charming, with a small door and windows that allow light to filter through. The surrounding environment is rich with greenery, including tall trees with dense foliage, bushes with colorful flowers, and a variety of other plants that add depth and texture to the scene.

The mood of the video is serene and magical, evoking a sense of peace and wonder. The bright colors and detailed animation create a feeling of enchantment, as if the viewer has stumbled upon a hidden, magic

In [9]:
def analyze_video_for_music(video_path, user_prompt=None):
    """
    Analyzes video and combines with user prompt to generate music description

    Args:
        video_path: Path to video file
        user_prompt: Optional user preference (e.g., "epic action music")

    Returns:
        dict with video analysis and music recommendation
    """

    # Build the prompt
    base_prompt = """Analyze this video for background music generation. Provide:
1. Scene description (what's happening)
2. Overall mood and emotion
3. Setting (indoor/outdoor, time of day if visible)
4. Pace/energy level (slow, medium, fast)
5. Recommended music style, instruments, and tempo"""

    if user_prompt:
        base_prompt += f"\n\nUser preference: {user_prompt}"

    # Prepare messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": base_prompt},
            ],
        }
    ]

    # Process
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return {
        "video_analysis": output_text[0],
        "user_prompt": user_prompt
    }

# Test it!
result = analyze_video_for_music("sample_video.mp4", user_prompt="Make it more upbeat and fun")
print("="*60)
print("RESULT:")
print("="*60)
print(result["video_analysis"])

RESULT:
1. **Scene description**: The video depicts a whimsical forest scene with a large, animated treehouse perched on a grassy mound. The treehouse has a round entrance and is surrounded by lush greenery, including moss-covered rocks and vibrant flowers. The treehouse is adorned with colorful decorations, such as flowers and small ornaments, adding to its enchanting appearance. The background features dense foliage, including tall trees and bushes, creating a serene and magical atmosphere.

2. **Overall mood and emotion**: The overall mood of the video is cheerful and enchanting, evoking a sense of wonder and magic. The bright colors and the playful design of the treehouse contribute to a lighthearted and joyful atmosphere.

3. **Setting**: The setting is an outdoor forest scene during what appears to be daytime, as indicated by the bright lighting and clear visibility of the surroundings.

4. **Pace/energy level**: The pace of the video is slow and serene, with a gentle and calming

In [11]:
import re

def extract_music_params_v2(analysis_text):
    """Extract structured music parameters from analysis"""
    params = {
        "mood": "",
        "tempo_bpm": "",
        "instruments": [],
        "style": "",
        "energy": ""
    }

    # Extract mood
    mood_match = re.search(r'mood.*?is\s+([^.]+)', analysis_text, re.IGNORECASE)
    if mood_match:
        params["mood"] = mood_match.group(1).strip()

    # Extract tempo (BPM)
    tempo_match = re.search(r'(\d+)-(\d+)\s+beats per minute', analysis_text, re.IGNORECASE)
    if tempo_match:
        params["tempo_bpm"] = f"{tempo_match.group(1)}-{tempo_match.group(2)}"

    # Extract instruments
    instruments_match = re.search(r'[Ii]nstruments?\s+(?:such as|like|including)\s+([^.]+)', analysis_text)
    if instruments_match:
        instruments_text = instruments_match.group(1)
        # Split by commas and "or"/"and"
        instruments = re.split(r',|\s+or\s+|\s+and\s+', instruments_text)
        params["instruments"] = [inst.strip() for inst in instruments if inst.strip()]

    # Extract style
    style_match = re.search(r'music style.*?([^.]+?)(?:would|could)', analysis_text, re.IGNORECASE)
    if style_match:
        params["style"] = style_match.group(1).strip()

    # Extract energy
    energy_match = re.search(r'[Pp]ace.*?is\s+([^,]+)', analysis_text)
    if energy_match:
        params["energy"] = energy_match.group(1).strip()

    # Keep full description as backup
    params["description"] = analysis_text

    return params

# Test it!
result = analyze_video_for_music("sample_video.mp4")
music_params = extract_music_params_v2(result["video_analysis"])

print("📝 EXTRACTED MUSIC PARAMETERS:")
print("="*60)
for key, value in music_params.items():
    if key != "description":  # Don't print full description
        print(f"{key.upper()}: {value}")

📝 EXTRACTED MUSIC PARAMETERS:
MOOD: serene and playful
TEMPO_BPM: 60-80
INSTRUMENTS: ['acoustic guitar', 'flute', 'xylophone could be used to create a soothing', 'enchanting atmosphere']
STYLE: , instruments, and tempo**: For this video, a gentle and whimsical folk or children's music style
ENERGY: slow and calm
