In [1]:
import json
import random

# Movie and character pools (mix of Hollywood & Bollywood style names)
movie_titles = [
    "Inception", "The Dark Knight", "Interstellar", "Titanic", "Avengers",
    "3 Idiots", "Dilwale Dulhania Le Jayenge", "Sholay", "PK", "Zindagi Na Milegi Dobara",
    "The Matrix", "Gladiator", "Jurassic Park", "Harry Potter", "The Lord of the Rings",
    "Lagaan", "Chak De India", "Barfi", "Dangal", "Gully Boy",
    "Iron Man", "Captain America", "Thor", "Doctor Strange", "Black Panther",
    "Kuch Kuch Hota Hai", "Kabhi Khushi Kabhi Gham", "Kal Ho Naa Ho", "Bajrangi Bhaijaan", "Pathaan",
    "The Lion King", "Frozen", "Toy Story", "Finding Nemo", "Up",
    "Bahubali", "RRR", "KGF", "Pushpa", "Jawan",
    "Forrest Gump", "Shawshank Redemption", "Fight Club", "The Godfather", "Goodfellas",
    "Andaz Apna Apna", "Don", "Kabir Singh", "War", "Sanju"
]

characters = ["Raj", "Simran", "Amit", "Cobb", "Ariadne", "Joker", "Batman", "Rancho", "Farhan", "Raju"]

# Generation parameters, gathered in one place so a reader can tune them
# without hunting through the loop bodies.
SEED = 42                          # fixed seed -> identical JSON on every run
MAX_MOVIES = 50                    # upper bound on how many titles are used
SCENES_PER_MOVIE = 5
MIN_DIALOGUES, MAX_DIALOGUES = 2, 4
YEAR_RANGE = range(1990, 2025)     # 1990..2024 inclusive
LANGUAGES = ["English", "Hindi"]
OUTPUT_PATH = "movie_dataset.json"


def build_scene(title: str, scene_number: int, rng: random.Random) -> dict:
    """Build one placeholder scene record for ``title``.

    Args:
        title: Movie title, used in the description, dialogue text and image path.
        scene_number: 1-based scene index within the movie.
        rng: Random source used for the dialogue count and speaker choice.

    Returns:
        A dict with ``scene_id``, ``scene_description``, ``dialogues`` and
        ``scene_image`` keys. ``dialogues`` holds between ``MIN_DIALOGUES`` and
        ``MAX_DIALOGUES`` entries, each with a randomly chosen character.
    """
    dialogue_count = rng.randint(MIN_DIALOGUES, MAX_DIALOGUES)
    dialogues = [
        {
            "character": rng.choice(characters),
            "text": f"Dialogue line {line_no} from scene {scene_number} in {title}.",
        }
        for line_no in range(1, dialogue_count + 1)
    ]
    # Image names are the lower-cased title with spaces turned into underscores.
    image_stem = title.replace(" ", "_").lower()
    return {
        "scene_id": f"S{scene_number:03d}",
        "scene_description": f"Description of scene {scene_number} from {title}.",
        "dialogues": dialogues,
        "scene_image": f"images/{image_stem}_scene{scene_number}.jpg",
    }


def build_movie(index: int, title: str, rng: random.Random) -> dict:
    """Build one movie record with ``SCENES_PER_MOVIE`` scenes.

    Args:
        index: 1-based position of the movie, rendered as ``M001``, ``M002``, ...
        title: Movie title.
        rng: Random source used for the year, language and scene contents.

    Returns:
        A dict with ``movie_id``, ``movie_title``, ``year``, ``language`` and
        ``scenes`` keys. Year and language are random placeholders, not the
        real release data of the title.
    """
    return {
        "movie_id": f"M{index:03d}",
        "movie_title": title,
        "year": rng.choice(YEAR_RANGE),
        "language": rng.choice(LANGUAGES),
        "scenes": [
            build_scene(title, scene_number, rng)
            for scene_number in range(1, SCENES_PER_MOVIE + 1)
        ],
    }


def build_dataset(titles, seed: int = SEED, max_movies: int = MAX_MOVIES) -> dict:
    """Generate the full synthetic dataset for the first ``max_movies`` titles.

    A private ``random.Random(seed)`` is used instead of the module-level
    functions, so the result is reproducible and the global random state
    seen by other cells is left untouched.

    Args:
        titles: Sequence of movie titles.
        seed: Seed for the private random generator.
        max_movies: Maximum number of titles taken from the front of ``titles``.

    Returns:
        A dict with ``dataset_name`` and a ``movies`` list.
    """
    rng = random.Random(seed)
    return {
        "dataset_name": "Custom Multimodal Movie Dataset",
        "movies": [
            build_movie(index, title, rng)
            for index, title in enumerate(titles[:max_movies], start=1)
        ],
    }


# Generate dataset
dataset = build_dataset(movie_titles)

# Save JSON file
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps any non-ASCII text readable in the file.
    json.dump(dataset, f, indent=2, ensure_ascii=False)

# Report the counts actually generated rather than hard-coded numbers.
movie_count = len(dataset["movies"])
print(f"✅ {OUTPUT_PATH} created with {movie_count} movies × {SCENES_PER_MOVIE} scenes each")


✅ movie_dataset.json created with 50 movies × 5 scenes each
