# Prepare T2V Dataset

Split `prompts/MovieGenVideoBench.txt` into individual `.txt` files in the `data/t2v/` folder.
Each non-empty line of the input becomes one file (`1.txt`, `2.txt`, …) containing a single prompt, as required by the training pipeline.

In [None]:
import os
from pathlib import Path

In [None]:
# Configuration: both paths are relative, so they resolve against the
# directory the notebook is run from.
# Text file holding the prompts, one per line.
input_file = "prompts/MovieGenVideoBench.txt"
# Folder that receives the individual per-prompt .txt files.
output_dir = "data/t2v"

In [None]:
# Make sure the destination folder exists: missing parent folders are created
# as needed, and a folder that is already there is left untouched.
Path(output_dir).mkdir(parents=True, exist_ok=True)
print(f"Output directory: {output_dir}")

In [None]:
# Load the prompt list: one prompt per line, with surrounding whitespace
# trimmed and blank lines dropped.
with open(input_file, encoding="utf-8") as source:
    # Strip each line once, then keep only the non-empty results.
    trimmed_lines = (raw_line.strip() for raw_line in source)
    prompts = [text for text in trimmed_lines if text]

print(f"Total prompts: {len(prompts)}")

In [None]:
# Emit one UTF-8 file per prompt, named after its 1-based position in the
# list: 1.txt, 2.txt, ...
target_dir = Path(output_dir)
for index, text in enumerate(prompts, start=1):
    (target_dir / f"{index}.txt").write_text(text, encoding="utf-8")

print(f"Created {len(prompts)} prompt files in {output_dir}/")

In [None]:
# Verify by reading a few samples: the first two files and the last one.
# Only indices in 1..len(prompts) are sampled, and each at most once, so a
# prompt list with fewer than three entries neither prints a file twice nor
# tries to open a file that was never written (e.g. 2.txt or 0.txt).
print("\n--- Sample files ---")
sample_indices = sorted(
    {i for i in (1, 2, len(prompts)) if 1 <= i <= len(prompts)}
)
if not sample_indices:
    print("\n(no prompt files to sample)")
for i in sample_indices:
    filepath = os.path.join(output_dir, f"{i}.txt")
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()
    print(f"\n{i}.txt ({len(content)} chars):")
    # Long prompts are cut to their first 100 characters for display.
    preview = (content[:100] + "...") if len(content) > 100 else content
    print(preview)