In [None]:
%pip install transformers
%pip install accelerate>=0.26.0
%pip install --upgrade jinja2

In [None]:
import os

# Single source of truth for where model and tokenizer files are cached.
CACHE_DIR = "/tmp/huggingface"

# The Hugging Face libraries resolve their cache locations when they are first
# imported, so these variables must be set BEFORE importing transformers;
# assigning them afterwards has no effect on an already-imported library.
os.environ["HF_HOME"] = CACHE_DIR
# Legacy variable read by older transformers releases; newer releases may warn
# that it is deprecated in favour of HF_HOME.
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct-1M"

# torch_dtype="auto" / device_map="auto" defer dtype selection and device
# placement to the library; cache_dir pins the download location explicitly.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    cache_dir=CACHE_DIR,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [None]:
# Read the script, trimming surrounding whitespace (including the newline)
# from every line.
with open("tt0032138.txt", "r", encoding="utf-8") as script_file:
    stripped_lines = [raw_line.strip() for raw_line in script_file]

# Split the lines into consecutive groups of 100; the final group holds
# whatever remains and may be shorter. An empty file yields an empty list.
chunks = [
    stripped_lines[start:start + 100]
    for start in range(0, len(stripped_lines), 100)
]

In [None]:
# Sanity check: report how many chunks were built and preview the start of the
# first one, instead of dumping the entire script into the notebook output.
print(f"{len(chunks)} chunks")
chunks[0][:10] if chunks else []

In [None]:

# Ask the model to extract the spoken dialogue from every chunk of the script
# and write the responses, one after another, to output.txt.

# Loop-invariant prompt pieces. The instructions are assembled from adjacent
# string literals so that no source-code indentation or stray line breaks end
# up inside the prompt sent to the model.
EXTRACTION_INSTRUCTIONS = (
    "You are given a passage from a movie script. Return all spoken dialogue. "
    "This may be in the form of explicit dialogue, or as a person speaking in "
    "stage directions or in any other sections of the script. Do not return any "
    "additional content aside from the explicit lines from the transcript. "
    "Do not number the output."
)
SYSTEM_MESSAGE = {
    "role": "system",
    "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
}
# Lower bound on the generation budget (the value previously hard-coded).
MIN_NEW_TOKENS = 512

with open("output.txt", "w", encoding="utf-8") as file:
    for chunk in chunks:
        passage = "\n".join(chunk)
        # A blank line separates the instructions from the script passage.
        prompt = f"{EXTRACTION_INSTRUCTIONS}\n\n{passage}"
        messages = [SYSTEM_MESSAGE, {"role": "user", "content": prompt}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # The requested output is a subset of the passage, so the prompt length
        # is a safe upper bound for it; a fixed 512-token cap could silently
        # truncate the dialogue of a long chunk.
        prompt_length = model_inputs.input_ids.shape[1]
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max(MIN_NEW_TOKENS, prompt_length),
        )
        # generate() returns prompt + continuation; keep only the continuation.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        file.write(response + "\n")
        # Flush per chunk so finished work survives a kernel crash mid-run.
        file.flush()


In [None]:
# `response` is reassigned on every iteration of the generation loop, so this
# shows only the output for the last chunk; the full results are in output.txt.
print(response)