In [37]:
import logging
import re
from logging import warn

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [42]:
# === LOAD MODEL ===
# One checkpoint identifier is shared by the tokenizer and the model so the two
# always come from the same release.
model_name = "Qwen/Qwen2.5-14B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# dtype and device placement are both left to the library's "auto" setting.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [51]:
# === LOAD AND CLEAN TEXT ===
# Read the per-TR transcript table and turn it into one prompt line per TR.
csv_path = "../data/raw/stimuli/transcripts/movie10/bourne/movie10_bourne01.tsv"
column_name = "text_per_tr"
# Only the first 50 rows are used. .copy() makes `df` an independent frame, so the
# translated column added to it later never touches the parsed table it was
# sliced from.
df = pd.read_csv(csv_path, sep="\t").iloc[:50].copy()
# Missing cells become a single-space placeholder line. Every other cell is
# whitespace-normalised: leading/trailing blanks are dropped and internal runs of
# whitespace (including embedded line breaks) collapse to one space, so a single
# TR can never spill over several prompt lines.
tr_lines = [" " if pd.isna(t) else " ".join(str(t).split()) for t in df[column_name]]  # cleaned TRs
# Newline is the TR separator: line i of the prompt corresponds to row i of `df`.
joined_input = "\n".join(tr_lines)

In [52]:
# === BUILD PROMPT ===
# The conversation is: system instruction, one worked example (user request +
# assistant answer), then the real request carrying `joined_input`.
# Each piece is named first and the chat list is assembled at the end.
system_prompt = (
    "You are Qwen, a helpful multilingual assistant. "
    "You translate English transcripts into French, preserving structure and formatting exactly."
)

# Task rules that open the worked example.
example_instructions = (
    "You are translating fMRI experiment transcripts from English to French.\n"
    "- Each line represents a 1.49-second temporal resolution (TR).\n"
    "- Translate **each line separately** and preserve the **exact number and order of lines**.\n"
    "- Leave empty lines empty.\n"
    "- Do not add or remove lines.\n\n"
    "Here is the transcript:\n"
)

# Example transcript: six empty lines, then dialogue interleaved with empty lines.
example_english = (
    "\n\n\n\n\n\n"
    "This is not a drill, soldier.\n"
    "It's not a drill,\n"
    "soldier. We clear on\n"
    "that?\n"
    "This is a\n"
    "live project.\n"
    "You're a go.\n"
    "\n"
    "Training is over.\n"
    "\n"
    "\n"
)

# Expected answer for the example, with the same line layout as the English text.
example_french = (
    "\n\n\n\n\n\n"
    "Ceci n'est pas une simulation, soldat.\n"
    "Ce n'est pas une simulation,\n"
    "soldat. C'est clair ?\n"
    "Tu comprends ?\n"
    "C'est un\n"
    "projet réel.\n"
    "Tu as le feu vert.\n"
    "\n"
    "La formation est terminée.\n"
    "\n"
    "\n"
)

# The real request: rules restated briefly, followed by the transcript itself.
final_request = (
    "Now translate the following English transcript into French, following the same rules.\n"
    "Each line is one TR. Preserve line breaks exactly. Do not add or remove any lines.\n"
    "Here is the transcript:\n"
    f"{joined_input}"
)

messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=example_instructions + example_english + "Translate to French:"),
    dict(role="assistant", content=example_french),
    dict(role="user", content=final_request),
]

In [None]:
# === TOKENIZE & GENERATE ===
# Render the chat messages into a single prompt string (ending with the
# generation prompt for the assistant turn), then tokenize it on the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# do_sample=False disables sampling; at most 2000 new tokens are produced.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=2000,
    do_sample=False,
)

# === EXTRACT RESPONSE ===
# Drop the first len(input_ids) tokens of every output sequence so that only the
# newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# === POSTPROCESS ===
# Do NOT strip the response as a whole: blank lines at the start or end are
# silent TRs, and removing them would shift every later translation onto the
# wrong row. Strip each line individually instead (mirrors the input cleaning).
expected_count = len(tr_lines)
translated_trs = [line.strip() for line in response.split("\n")]
# A trailing newline in the response yields surplus empty entries at the end;
# remove only the surplus so genuine trailing silent TRs are kept.
while len(translated_trs) > expected_count and translated_trs[-1] == "":
    translated_trs.pop()

# === VALIDATION ===
if len(translated_trs) != expected_count:
    # On a mismatch the line-to-TR correspondence is unreliable from the first
    # merged/split line onwards; inspect `translated_trs` before using the column.
    logging.warning(f"TR count mismatch: input={expected_count}, output={len(translated_trs)}")

# === RESTORE TO CSV FORMAT ===
# Assigning a list whose length differs from the frame raises ValueError, which
# would discard the generation after the warning above. Pad with empty strings or
# truncate so the column always fits; `translated_trs` itself is left untouched.
n_rows = len(df)
df["translated_tr"] = (translated_trs + [""] * n_rows)[:n_rows]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
# Show the translated lines; as the cell's last expression, the list is rendered as output.
translated_trs