# Debug Chinese Translation Truncation Issue

**For Google Colab:**
1. Mount Google Drive (run cell below)
2. Set `ROOT_DIR` to your project folder path

**For local execution:** Skip the Google Drive cell

---

**Problem:** Chinese translations are truncated at the first comma

**Goal:** Diagnose the root cause

**Test cases:** Pairs 702, 1107, 616

In [None]:
# Colab-only setup: mount Google Drive and switch into the project folder.
# Outside Colab the google.colab import fails and the cell only reports that
# it is running locally.
try:
    import os
    from google.colab import drive

    drive.mount("/content/drive")
    # Project folder inside the mounted drive; edit to match your own layout.
    ROOT_DIR = "/content/drive/MyDrive/UofT/CSC2517/term_paper/code_zh_en"
    os.chdir(ROOT_DIR)
    print(f"Changed to: {os.getcwd()}")
except ImportError:
    print("Local environment")

In [None]:
# Sanity check: show where the notebook is running and whether the model
# folder is reachable from there (the path is relative to the working dir).
import os

working_dir = os.getcwd()
model_found = os.path.exists('../models/nllb-1.3B')
print(f"Working directory: {working_dir}")
print(f"Model exists: {model_found}")

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import numpy as np

In [None]:
# Load the tokenizer and the seq2seq model from the local checkpoint folder.
model_dir = "../models/nllb-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, attn_implementation="eager")

# Pick the best available backend: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model = model.to(device)
print(f"Device: {device}")

In [None]:
# English source sentences for the three problem pairs; 'id' is the pair
# number listed in the notebook header.
test_cases = [
    dict(
        id=702,
        en="At 10:00pm, Sun Yijie, who had been pregnant for four months, was released on bail of NT$200,000.",
    ),
    dict(
        id=1107,
        en="It is boundless. If you need and are brave enough to initiate crowdfunding, everything will become possible.",
    ),
    dict(
        id=616,
        en="It was Mr. Dong's real intention to sign the agreement, which was legal and effective after being signed and sealed by all the parties concerned.",
    ),
]
print(f"Loaded {len(test_cases)} test cases")

## CRITICAL DIAGNOSTIC: Token-by-Token Analysis

This will show exactly what the model generates and where it stops.

In [None]:
# Deep diagnostic on the first test case: translate EN->ZH, then list every
# token in the generated sequence so the exact stopping point is visible.
test_case = test_cases[0]

tokenizer.src_lang = "eng_Latn"
inputs = tokenizer(test_case['en'], return_tensors="pt").to(device)

print(f"English: {test_case['en']}")
print(f"Input tokens: {len(inputs.input_ids[0])}")
print()

# Generate with max_new_tokens; return_dict_in_generate gives access to
# outputs.sequences (and outputs.scores, kept for follow-up inspection).
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("zho_Hans"),
        max_new_tokens=200,
        output_scores=True,
        return_dict_in_generate=True
    )

print("=" * 80)
print("TOKEN-BY-TOKEN GENERATION:")
print("=" * 80)

output_ids = outputs.sequences[0].tolist()
# Pass the plain int list (not the tensor) so the tokenizer gets Python ints.
output_tokens = tokenizer.convert_ids_to_tokens(output_ids)
eos_id = tokenizer.eos_token_id

for i, (token_id, token) in enumerate(zip(output_ids, output_tokens)):
    # Position 0 is the decoder start token, not generated text. NLLB
    # checkpoints use </s> for it (same ID as EOS), so treating it as EOS
    # would end this loop before a single real token is printed.
    is_start = (i == 0)
    is_eos = (not is_start) and token_id == eos_id
    if is_start:
        marker = " <-- DECODER START"
    elif is_eos:
        marker = " <-- EOS TOKEN"
    else:
        marker = ""
    print(f"[{i:3d}] ID={token_id:6d} Token={repr(token):30s}{marker}")
    if is_eos:
        print(f"\n*** Generation stopped at position {i} ***")
        break

print()
decoded = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
# Check both the ASCII comma and the full-width Chinese comma (U+FF0C):
# Chinese output may end in either one.
ends_with_comma = decoded.rstrip().endswith((",", "\uff0c"))
print(f"Decoded: {decoded}")
print(f"Length: {len(decoded)} chars, {len(outputs.sequences[0])} tokens")
print(f"Ends with comma: {ends_with_comma}")

## Test Reverse Direction: ZH→EN

In [None]:
# Reverse-direction control: translate a Chinese sentence into English to see
# whether the truncation also shows up for ZH->EN.
chinese_test = "晚间10时许，怀有4月身孕的孙颐婕20万元交保，先行离开。"

print(f"Chinese input: {chinese_test}")
print()

# Source language must be switched before tokenizing the Chinese input.
tokenizer.src_lang = "zho_Hans"
inputs = tokenizer(chinese_test, return_tensors="pt").to(device)

generate_kwargs = {
    "forced_bos_token_id": tokenizer.convert_tokens_to_ids("eng_Latn"),
    "max_new_tokens": 200,
}
with torch.no_grad():
    outputs = model.generate(**inputs, **generate_kwargs)

generated = outputs[0]
translation = tokenizer.decode(generated, skip_special_tokens=True)
print(f"English output: {translation}")
print(f"Length: {len(translation)} chars, {len(generated)} tokens")
print()
print("Does ZH→EN work correctly or also truncate?")

## Test All Parameter Combinations

In [None]:
# Run the same EN->ZH translation under several generation settings to see
# whether any of them changes the output length.
configs = [
    dict(name='Original', params=dict(max_length=128)),
    dict(name='Longer max_length', params=dict(max_length=256)),
    dict(name='max_new_tokens', params=dict(max_new_tokens=200)),
    dict(
        name='max_new_tokens + constraints',
        params=dict(max_new_tokens=200, num_beams=1, do_sample=False, early_stopping=False),
    ),
]

test_case = test_cases[0]
tokenizer.src_lang = "eng_Latn"
inputs = tokenizer(test_case['en'], return_tensors="pt").to(device)

print(f"Testing: {test_case['en'][:60]}...")
print("=" * 80)

# The target-language token ID is the same for every configuration.
zh_bos_id = tokenizer.convert_tokens_to_ids("zho_Hans")

for config in configs:
    label = config['name']
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=zh_bos_id,
            **config['params']
        )

    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Long translations are previewed as their first 80 characters plus "...".
    if len(translation) > 80:
        shown = f"{translation[:80]}..."
    else:
        shown = translation
    print(f"{label:30s}: {shown}")
    print(f"{'':30s}  ({len(outputs[0])} tokens, {len(translation)} chars)")
    print()

## Check EOS Token Configuration

In [None]:
# Show the tokenizer's EOS settings, then check whether either comma
# character is encoded to the EOS token ID.
print("Tokenizer configuration:")
print(f"EOS token: {repr(tokenizer.eos_token)}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print()

# Full-width Chinese comma (U+FF0C)
ch_comma = "\uff0c"
ch_ids = tokenizer.encode(ch_comma, add_special_tokens=False)
ch_tokens = tokenizer.convert_ids_to_tokens(ch_ids)
print(f"Chinese comma (\uff0c): IDs={ch_ids}, Tokens={ch_tokens}")

# ASCII comma
en_comma = ","
en_ids = tokenizer.encode(en_comma, add_special_tokens=False)
en_tokens = tokenizer.convert_ids_to_tokens(en_ids)
print(f"English comma (,): IDs={en_ids}, Tokens={en_tokens}")
print()

comma_has_eos = any(tokenizer.eos_token_id in ids for ids in (ch_ids, en_ids))
if comma_has_eos:
    print("WARNING: Comma includes EOS token!")
else:
    print("Commas do not match EOS token ID")

## Summary

Based on the diagnostics above, the root cause should be visible in the token-by-token output.

In [None]:
# Closing checklist: the questions the diagnostics above are meant to answer.
rule = "=" * 80
summary_lines = [
    rule,
    "KEY QUESTIONS TO ANSWER:",
    rule,
    "",
    "1. In the token-by-token output, does EOS appear right after the comma?",
    "2. Does the ZH→EN translation work correctly?",
    "3. Do any parameter combinations produce longer outputs?",
    "",
    "If EOS appears after the comma, this is a model/tokenizer bug.",
    "If ZH→EN works but EN→ZH doesn't, it's specific to Chinese generation.",
]
for line in summary_lines:
    print(line)