# Inspect YT8M Interest-Summarizer SFT Data (JSONL)

This notebook loads your generated JSONL files and shows samples + simple stats.

**Files expected (inside `DATA_DIR`, a Windows path by default):**
- `yt8m_interest_sft.jsonl`
- `yt8m_chatml_train.jsonl`

If your paths differ, edit the `DATA_DIR` cell.


In [None]:
import os, json, random
from pathlib import Path

# 👉 Edit this if needed
# Folder that holds the generated JSONL files (a Windows path by default).
DATA_DIR = r"D:\repo\lixia_homejob\llm-rec-interest-qwen\data\processed"

# Both dataset files are expected directly inside DATA_DIR; the paths stay
# plain strings.
SFT_JSONL, CHATML_JSONL = (
    os.path.join(DATA_DIR, fname)
    for fname in ("yt8m_interest_sft.jsonl", "yt8m_chatml_train.jsonl")
)

# Echo every resolved path together with whether it is present on disk, so a
# wrong DATA_DIR is obvious before any loading is attempted.
print('SFT_JSONL:', SFT_JSONL)
print('exists:', os.path.exists(SFT_JSONL))
print('CHATML_JSONL:', CHATML_JSONL)
print('exists:', os.path.exists(CHATML_JSONL))


In [None]:
def read_jsonl(path, max_lines=None):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the ``.jsonl`` file.
        max_lines: If given, stop after scanning this many physical lines.
            Blank lines count toward the limit even though they produce no
            record, so fewer than ``max_lines`` records may be returned.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    records = []
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM,
    # which json.loads would otherwise reject on the first line.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for i, line in enumerate(f):
            if max_lines is not None and i >= max_lines:
                break
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but now pointing at the offending
                # line of the file rather than only a column in the line.
                raise json.JSONDecodeError(
                    f"{err.msg} (in {path}, line {i + 1})", err.doc, err.pos
                ) from err
    return records

# Load the whole SFT file into memory and report its size and record schema.
sft = read_jsonl(SFT_JSONL)
print('Loaded SFT rows:', len(sft))
# An empty file would make sft[0] raise IndexError; report that plainly instead.
print('Keys:', sft[0].keys() if sft else '(no rows)')


In [None]:
def show_one(rec):
    """Print the instruction, input and output of one SFT record.

    Each field is preceded by a banner line; a field missing from ``rec``
    is printed as an empty line.

    Args:
        rec: Mapping that may contain 'instruction', 'input' and 'output'.
    """
    sections = (
        ('=== INSTRUCTION ===', 'instruction'),
        ('\n=== INPUT ===', 'input'),
        ('\n=== OUTPUT ===', 'output'),
    )
    for banner, field in sections:
        print(banner)
        print(rec.get(field, ''))

# Preview one randomly chosen record. random.choice raises IndexError on an
# empty list, so skip the preview when nothing was loaded.
if sft:
    show_one(random.choice(sft))
else:
    print('No SFT rows to show.')


In [None]:
# Show multiple random samples
# random.sample draws without replacement, so the same record is never shown
# twice. Capping the count at len(sft) keeps small or empty datasets from
# raising.
for i, rec in enumerate(random.sample(sft, min(3, len(sft))), start=1):
    print('\n' + '='*90)
    print('SAMPLE', i)
    print('='*90)
    show_one(rec)


In [None]:
# Quick quality checks
def has_label_id_noise(text: str) -> bool:
    """Return True when ``text`` contains the substring 'label_' anywhere."""
    # str.find yields -1 only when the marker is absent.
    return text.find('label_') != -1

# Count the rows whose prompt / response text contains a 'label_' marker.
num_noise_in = len([row for row in sft if has_label_id_noise(row.get('input', ''))])
num_noise_out = len([row for row in sft if has_label_id_noise(row.get('output', ''))])
print('Rows with label_ in input:', num_noise_in)
print('Rows with label_ in output:', num_noise_out)

# A response made only of whitespace counts as empty.
empty_out = sum(not row.get('output', '').strip() for row in sft)
print('Rows with empty output:', empty_out)


In [None]:
# Length stats (rough, by characters)
# Character count of every record's prompt and response; a missing field
# counts as length 0.
in_lens = list(map(len, (rec.get('input', '') for rec in sft)))
out_lens = list(map(len, (rec.get('output', '') for rec in sft)))

def pct(x, p):
    """Return the p-th percentile of ``x`` using the lower rank.

    The values are sorted and the element at index
    ``floor(p * (len(x) - 1) / 100)`` is returned, with no interpolation.

    Args:
        x: Iterable of comparable values; it is not modified.
        p: Percentile in the range 0..100.

    Returns:
        The element of ``x`` at the computed rank.

    Raises:
        IndexError: If ``x`` is empty.
    """
    ordered = sorted(x)
    # Floor-divide instead of multiplying by p/100: float round-off such as
    # (29/100)*100 == 28.999999999999996 would otherwise truncate to the
    # wrong (lower) index.
    idx = int(p * (len(ordered) - 1) // 100)
    return ordered[idx]

# One summary line per field: label, then min / median / p95 / max separated
# by single spaces.
print(' '.join(map(str, (
    'Input length chars: min/median/p95/max',
    min(in_lens), pct(in_lens, 50), pct(in_lens, 95), max(in_lens),
))))
print(' '.join(map(str, (
    'Output length chars: min/median/p95/max',
    min(out_lens), pct(out_lens, 50), pct(out_lens, 95), max(out_lens),
))))


In [None]:
# Load ChatML file and preview one
chat = read_jsonl(CHATML_JSONL)
print('Loaded ChatML rows:', len(chat))

if chat:
    print('Keys:', chat[0].keys())

    # Preview at most the first 1200 characters of one random record's text.
    sample = random.choice(chat)['text']
    print(sample[:1200])
    # Only claim truncation when something was actually cut off.
    if len(sample) > 1200:
        print('\n... (truncated) ...')
else:
    # chat[0] and random.choice(chat) would both raise IndexError here.
    print('Keys:', '(no rows)')


## Tips
- If you still see many `label_###` tokens, consider regenerating SFT with filtering.
- If outputs feel too list-like, adjust the output template in your data builder script.
