In [41]:
import json
from tqdm import tqdm
from transformers import AutoTokenizer  

In [42]:
# --- Run configuration -------------------------------------------------------
# Hugging Face model id whose tokenizer is loaded below.
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# JSON file read as input, and JSON file the annotated records are written to.
INPUT_PATH = "final_results.json"
OUTPUT_PATH = "final_results_with_token_index.json"

# A progress message is emitted each time the retained count hits a multiple
# of this value.
LOG_EVERY_N = 1_000

In [43]:
# Load the tokenizer for MODEL_NAME, requesting the "fast" implementation,
# then report the vocabulary size as given by len(tokenizer).
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)
print(f"Loaded tokenizer: {MODEL_NAME}  (vocab = {len(tokenizer)})")

Loaded tokenizer: meta-llama/Llama-3.2-3B-Instruct  (vocab = 128256)


In [44]:
# Read the whole input file as UTF-8 JSON; `data` is iterated record by record
# further down, and its length is reported here.
with open(INPUT_PATH, mode="r", encoding="utf8") as infile:
    data = json.load(infile)

print(f"Loaded {len(data)} records from {INPUT_PATH}")

Loaded 18317 records from final_results.json


In [45]:
def find_token_start(big_ids, sub_ids):
    """Locate the first occurrence of one sequence inside another.

    Both arguments are normalised to lists before comparing, so a list and a
    tuple (or any other pair of sequence types) holding equal elements still
    match. Without that normalisation a list slice never compares equal to a
    tuple and the search would silently report "not found".

    Args:
        big_ids: Sequence to search in (e.g. token ids of the full text).
        sub_ids: Sequence to search for (e.g. token ids of the search term).
            ``None`` or an empty sequence is treated as "no match".

    Returns:
        The smallest index ``i`` such that ``big_ids[i:i + len(sub_ids)]``
        equals ``sub_ids`` element-wise, or ``-1`` if there is no such index
        or ``sub_ids`` is empty/``None``.
    """
    if sub_ids is None:
        return -1

    needle = list(sub_ids)
    if not needle:
        return -1

    haystack = list(big_ids)
    sub_len = len(needle)
    first = needle[0]

    # range() is empty when the needle is longer than the haystack.
    for i in range(len(haystack) - sub_len + 1):
        # Cheap first-element check avoids building a slice at every position.
        if haystack[i] == first and haystack[i : i + sub_len] == needle:
            return i
    return -1

# Build `results`: one enriched copy of every record that is flagged correct
# and whose search term is found, token for token, inside its source text.
# Messages go through tqdm.write() so they are printed without breaking the
# progress bar into fragments.
results = []
for idx, elem in enumerate(tqdm(data, desc="Processing")):
    # Records not flagged as correct are dropped.
    if not elem["is_correct"]:
        tqdm.write(f"Skipping idx={idx} (not correct)")
        continue

    # "$" is stripped from both strings before tokenizing.
    src = elem["source_text"].replace("$", "")
    query = elem["extracted_search_term"].replace("$", "")

    # Encode without special tokens so only the text itself is compared.
    ids_full = tokenizer.encode(src, add_special_tokens=False)
    ids_sub = tokenizer.encode(query, add_special_tokens=False)

    start_idx = find_token_start(ids_full, ids_sub)
    if start_idx == -1:
        # NOTE(review): the term is tokenized on its own here; it presumably
        # can tokenize differently inside the full text, which would explain
        # these misses -- confirm whether a character-offset lookup is wanted.
        tqdm.write(f"No token match at idx={idx}, term='{query}'")
        continue

    # Inclusive index of the last matched token.
    end_idx = start_idx + len(ids_sub) - 1

    # Shallow copy so the input record in `data` is left untouched.
    out_elem = dict(elem)

    # Store the "$"-stripped strings that the indices actually refer to.
    out_elem["source_text"] = src
    out_elem["extracted_search_term"] = query

    out_elem["start_index"] = start_idx
    out_elem["end_index"] = end_idx
    out_elem["decoded_value"] = tokenizer.decode(
        ids_full[start_idx : end_idx + 1], skip_special_tokens=True
    )

    out_elem["tokens"] = ids_full
    results.append(out_elem)

    if (len(results) % LOG_EVERY_N) == 0:
        tqdm.write(f"Processed OK: {len(results)} items retained")

print(f"Finished. Retained {len(results)} / {len(data)} items in total.")

Processing:   0%|          | 0/18317 [00:00<?, ?it/s]

Skipping idx=49 (not correct)
Skipping idx=50 (not correct)
No token match at idx=74, term='.4 liters * 1000 ml/liter'
Skipping idx=86 (not correct)


Processing:   1%|          | 138/18317 [00:00<00:13, 1376.24it/s]

Skipping idx=187 (not correct)
Skipping idx=189 (not correct)


Processing:   2%|▏         | 276/18317 [00:00<00:13, 1374.36it/s]

No token match at idx=295, term='.50-.25'


Processing:   3%|▎         | 544/18317 [00:00<00:13, 1290.02it/s]

Skipping idx=529 (not correct)


Processing:   4%|▎         | 679/18317 [00:00<00:13, 1310.52it/s]

No token match at idx=571, term='200(.60)'
Skipping idx=615 (not correct)
Skipping idx=808 (not correct)
Skipping idx=809 (not correct)


Processing:   5%|▍         | 836/18317 [00:00<00:12, 1395.99it/s]

Skipping idx=828 (not correct)


Processing:   5%|▌         | 976/18317 [00:00<00:12, 1361.84it/s]

Skipping idx=888 (not correct)
Skipping idx=1000 (not correct)
Processed OK: 1000 items retained


Processing:   7%|▋         | 1264/18317 [00:01<00:14, 1187.38it/s]

Skipping idx=1234 (not correct)
Skipping idx=1244 (not correct)
No token match at idx=1266, term='.5*2'


Processing:   8%|▊         | 1418/18317 [00:01<00:13, 1280.02it/s]

No token match at idx=1394, term='3(98)'
Skipping idx=1484 (not correct)
Skipping idx=1507 (not correct)
Skipping idx=1516 (not correct)


Processing:   8%|▊         | 1554/18317 [00:01<00:12, 1296.08it/s]

Skipping idx=1536 (not correct)
Skipping idx=1556 (not correct)


Processing:   9%|▉         | 1689/18317 [00:01<00:13, 1247.33it/s]

Skipping idx=1670 (not correct)
Skipping idx=1671 (not correct)


Processing:  11%|█▏        | 2095/18317 [00:01<00:13, 1218.99it/s]

Skipping idx=1980 (not correct)
Processed OK: 2000 items retained
No token match at idx=2043, term='.20*30'
Skipping idx=2086 (not correct)
No token match at idx=2101, term='4/(1/8)'
No token match at idx=2111, term='.85*30'
No token match at idx=2112, term='.45*30'
Skipping idx=2158 (not correct)
Skipping idx=2162 (not correct)
Skipping idx=2166 (not correct)


Processing:  14%|█▍        | 2533/18317 [00:01<00:11, 1369.59it/s]

Skipping idx=2299 (not correct)
Skipping idx=2364 (not correct)
No token match at idx=2436, term='Caleb’s 7 gallons + Cynthia’s 8 gallons'
Skipping idx=2535 (not correct)
No token match at idx=2549, term='.5*2'


Processing:  16%|█▋        | 3004/18317 [00:02<00:10, 1476.89it/s]

Skipping idx=2737 (not correct)
Skipping idx=2767 (not correct)
Skipping idx=2822 (not correct)
Skipping idx=2888 (not correct)
Skipping idx=2905 (not correct)


Processing:  18%|█▊        | 3309/18317 [00:02<00:10, 1494.45it/s]

Processed OK: 3000 items retained
Skipping idx=3062 (not correct)
Skipping idx=3092 (not correct)
Skipping idx=3093 (not correct)
Skipping idx=3094 (not correct)
Skipping idx=3096 (not correct)
Skipping idx=3128 (not correct)
Skipping idx=3158 (not correct)
Skipping idx=3165 (not correct)
No token match at idx=3332, term='.5*354,000'


Processing:  20%|█▉        | 3624/18317 [00:02<00:09, 1508.31it/s]

Skipping idx=3417 (not correct)
No token match at idx=3445, term='14*50%'
No token match at idx=3517, term='.10*150'
Skipping idx=3560 (not correct)
Skipping idx=3565 (not correct)
Skipping idx=3717 (not correct)


Processing:  22%|██▏       | 3951/18317 [00:02<00:09, 1496.76it/s]

Skipping idx=3733 (not correct)
Skipping idx=3774 (not correct)


Processing:  22%|██▏       | 4114/18317 [00:03<00:09, 1534.10it/s]

Processed OK: 4000 items retained
No token match at idx=4067, term='.36*100'
Skipping idx=4098 (not correct)
Skipping idx=4135 (not correct)
No token match at idx=4137, term='.50*5'
Skipping idx=4163 (not correct)


Processing:  26%|██▌       | 4730/18317 [00:03<00:09, 1438.66it/s]

Skipping idx=4411 (not correct)
Skipping idx=4539 (not correct)
Skipping idx=4602 (not correct)
Skipping idx=4619 (not correct)
Skipping idx=4709 (not correct)


Processing:  28%|██▊       | 5040/18317 [00:03<00:08, 1484.48it/s]

Skipping idx=4769 (not correct)
Skipping idx=4814 (not correct)
Skipping idx=4884 (not correct)
Skipping idx=4952 (not correct)
Skipping idx=5052 (not correct)
Processed OK: 5000 items retained


Processing:  29%|██▉       | 5357/18317 [00:03<00:08, 1535.15it/s]

No token match at idx=5117, term='.50*4'
No token match at idx=5133, term='.30*80'
No token match at idx=5153, term='40(.50)'
Skipping idx=5291 (not correct)
Skipping idx=5304 (not correct)
No token match at idx=5332, term='.20*30'
No token match at idx=5334, term='.50*36'
Skipping idx=5389 (not correct)
Skipping idx=5403 (not correct)


Processing:  31%|███       | 5671/18317 [00:04<00:08, 1523.41it/s]

No token match at idx=5499, term='.4 * 80'
Skipping idx=5607 (not correct)
Skipping idx=5608 (not correct)
No token match at idx=5632, term='.45*20'
Skipping idx=5659 (not correct)
No token match at idx=5716, term='.20*120.00'
Skipping idx=5759 (not correct)
Skipping idx=5781 (not correct)
Skipping idx=5784 (not correct)
Skipping idx=5798 (not correct)
No token match at idx=5807, term='.20*60'


Processing:  35%|███▍      | 6339/18317 [00:04<00:07, 1589.41it/s]

Processed OK: 6000 items retained
Skipping idx=6284 (not correct)
Skipping idx=6299 (not correct)
No token match at idx=6307, term='3/(1/3)'
No token match at idx=6325, term='* 2'


Processing:  36%|███▋      | 6661/18317 [00:04<00:07, 1594.83it/s]

Skipping idx=6457 (not correct)
Skipping idx=6554 (not correct)
Skipping idx=6584 (not correct)
Skipping idx=6585 (not correct)
No token match at idx=6607, term='.5*2220'
No token match at idx=6614, term='8(5)'
No token match at idx=6632, term='.5 hours * 60 minutes/hour'
No token match at idx=6649, term='.25*24'
Skipping idx=6740 (not correct)


Processing:  38%|███▊      | 6981/18317 [00:04<00:07, 1572.77it/s]

No token match at idx=6800, term='.10*180'
Skipping idx=6882 (not correct)
Skipping idx=7046 (not correct)
No token match at idx=7082, term='.10*1200'
Skipping idx=7104 (not correct)
Processed OK: 7000 items retained


Processing:  40%|███▉      | 7305/18317 [00:05<00:06, 1583.95it/s]

Skipping idx=7138 (not correct)
No token match at idx=7151, term='.5*60'
No token match at idx=7189, term='.10*500'
Skipping idx=7268 (not correct)
Skipping idx=7278 (not correct)
No token match at idx=7304, term='.20*525'
Skipping idx=7365 (not correct)
Skipping idx=7426 (not correct)


Processing:  43%|████▎     | 7787/18317 [00:05<00:06, 1581.57it/s]

Skipping idx=7599 (not correct)
Skipping idx=7604 (not correct)
Skipping idx=7722 (not correct)
Skipping idx=7782 (not correct)


Processing:  44%|████▍     | 8104/18317 [00:05<00:06, 1507.43it/s]

No token match at idx=7959, term='.25*1352'
Processed OK: 8000 items retained
No token match at idx=8208, term='P + 3 + P + 6 + P + 9 + P + 12'
No token match at idx=8209, term='P + 3 + P + 6 + P + 9 + P + 12'
No token match at idx=8210, term='P + 9 + 3'
Skipping idx=8214 (not correct)


Processing:  46%|████▌     | 8417/18317 [00:05<00:07, 1253.43it/s]

Skipping idx=8292 (not correct)
Skipping idx=8341 (not correct)
No token match at idx=8396, term='.5 minutes/flower * 56 flowers'
Skipping idx=8493 (not correct)


Processing:  48%|████▊     | 8726/18317 [00:06<00:06, 1379.39it/s]

No token match at idx=8541, term='.5 hours * 60 minutes/hour'
Skipping idx=8551 (not correct)
No token match at idx=8602, term='40(.50)'
Skipping idx=8676 (not correct)
Skipping idx=8763 (not correct)
Skipping idx=8842 (not correct)


Processing:  49%|████▉     | 9027/18317 [00:06<00:06, 1411.46it/s]

Skipping idx=8853 (not correct)
No token match at idx=8893, term='¢25 x 3'
No token match at idx=8894, term='¢75 x 2'
No token match at idx=8895, term='¢75 + ¢150 + ¢50'
No token match at idx=8896, term='¢275/¢25'
Skipping idx=8904 (not correct)
Skipping idx=8978 (not correct)
Skipping idx=9089 (not correct)


Processing:  51%|█████     | 9319/18317 [00:06<00:06, 1426.05it/s]

Processed OK: 9000 items retained
Skipping idx=9190 (not correct)
Skipping idx=9294 (not correct)
Skipping idx=9331 (not correct)
Skipping idx=9345 (not correct)
Skipping idx=9439 (not correct)


Processing:  53%|█████▎    | 9633/18317 [00:06<00:05, 1462.36it/s]

Skipping idx=9476 (not correct)


Processing:  54%|█████▍    | 9951/18317 [00:06<00:05, 1501.50it/s]

Skipping idx=9799 (not correct)
Skipping idx=9805 (not correct)
Skipping idx=9981 (not correct)
Skipping idx=10059 (not correct)
Skipping idx=10064 (not correct)


Processing:  57%|█████▋    | 10421/18317 [00:07<00:05, 1520.15it/s]

Skipping idx=10134 (not correct)
Skipping idx=10135 (not correct)
Processed OK: 10000 items retained
Skipping idx=10167 (not correct)
No token match at idx=10170, term='.2*1000000'
Skipping idx=10294 (not correct)
Skipping idx=10372 (not correct)
Skipping idx=10426 (not correct)


Processing:  59%|█████▊    | 10722/18317 [00:07<00:05, 1440.61it/s]

Skipping idx=10498 (not correct)
No token match at idx=10516, term='.20*60'
Skipping idx=10543 (not correct)
Skipping idx=10640 (not correct)
Skipping idx=10641 (not correct)


Processing:  60%|██████    | 11009/18317 [00:07<00:05, 1360.81it/s]

Skipping idx=10838 (not correct)
Skipping idx=10950 (not correct)
Skipping idx=10994 (not correct)
Skipping idx=11053 (not correct)
Skipping idx=11055 (not correct)


Processing:  62%|██████▏   | 11296/18317 [00:07<00:05, 1385.93it/s]

Skipping idx=11127 (not correct)
Skipping idx=11170 (not correct)
Processed OK: 11000 items retained
Skipping idx=11303 (not correct)


Processing:  64%|██████▍   | 11748/18317 [00:08<00:04, 1448.71it/s]

Skipping idx=11584 (not correct)
Skipping idx=11649 (not correct)
Skipping idx=11668 (not correct)
No token match at idx=11672, term='6/(1/4)'
Skipping idx=11713 (not correct)
Skipping idx=11734 (not correct)
Skipping idx=11795 (not correct)
Skipping idx=11804 (not correct)
Skipping idx=11869 (not correct)
Skipping idx=11886 (not correct)
Skipping idx=11887 (not correct)


Processing:  66%|██████▌   | 12052/18317 [00:08<00:05, 1212.80it/s]

Skipping idx=11890 (not correct)
Skipping idx=11992 (not correct)


Processing:  67%|██████▋   | 12358/18317 [00:08<00:04, 1361.06it/s]

No token match at idx=12150, term='.10*20'
Processed OK: 12000 items retained
No token match at idx=12207, term='.25*344'
No token match at idx=12335, term='.50*100'
Skipping idx=12457 (not correct)


Processing:  69%|██████▉   | 12667/18317 [00:08<00:03, 1437.99it/s]

Skipping idx=12498 (not correct)
Skipping idx=12500 (not correct)
No token match at idx=12505, term='.25*32'
Skipping idx=12507 (not correct)
Skipping idx=12611 (not correct)
No token match at idx=12686, term='6/(1/3)'
No token match at idx=12751, term='.5*4'
Skipping idx=12790 (not correct)
Skipping idx=12813 (not correct)
No token match at idx=12815, term='.10 x 1980'


Processing:  72%|███████▏  | 13156/18317 [00:09<00:03, 1522.70it/s]

No token match at idx=12974, term='.20*95'
Skipping idx=13042 (not correct)
Skipping idx=13067 (not correct)
No token match at idx=13100, term='.25*1120'
Skipping idx=13104 (not correct)
Processed OK: 13000 items retained
No token match at idx=13265, term='.30*50'
Skipping idx=13291 (not correct)


Processing:  74%|███████▍  | 13627/18317 [00:09<00:03, 1537.00it/s]

No token match at idx=13419, term='.10*1700'
Skipping idx=13598 (not correct)
Skipping idx=13612 (not correct)
Skipping idx=13636 (not correct)
Skipping idx=13648 (not correct)
No token match at idx=13651, term='.20*40'


Processing:  77%|███████▋  | 14114/18317 [00:09<00:02, 1559.91it/s]

No token match at idx=13879, term='.15*2000'
Skipping idx=13921 (not correct)
Skipping idx=13978 (not correct)
No token match at idx=14078, term='.25*3='
No token match at idx=14079, term='.75*14'
No token match at idx=14080, term='.5*2'
No token match at idx=14082, term='.5+10.5+14'
Skipping idx=14117 (not correct)
Skipping idx=14159 (not correct)
Skipping idx=14183 (not correct)


Processing:  79%|███████▉  | 14428/18317 [00:10<00:02, 1540.89it/s]

Processed OK: 14000 items retained
Skipping idx=14267 (not correct)
No token match at idx=14286, term='.20*20'
No token match at idx=14288, term='.10*50'
Skipping idx=14322 (not correct)
Skipping idx=14380 (not correct)
Skipping idx=14403 (not correct)
No token match at idx=14466, term='.25 hours * 28 miles per hour'
No token match at idx=14467, term='.25 hours * 60 miles per hour'
Skipping idx=14482 (not correct)


Processing:  80%|████████  | 14733/18317 [00:10<00:02, 1441.95it/s]

No token match at idx=14570, term='.20*500'
No token match at idx=14573, term='.30*500'
Skipping idx=14673 (not correct)
Skipping idx=14777 (not correct)
Skipping idx=14860 (not correct)


Processing:  83%|████████▎ | 15232/18317 [00:10<00:01, 1546.71it/s]

Skipping idx=14970 (not correct)
Skipping idx=14999 (not correct)
Skipping idx=15020 (not correct)
Skipping idx=15076 (not correct)
No token match at idx=15118, term='.25*16'
No token match at idx=15120, term='.50*12'
Skipping idx=15130 (not correct)
Skipping idx=15164 (not correct)
Skipping idx=15211 (not correct)
No token match at idx=15230, term='2*2.5'
Processed OK: 15000 items retained


Processing:  85%|████████▍ | 15537/18317 [00:10<00:01, 1446.36it/s]

Skipping idx=15321 (not correct)
Skipping idx=15373 (not correct)
No token match at idx=15385, term='.70*200'
Skipping idx=15564 (not correct)


Processing:  86%|████████▌ | 15683/18317 [00:10<00:01, 1400.46it/s]

Skipping idx=15597 (not correct)
Skipping idx=15681 (not correct)
Skipping idx=15683 (not correct)
Skipping idx=15685 (not correct)
Skipping idx=15736 (not correct)
Skipping idx=15789 (not correct)


Processing:  89%|████████▉ | 16282/18317 [00:11<00:01, 1325.93it/s]

Skipping idx=15973 (not correct)
Skipping idx=15990 (not correct)
Skipping idx=16069 (not correct)
Skipping idx=16072 (not correct)
No token match at idx=16133, term='.20*50'
No token match at idx=16134, term='.10*50'
Processed OK: 16000 items retained
Skipping idx=16291 (not correct)


Processing:  91%|█████████▏| 16724/18317 [00:11<00:01, 1373.31it/s]

No token match at idx=16509, term='.2*40000'
Skipping idx=16546 (not correct)
Skipping idx=16547 (not correct)
Skipping idx=16564 (not correct)
Skipping idx=16579 (not correct)
Skipping idx=16612 (not correct)
Skipping idx=16613 (not correct)
No token match at idx=16635, term='.30*50'
No token match at idx=16646, term='( 0.1 ) * 12'
Skipping idx=16650 (not correct)
Skipping idx=16652 (not correct)
Skipping idx=16653 (not correct)
Skipping idx=16654 (not correct)
Skipping idx=16682 (not correct)
Skipping idx=16707 (not correct)


Processing:  94%|█████████▍| 17186/18317 [00:12<00:00, 1476.41it/s]

Skipping idx=17002 (not correct)
Skipping idx=17079 (not correct)
Skipping idx=17220 (not correct)
No token match at idx=17239, term='70(.20)'
No token match at idx=17243, term='.40*100'
No token match at idx=17244, term='.20*40'
No token match at idx=17250, term='.10*30'
Processed OK: 17000 items retained


Processing:  96%|█████████▋| 17643/18317 [00:12<00:00, 1490.13it/s]

No token match at idx=17402, term='8(100)'
Skipping idx=17409 (not correct)
Skipping idx=17447 (not correct)
Skipping idx=17557 (not correct)
Skipping idx=17597 (not correct)
Skipping idx=17603 (not correct)
No token match at idx=17616, term='.75 * 4'
Skipping idx=17689 (not correct)
No token match at idx=17712, term='.20*350'


Processing:  99%|█████████▉| 18092/18317 [00:12<00:00, 1469.64it/s]

Skipping idx=17838 (not correct)
No token match at idx=17851, term='(4+2)+3'
Skipping idx=17872 (not correct)
Skipping idx=18064 (not correct)


Processing: 100%|██████████| 18317/18317 [00:12<00:00, 1428.63it/s]

Skipping idx=18240 (not correct)
Skipping idx=18250 (not correct)
Skipping idx=18251 (not correct)
Skipping idx=18289 (not correct)
Processed OK: 18000 items retained
Skipping idx=18315 (not correct)
Finished. Retained 18005 / 18317 items in total.





In [46]:
# Serialise the retained records as UTF-8 JSON. ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes; indent=2 pretty-prints.
with open(OUTPUT_PATH, mode="w", encoding="utf8") as outfile:
    json.dump(
        results,
        outfile,
        ensure_ascii=False,
        indent=2,
    )

print(f"Output written ➜  {OUTPUT_PATH}")

Output written ➜  final_results_with_token_index.json
