In [None]:
import json
import os
import random
from collections import defaultdict
from tqdm import tqdm

In [None]:

# Paths: the source corpus that main() reads line by line as JSON, and the
# sampled subset that main() writes.
input_file = "train_indian_news_llm.jsonl"
output_file = "train_sampled.jsonl"

# Sampling parameters read by main().
total_samples = 100_000  # overall target; split evenly across the years found
minimum_year = 2000  # inclusive lower bound on the parsed [YEAR=...] value
maximum_year = 2024  # inclusive upper bound on the parsed [YEAR=...] value
random_seed = 42

# Seed Python's global RNG up front so sampling/shuffling done through the
# `random` module is repeatable from a fresh kernel.
random.seed(random_seed)

In [None]:
# ----------------------------------------


def extract_year(input_text: str):
    """Return the integer inside the first ``[YEAR=...]`` tag of ``input_text``.

    Args:
        input_text: Text expected to contain a tag such as ``[YEAR=2015]``.

    Returns:
        The tag's contents converted with ``int()``, or ``None`` when
        ``input_text`` is not a string, contains no ``[YEAR=`` marker, has no
        closing ``]`` after the marker, or the text between them is not an
        integer.
    """
    # Non-string values (e.g. a JSON null) cannot carry a year tag.
    if not isinstance(input_text, str):
        return None

    marker = "[YEAR="
    try:
        start = input_text.index(marker) + len(marker)
        end = input_text.index("]", start)
        return int(input_text[start:end])
    except ValueError:
        # ValueError is what str.index raises when the marker/bracket is
        # missing and what int() raises on non-numeric contents. Any other
        # exception is a genuine bug and is allowed to propagate.
        return None


In [1]:
def main():
    """Write a year-balanced random subset of ``input_file`` to ``output_file``.

    Each line of ``input_file`` is parsed as JSON and grouped by the year that
    ``extract_year`` finds in its ``"input"`` field; only years within
    ``[minimum_year, maximum_year]`` (inclusive) are kept. Every year
    contributes at most ``total_samples // number_of_years`` records, so the
    result is smaller than ``total_samples`` whenever some years hold fewer
    records than that per-year quota. The selection is shuffled and written
    as one JSON object per line.

    Raises:
        RuntimeError: If no record carries a year inside the configured range.
    """
    # A dedicated generator seeded here (instead of relying on a seed set in
    # another cell) makes this cell idempotent: re-running it alone yields the
    # same sample as a fresh Restart & Run All.
    rng = random.Random(random_seed)

    year_buckets = defaultdict(list)
    with open(input_file, encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading JSONL"):
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            record = json.loads(line)
            year = extract_year(record.get("input", ""))
            # Compare against None explicitly so a falsy year (0) is judged
            # by the range check rather than silently dropped.
            if year is not None and minimum_year <= year <= maximum_year:
                year_buckets[year].append(record)

    years = sorted(year_buckets)
    if not years:
        raise RuntimeError("no valid year found in dataset")

    # Integer division: any remainder of total_samples is not distributed.
    samples_per_year = total_samples // len(years)

    print(f"Years found: {years}")
    print(f"Samples per year (target): {samples_per_year}")

    final_samples = []
    for year in years:
        bucket = year_buckets[year]
        if len(bucket) <= samples_per_year:
            # Under-populated year: keep everything it has.
            sampled = bucket
        else:
            sampled = rng.sample(bucket, samples_per_year)

        final_samples.extend(sampled)
        print(f"Year {year}: {len(sampled)} samples")

    # Shuffle so the output file is not ordered by year.
    rng.shuffle(final_samples)

    with open(output_file, "w", encoding="utf-8") as f:
        for record in final_samples:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"\nFinal sampled dataset size: {len(final_samples)}")
    print(f"Saved to: {output_file}")


if __name__ == "__main__":
    main()


Reading JSONL: 347155it [00:06, 57049.06it/s]


Years found: [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Samples per year (target): 4000
Year 2000: 2156 samples
Year 2001: 2300 samples
Year 2002: 4000 samples
Year 2003: 3933 samples
Year 2004: 4000 samples
Year 2005: 4000 samples
Year 2006: 2185 samples
Year 2007: 2076 samples
Year 2008: 3363 samples
Year 2009: 3478 samples
Year 2010: 4000 samples
Year 2011: 4000 samples
Year 2012: 4000 samples
Year 2013: 4000 samples
Year 2014: 4000 samples
Year 2015: 4000 samples
Year 2016: 4000 samples
Year 2017: 4000 samples
Year 2018: 4000 samples
Year 2019: 4000 samples
Year 2020: 4000 samples
Year 2021: 4000 samples
Year 2022: 4000 samples
Year 2023: 4000 samples
Year 2024: 4000 samples

Final sampled dataset size: 91491
Saved to: train_sampled.jsonl


In [2]:
import json

# Sanity check: every line of the sampled file must parse as standalone JSON.
with open("train_sampled.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        try:
            json.loads(line)
        except Exception:
            # Report the 1-based line number, then re-raise so the parser's
            # own error and traceback stay visible.
            print("Broken line:", i)
            raise
print("JSONL looks clean.")

JSONL looks clean.


In [1]:
import torch
# Environment check: report CUDA availability and the installed PyTorch build.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Torch version: {torch.__version__}")

CUDA available: True
Torch version: 2.5.1+cu121
