# Download and Inspect the Collection

The dataset was created from the Chronicling America collection — over 21 million digitized newspaper pages (1756–1963) curated by the Library of Congress and NEH. They used 39,330 pages (1800–1920), representing 53 US states, to ensure wide geographic and temporal coverage.

Source: https://dl.acm.org/doi/pdf/10.1145/3626772.3657891

GitHub: https://github.com/DataScienceUIBK/ChroniclingAmericaQA?tab=readme-ov-file

In [17]:
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o validation.json

import json

files = ["train.json", "validation.json", "test.json"]

for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Read a few hundred characters to see what kind of JSON it is
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head[:500])
        # Try to load only part of the file
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            print(f"\nLoaded {len(data)} items (list).")
            print("Dictionary keys:", list(data[0].keys()))
            print(json.dumps(data[0], indent=2)[:600])
        elif isinstance(data, dict):
            print("\nTop-level is a dictionary. Keys:", list(data.keys()))
            for k, v in data.items():
                if isinstance(v, list):
                    print(f"Key '{k}' contains a list of {len(v)} items.")
                    if v:
                        print("First item keys:", list(v[0].keys()))
                        print(json.dumps(v[0], indent=2)[:600])
                        break
        else:
            print(f"Unexpected top-level type: {type(data)}")
    except Exception as e:
        print(f"Could not parse {path} as JSON: {e}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1350  100  1350    0     0   4615      0 --:--:-- --:--:-- --:--:--  4655

  2 71.5M    2 2028k    0     0  4819k      0  0:00:15 --:--:--  0:00:15 4819k
 79 71.5M   79 57.2M    0     0  40.2M      0  0:00:01  0:00:01 --:--:-- 55.1M
100 71.5M  100 71.5M    0     0  42.7M      0  0:00:01  0:00:01 --:--:-- 55.5M


^C


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  1348  100  1348    0     0   7987      0 --:--:-- --:--:-- --:--:--  8071

  0 1315M    0 11.9M    0     0  17.5M      0  0:01:15 --:--:--  0:01:15 17.5M
  5 1315M    5 74.1M    0     0  44.1M      0  0:00:29  0:00:01  0:00:28 62.3M
 10 1315M   10  138M    0     0  51.7M      0  0:00:25  0:00:02  0:00:23 63.4M
 15 1315M   15  204M    0     0  55.5M      0  0:00:23  0:00:03  0:00:20 64.1M
 20 1315M   20  267M    0     0  57.0M      0  0:00:23  0:00:04  0:00:19 63.8M
 24 1315M   24  326M    0     0  57.5M      0  0:00:22  0:00:05  0:00:17 62.9M
 29 1315M   29  385M    0     0  57.7M      0  0:00:22  0:00:06  0:00:16 62.3M
 33 1315M   33  446M    0     0  58.1M      0  0:00:22  0:00:07  0:00:15 61.5M
 38 1315M   38  509M    0     0  58.7M      0  0:0


===== train.json =====
Preview of first 500 characters:

[
    {
        "query_id": "train_1",
        "question": "Who is the author of the book, \"Horrors of Slavery, or the American Turf in Tripoli\"?",
        "answer": "WILLIAM RAY",
        "org_answer": "WILLIAM RAY",
        "para_id": "New_Hampshire_18070804_1",
        "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, 

Loaded 439302 items (list).
Dictionary keys: ['query_id', 'question', 'answer', 'org_answer', 'para_id', 'context', 'raw_ocr', 'publication_date', 'trans_que', 'trans_ans', 'url']
{
  "query_id": "train_1",
  "question": "Who is the author of the book, \"Horrors of Slavery, or the American Turf in Tripoli\"?",
  "answer": "WILLIAM RAY",
  "org_answer": "WILLIAM RAY",
  "para_id": "New_Hampshire_18070804_1",
  "context": "Aiscellaneous R

# Create the Document Collection

To do that, we create a new json file that contains the 'para_id', 'context', 'raw_ocr', 'publication_date' keys, for all para_id in the collection.

para_id: is the id of a paragraph of a news paper page.

In [None]:
import json
import os

inputs = ["train.json", "validation.json", "test.json"]
output = "document_collection.json"

def load_list_or_empty(path):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        print(f"Skipping {path} because it is missing or empty")
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

def project(recs):
    out = []
    for r in recs:
        out.append({
            "para_id": r.get("para_id", ""),
            "context": r.get("context", ""),
            "raw_ocr": r.get("raw_ocr", ""),
            "publication_date": r.get("publication_date", "")
        })
    return out

all_recs = []
for p in inputs:
    recs = load_list_or_empty(p)
    print(f"Loaded {len(recs)} records from {p}")
    all_recs.extend(project(recs))

# deduplicate by para_id keeping the first one seen
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if pid and pid not in uniq:
        uniq[pid] = rec

result = list(uniq.values())

with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
print(json.dumps(result[:3], indent=2))

## You should check that the collection you have matches that of the paper!

# Create the Test Queries Data Structure

We keep the first 10.000 queries due to memory errors in the free colab version.

To be comparable, please keep the top 10.000 queries for evaluation.

In [None]:
import json
import re
import unicodedata
import string

input_file = "test.json"
output_file = "test_queries.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces
    return text.strip()

# Extract and clean
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]

# Sort by query_id (assuming numeric)
queries = sorted(queries, key=lambda x: int(x["query_id"]) if str(x["query_id"]).isdigit() else x["query_id"])

# Keep only the first 10,000
queries = queries[:10000]

# Save new JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

# Create the Qrels for the test set

In [None]:
input_file = "test.json"
qrels_file = "test_qrels.json"
answers_file = "test_query_answers.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the qrels file: query_id, iteration=0, para_id, relevance=1
qrels = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1
    }
    for item in data
]

# Build the query_answers file: same plus answer and org_answer
query_answers = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", "")
    }
    for item in data
]

# Save both files
with open(qrels_file, "w", encoding="utf-8") as f:
    json.dump(qrels, f, ensure_ascii=False, indent=2)

with open(answers_file, "w", encoding="utf-8") as f:
    json.dump(query_answers, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

# Retrieval - Good Luck!

In [None]:
#FIX F'ING JAVA_HOME FOR LIL JUPITER
import os
os.environ["JAVA_HOME"] = r"C:\Users\Luigi\miniforge3\envs\retroir\Library\lib\jvm"
os.environ["PATH"] = os.environ["JAVA_HOME"] + r"\bin;" + os.environ["PATH"]


In [None]:
import pyterrier as pt
import pandas as pd
import json
import shutil
import os

In [29]:
with open("document_collection.json", "r", encoding = "utf-8") as file:
    data = json.load (file)
df = pd.DataFrame.from_dict(data)


with open("test_queries.json", "r", encoding = "utf-8") as file:
    data = json.load (file)
test_queries = pd.DataFrame.from_dict(data)
test_queries.rename(columns={"query_id": "qid", "question": "query"}, inplace=True)



with open("test_qrels.json", "r", encoding = "utf-8") as file:
    data = json.load (file)
test_qrels = pd.DataFrame.from_dict(data)
test_qrels.rename(columns={"query_id": "qid", "para_id": "docno"}, inplace=True)

In [None]:
# Prepare dataframe for PyTerrier: needs columns 'docno' and 'text'
corpus_dataframe = df.rename(columns={"para_id": "docno", "context": "text"})[["docno", "text"]]

longest_len = corpus_dataframe["docno"].str.len().max()

# Create or reset an index folder
index_path = os.path.abspath("terrier_index")  # absolute path
print(index_path)
if os.path.exists(index_path):
    shutil.rmtree(index_path)
os.makedirs(index_path, exist_ok=True)

# Build the index
# Store docno as metadata so we can recover it later if needed
# Build the index using the updated IterDictIndexer signature
# Key parameters now are: meta, text_attrs, meta_reverse, pretokenised, fields, threads
indexer = pt.IterDictIndexer(
    index_path,
    meta={"docno": longest_len},  #TO CHECK          # store docno as metadata (up to 200 characters)
    text_attrs=["text"],           # which field(s) contain the text
    meta_reverse=["docno"],        # enable reverse lookup on docno
    pretokenised=False,
    fields=False,
    threads=1,
)

index_ref = indexer.index(corpus_dataframe.to_dict(orient="records"))

# Open the index to ensure it is valid
index = pt.IndexFactory.of(index_ref)

# Print a simple summary
print("Index location:", index_path)
print("Indexed documents:", index.getCollectionStatistics().getNumberOfDocuments())

c:\Users\Luigi\ALL PROGRAMMING\VS_CODE_PROJECTS\IR PROJECT\IR-RS_Project\terrier_index
pisello


Java started (triggered by TerrierIndexer.__init__) and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


20:12:57.420 [main] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
Index location: c:\Users\Luigi\ALL PROGRAMMING\VS_CODE_PROJECTS\IR PROJECT\IR-RS_Project\terrier_index
Indexed documents: 131921


In [8]:
##Printing the files related to the index
print("Index files:")
!dir terrier_index\ /Q /S

Index files:
 Il volume nell'unit� C � Windows
 Numero di serie del volume: 76CA-2D63

 Directory di c:\Users\Luigi\ALL PROGRAMMING\VS_CODE_PROJECTS\IR PROJECT\IR-RS_Project\terrier_index

15/12/2025  20:12    <DIR>          WIN-7E2RNL78PDD\Luigi  .
15/12/2025  20:12    <DIR>          WIN-7E2RNL78PDD\Luigi  ..
15/12/2025  20:12        20.349.218 WIN-7E2RNL78PDD\Luigi  data.direct.bf
15/12/2025  20:12         2.242.657 WIN-7E2RNL78PDD\Luigi  data.document.fsarrayfile
15/12/2025  20:12        16.490.981 WIN-7E2RNL78PDD\Luigi  data.inverted.bf
15/12/2025  20:12        20.351.556 WIN-7E2RNL78PDD\Luigi  data.lexicon.fsomapfile
15/12/2025  20:12             1.017 WIN-7E2RNL78PDD\Luigi  data.lexicon.fsomaphash
15/12/2025  20:12           946.584 WIN-7E2RNL78PDD\Luigi  data.lexicon.fsomapid
15/12/2025  20:12        19.260.466 WIN-7E2RNL78PDD\Luigi  data.meta-0.fsomapfile
15/12/2025  20:12         1.055.368 WIN-7E2RNL78PDD\Luigi  data.meta.idx
15/12/2025  20:12         4.879.980 WIN-7E2RNL78PDD

In [9]:
# Retrieve collection statistics
stats = index.getCollectionStatistics()

print("Terrier Collection Statistics")
print("--------------------------------")
print(f"Indexed documents:        {stats.getNumberOfDocuments()}")
print(f"Unique terms (vocabulary): {stats.getNumberOfUniqueTerms()}")
print(f"Total tokens:             {stats.getNumberOfTokens()}")
print(f"Average document length:  {stats.getAverageDocumentLength():.2f}")

Terrier Collection Statistics
--------------------------------
Indexed documents:        131921
Unique terms (vocabulary): 236646
Total tokens:             15575099
Average document length:  118.06


In [11]:
display(corpus_dataframe)

for kv in index.getLexicon():
  print("%s -> %s" % (kv.getKey(), kv.getValue().toString()) )

Unnamed: 0,docno,text
0,New_Hampshire_18070804_1,Aiscellaneous Repository. From the Albany Regi...
1,New_Hampshire_18070804_4,Surely he above the rest of his fellow mortals...
2,New_Hampshire_18070804_5,"At Westmoreland, Mrs. Sally Lincoln, wife of M..."
3,New_Hampshire_18070804_8,Upon the correction of this remedy the stomach...
4,New_Hampshire_18070804_9,"Also FOR SALE AS ABOVE, NEW GOODS, STEPHEN HAR..."
...,...,...
131916,Nebraska_19130626_7,"""Did you?” said Fran politely. “So father grad..."
131917,Indiana_19170719_6,"When a boy begins to learn a trade, the ""play ..."
131918,Kentucky_19110727_5,It is situated in the valley of the great many...
131919,Rhode_Island_19140626_10,"A PRACTICAL LESSON IN AGRICULTURE, MAY 1708, T..."


0 -> term675 Nt=2246 TF=7312 maxTF=2147483647 @{0 0 0}
00 -> term2976 Nt=8162 TF=30048 maxTF=2147483647 @{0 3521 4}
000 -> term1606 Nt=16804 TF=43212 maxTF=2147483647 @{0 13025 0}
0001 -> term35405 Nt=7 TF=7 maxTF=2147483647 @{0 27915 4}
0002 -> term172507 Nt=1 TF=1 maxTF=2147483647 @{0 27940 4}
0005 -> term172506 Nt=1 TF=1 maxTF=2147483647 @{0 27944 6}
0006 -> term172505 Nt=1 TF=1 maxTF=2147483647 @{0 27949 0}
0007 -> term12591 Nt=1 TF=1 maxTF=2147483647 @{0 27953 2}
0009 -> term172504 Nt=1 TF=1 maxTF=2147483647 @{0 27956 0}
000f -> term56780 Nt=6 TF=9 maxTF=2147483647 @{0 27960 2}
000l -> term5346 Nt=10 TF=15 maxTF=2147483647 @{0 27980 7}
000lb -> term55461 Nt=1 TF=1 maxTF=2147483647 @{0 28003 2}
000potect -> term110147 Nt=1 TF=1 maxTF=2147483647 @{0 28007 0}
000qr -> term197774 Nt=1 TF=2 maxTF=2147483647 @{0 28011 0}
000th -> term152920 Nt=5 TF=5 maxTF=2147483647 @{0 28015 3}
001 -> term13033 Nt=100 TF=111 maxTF=2147483647 @{0 28033 3}
0010 -> term102486 Nt=2 TF=3 maxTF=2147483647 @

KeyboardInterrupt: 

In [32]:
from pyterrier.measures import *


In [33]:
tfidf = pt.terrier.Retriever(index, wmodel="TF_IDF")

bm25 = pt.terrier.Retriever(index, wmodel="BM25")

pt.Experiment(
    [tfidf, bm25],
    test_queries,
    test_qrels,
    eval_metrics=[AP@1000,P@5,P@10],
    names=["TF_IDF", "BM25"]
)

Unnamed: 0,name,P@5,P@10,AP@1000
0,TF_IDF,0.15916,0.08404,0.708384
1,BM25,0.15916,0.08404,0.708274
