# Wrangle MS-MARCO for Relevancy Classifications

The purpose of this notebook is both to transform the MS-MARCO dataset into a relevancy classification format and to document the quirks of the original data format. MS-MARCO is a benchmark dataset used to evaluate information retrieval systems.

In [None]:
import json

import pandas as pd
from datasets import load_dataset

# Raise the per-cell display width to 1000 characters so long text values are
# not truncated when DataFrames are rendered (presumably for passage text).
pd.set_option("display.max_colwidth", 1000)

In [None]:
# Dataset configuration and split to wrangle; both are reused later when
# naming the output file.
version = "v1.1"
split = "train"
dataset = load_dataset(path="ms_marco", name=version, split=split)

In [None]:
# Convert the loaded dataset to a pandas DataFrame and preview the first rows
# in their original layout.
raw_df = dataset.to_pandas()
raw_df.head()

The "wellFormedAnswers" column contains only empty lists. This column is omitted from the final dataset.

In [None]:
# Tally how many rows have each "wellFormedAnswers" list length.
well_formed_lengths = raw_df["wellFormedAnswers"].apply(len)
well_formed_lengths.value_counts()

Explode each row of the original dataset so that the new dataset contains one row per query-context pair.

In [None]:
# Flat column accumulators: every passage of every query appends exactly one
# entry to each list, so all columns stay aligned row-for-row.
exploded_columns = {
    "query_id": [],
    "query_text": [],
    "query_type": [],
    "relevant": [],
    "document_text": [],
    "document_url": [],
    "reference_responses": [],
}
for example in dataset:
    passages = example["passages"]
    relevance_flags = [bool(flag) for flag in passages["is_selected"]]
    passage_texts = passages["passage_text"]
    passage_urls = passages["url"]
    # The per-passage fields are parallel lists and must have equal lengths.
    assert len(relevance_flags) == len(passage_texts) == len(passage_urls)
    num_passages = len(relevance_flags)

    # Per-passage fields are appended as-is.
    exploded_columns["relevant"].extend(relevance_flags)
    exploded_columns["document_text"].extend(passage_texts)
    exploded_columns["document_url"].extend(passage_urls)

    # Query-level fields are repeated once per passage of that query.
    exploded_columns["query_id"].extend([example["query_id"]] * num_passages)
    exploded_columns["query_text"].extend([example["query"]] * num_passages)
    exploded_columns["query_type"].extend([example["query_type"]] * num_passages)
    exploded_columns["reference_responses"].extend([example["answers"]] * num_passages)

df = pd.DataFrame(exploded_columns)
df

Compare the column names of the original dataset with the columns of the wrangled dataset.

In [None]:
# Columns present in the raw dataset but absent from the wrangled one.
set(raw_df.columns) - set(df.columns)

In [None]:
# Columns introduced by the wrangling step that the raw dataset does not have.
set(df.columns) - set(raw_df.columns)

In [None]:
# Keep only the query, the document, and the boolean relevance label.
classification_columns = [
    "query_id",
    "query_text",
    "document_text",
    "document_url",
    "relevant",
]
binary_relevance_classification_df = df[classification_columns]
binary_relevance_classification_df.head()

Write the data to a JSONL file.

In [None]:
# Serialize one JSON object per line; the filename encodes the dataset
# version and split.
data_path = f"ms_marco-{version}-{split}.jsonl"
records = binary_relevance_classification_df.to_dict(orient="records")
with open(data_path, "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in records)