# Simple MS MARCO Data Processing

This notebook converts the MS MARCO training data from its parquet file into three JSON lookup files (`queries.json`, `passages.json`, and `matches.json`) for our Learning to Rank project.

In [1]:
import pandas as pd
import json
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
# Configuration: every path hangs off DATA_DIR, which is relative to the
# notebook's working directory.
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Input files
train_file = RAW_DIR.joinpath("train-00000-of-00001.parquet")

# Sample size for development (set to None for full dataset)
SAMPLE_SIZE = None  # Example: 1000

# Create the output directory up front so later cells can write into it;
# exist_ok makes re-running this cell harmless.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Load data: read the whole parquet file, then optionally keep only the first
# SAMPLE_SIZE rows for quicker development runs.
print(f"Loading data from {train_file}...")
df = pd.read_parquet(train_file)
df = df if SAMPLE_SIZE is None else df.head(SAMPLE_SIZE)
print(f"Loaded {len(df)} rows")

Loading data from ../data/raw/train-00000-of-00001.parquet...
Loaded 82326 rows


In [4]:
# Look at a sample to understand the structure: show the first row's query
# fields plus the keys and length of its nested passages record.
sample = df.iloc[0]
print(
    f"Query ID: {sample['query_id']}",
    f"Query text: {sample['query']}",
    "Passages structure:",
    sample['passages'].keys(),
    f"Number of passages: {len(sample['passages']['passage_text'])}",
    sep="\n",
)

Query ID: 19699
Query text: what is rba
Passages structure:
dict_keys(['is_selected', 'passage_text', 'url'])
Number of passages: 10


In [5]:
# Process data
#
# Builds three lookup tables from the loaded frame:
#   queries  : query id (as str) -> query text
#   passages : generated passage id ("p<N>") -> passage text
#   matches  : query id (as str) -> {"suggested": [passage ids],
#                                     "selected": passage id}
#
# Every row is recorded in `queries`. Rows in which no passage is flagged
# is_selected == 1 are otherwise skipped: their passages are not stored and
# they get no entry in `matches`, so `matches` can be smaller than `queries`.
queries = {}
passages = {}
matches = {}
passage_id_counter = 0

# Walk the three needed columns in lockstep rather than using df.iterrows():
# iterrows() materialises a Series for every row, which is needlessly slow
# here, and the values produced are the same.
rows = zip(df['query_id'], df['query'], df['passages'])

for raw_query_id, query_text, row_passages in tqdm(rows, total=len(df), desc="Processing data"):
    query_id = str(raw_query_id)

    # Add query to queries dictionary
    queries[query_id] = query_text

    passage_texts = row_passages['passage_text']
    is_selected = row_passages['is_selected']

    # Index of the first passage flagged as selected; any later flags in the
    # same row are ignored.
    selected_idx = next(
        (i for i, flag in enumerate(is_selected) if flag == 1),
        None,
    )
    if selected_idx is None:
        continue

    # Passage ids are globally unique: each row takes the next contiguous
    # range of counter values.
    passage_ids = [f"p{passage_id_counter + i}" for i in range(len(passage_texts))]
    passages.update(zip(passage_ids, passage_texts))

    matches[query_id] = {
        "suggested": passage_ids,
        "selected": passage_ids[selected_idx]
    }

    # Advance the counter only for rows whose passages were actually stored.
    passage_id_counter += len(passage_texts)

# Print statistics
print(f"Processed {len(queries)} queries, {len(passages)} passages, and {len(matches)} matches")

Processing data:   0%|          | 0/82326 [00:00<?, ?it/s]

Processed 82326 queries, 655443 passages, and 79704 matches


In [6]:
# Save data to JSON files: each lookup table is written to its own file
# under PROCESSED_DIR.
print("Saving data to JSON files...")

for filename, table in (
    ("queries.json", queries),
    ("passages.json", passages),
    ("matches.json", matches),
):
    with open(PROCESSED_DIR / filename, 'w') as f:
        json.dump(table, f)

print(f"Data saved to:\n- {PROCESSED_DIR / 'queries.json'}\n- {PROCESSED_DIR / 'passages.json'}\n- {PROCESSED_DIR / 'matches.json'}")

Saving data to JSON files...
Data saved to:
- ../data/processed/queries.json
- ../data/processed/passages.json
- ../data/processed/matches.json


In [None]:
# Generate a sample triplet
def generate_sample_triplet(match_table=None, query_table=None, passage_table=None):
    """Build one (query, positive, negative) example from the processed data.

    The choice is deterministic, not random: queries are visited in the
    iteration order of the match table, and the first query that has at least
    one non-selected passage is used. Its selected passage is the positive and
    its first non-selected suggested passage is the negative.

    Args:
        match_table: Mapping of query id -> {"suggested": [passage ids],
            "selected": passage id}. Defaults to the notebook-level
            ``matches``.
        query_table: Mapping of query id -> query text. Defaults to the
            notebook-level ``queries``.
        passage_table: Mapping of passage id -> passage text. Defaults to the
            notebook-level ``passages``.

    Returns:
        A dict with the keys "query", "positive" and "negative" holding the
        corresponding texts, or None when the match table is empty or no
        query has a passage other than its selected one.
    """
    # Fall back to the notebook globals so a bare call keeps working.
    if match_table is None:
        match_table = matches
    if query_table is None:
        query_table = queries
    if passage_table is None:
        passage_table = passages

    # Iterating the mapping directly avoids copying every key into a list
    # just to read the first one.
    for query_id, match in match_table.items():
        positive_id = match["selected"]
        negative_id = next(
            (pid for pid in match["suggested"] if pid != positive_id),
            None,
        )
        if negative_id is None:
            # This query only has its positive passage; try the next query
            # instead of giving up.
            continue

        return {
            "query": query_table[query_id],
            "positive": passage_table[positive_id],
            "negative": passage_table[negative_id]
        }

    return None

# Show the sample triplet, shortening long passages to their first 100
# characters followed by "..." so the output stays readable.
triplet = generate_sample_triplet()
if triplet:
    pos_text = triplet['positive'] if len(triplet['positive']) <= 100 else triplet['positive'][:100] + "..."
    neg_text = triplet['negative'] if len(triplet['negative']) <= 100 else triplet['negative'][:100] + "..."

    print("Sample triplet:")
    print(f"Query: {triplet['query']}")
    print(f"Positive: {pos_text}")
    print(f"Negative: {neg_text}")