# MS MARCO Data Download and Processing

This notebook handles both downloading and processing MS MARCO data for our Learning to Rank project:

1. Downloads raw data from HuggingFace (if not already downloaded)
2. Processes the data into JSON format for training
3. Deduplicates passages to ensure each unique passage text is assigned only one ID

**Note**: If you already have the required files in the data/raw directory, the download step will be skipped.

In [13]:
import pandas as pd
import json
import requests
import shutil
import tempfile
import os
from pathlib import Path
from tqdm import tqdm

In [14]:
# Configuration
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"

# Create directories if they don't exist
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Dataset files
DATASETS = ["train", "validation", "test"]
DATASET_FILES = {dataset: RAW_DIR / f"{dataset}-00000-of-00001.parquet" for dataset in DATASETS}

# Sample size for development (set to None for full dataset)
SAMPLE_SIZE = None  # Example: 1000

In [15]:
# Download function
def download_ms_marco_data():
    """Download MS MARCO dataset files from HuggingFace if they don't already exist."""
    print("Checking for MS MARCO dataset files...")
    
    # Check which files need to be downloaded
    missing_files = [f for dataset, f in DATASET_FILES.items() if not f.exists()]
    
    if not missing_files:
        print("All dataset files already exist! Skipping download.")
        return True
    
    print(f"Need to download {len(missing_files)} files.")
    
    # Download missing files
    for dataset, file_path in DATASET_FILES.items():
        if file_path.exists():
            print(f"  {dataset} dataset already exists, skipping download")
            continue
        
        try:
            print(f"  Downloading {dataset} dataset...")
            
            # Direct URL to the dataset file
            url = f"https://huggingface.co/datasets/microsoft/ms_marco/resolve/main/v1.1/{dataset}-00000-of-00001.parquet"
            
            # Download with progress bar
            response = requests.get(url, stream=True)
            response.raise_for_status()  # Raise exception for HTTP errors
            
            # Get total file size for progress bar
            total_size = int(response.headers.get('content-length', 0))
            
            # Use tqdm for a progress bar
            progress_bar = tqdm(total=total_size, unit='B', unit_scale=True, desc=f"  {dataset}")
            
            # Save the file
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
                        progress_bar.update(len(chunk))
            
            progress_bar.close()
            print(f"  {dataset} dataset downloaded successfully")
            
        except Exception as e:
            print(f"Error downloading {dataset} dataset: {e}")
            return False
    
    # Clean up any cache folders that might have been created
    cleanup_paths = [
        Path(".cache"),
        Path(tempfile.gettempdir()) / "huggingface"
    ]
    
    for path in cleanup_paths:
        if path.exists() and path.is_dir():
            try:
                print(f"Cleaning up cache directory: {path}")
                shutil.rmtree(path)
            except Exception as e:
                print(f"Warning: Could not clean up {path}: {e}")
    
    print("Download complete!")
    return True

# Run download
download_successful = download_ms_marco_data()

Checking for MS MARCO dataset files...
All dataset files already exist! Skipping download.


In [16]:
# Verify the download was successful
if not download_successful:
    raise Exception("Download failed. Please check the error messages above.")

# Set train file for processing
train_file = DATASET_FILES["train"]

# Load data
print(f"Loading data from {train_file}...")
df = pd.read_parquet(train_file)
if SAMPLE_SIZE is not None:
    df = df.head(SAMPLE_SIZE)
print(f"Loaded {len(df)} rows")

Loading data from ../data/raw/train-00000-of-00001.parquet...
Loaded 82326 rows


In [17]:
# Process data with deduplication
queries = {}
passages = {}
matches = {}
passage_id_counter = 0

# Dictionary to track text to ID mapping for deduplication
text_to_id = {}

# Stats for reporting
duplicate_count = 0
total_passages_processed = 0

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing data"):
    query_id = str(row['query_id'])
    query_text = row['query']
    
    # Add query to queries dictionary
    queries[query_id] = query_text
    
    # Process passages
    passage_texts = row['passages']['passage_text']
    is_selected = row['passages']['is_selected']
    total_passages_processed += len(passage_texts)
    
    # Find selected passage index
    selected_idx = None
    for i, val in enumerate(is_selected):
        if val == 1:
            selected_idx = i
            break
    
    if selected_idx is not None:
        # We'll collect passage IDs for this query
        query_passage_ids = []
        
        # Process each passage for this query
        for i, text in enumerate(passage_texts):
            # Check if we've seen this passage before
            if text in text_to_id:
                # Reuse existing ID
                pid = text_to_id[text]
                duplicate_count += 1
            else:
                # Create new ID for this unique passage
                pid = f"p{passage_id_counter}"
                passage_id_counter += 1
                
                # Store in our mapping and passages dictionary
                text_to_id[text] = pid
                passages[pid] = text
            
            # Add this passage ID to the query's passage list
            query_passage_ids.append(pid)
        
        # Create match entry - using deduplicated IDs
        matches[query_id] = {
            "suggested": query_passage_ids,
            "selected": query_passage_ids[selected_idx]
        }

# Print statistics
print(f"Processed {len(queries)} queries, {len(passages)} unique passages, and {len(matches)} matches")
print(f"Found {duplicate_count} duplicate passages out of {total_passages_processed} total passages")
print(f"Deduplication rate: {duplicate_count / total_passages_processed * 100:.2f}%")

Processing data: 100%|██████████| 82326/82326 [00:07<00:00, 11401.14it/s]

Processed 82326 queries, 608265 unique passages, and 79704 matches
Found 47178 duplicate passages out of 676193 total passages
Deduplication rate: 6.98%





In [18]:
# Remove existing files if they exist to ensure clean data
for filename in ["queries.json", "passages.json", "matches.json"]:
    filepath = PROCESSED_DIR / filename
    if filepath.exists():
        print(f"Removing existing file: {filepath}")
        os.remove(filepath)

# Save data to JSON files
print("Saving data to JSON files...")

with open(PROCESSED_DIR / "queries.json", 'w') as f:
    json.dump(queries, f)
    
with open(PROCESSED_DIR / "passages.json", 'w') as f:
    json.dump(passages, f)
    
with open(PROCESSED_DIR / "matches.json", 'w') as f:
    json.dump(matches, f)
    
print(f"Data saved to:\n- {PROCESSED_DIR / 'queries.json'}\n- {PROCESSED_DIR / 'passages.json'}\n- {PROCESSED_DIR / 'matches.json'}")

Saving data to JSON files...
Data saved to:
- ../data/processed/queries.json
- ../data/processed/passages.json
- ../data/processed/matches.json
