In [None]:
# Dask is more efficient than pandas
!pip install tqdm spacy dask[dataframe] 


In [None]:
import os
import csv
import pandas as pd
import dask.dataframe as dd
import spacy
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive into the Colab filesystem so that DATA_DIR (which lives
# under the mount point) can be read from and written to.
drive.mount('/content/drive')

# Constants
# Directory that holds the input collection and receives the output CSV.
DATA_DIR = '/content/drive/MyDrive/MSMARCO/'
# Tab-separated input file (read with sep='\t' in read_collection).
COLLECTION_FILE = 'msmarco-docs.tsv'
# Name of the CSV file produced by write_to_csv.
OUTPUT_FILE = 'Passage_Collection.csv'
# Maximum number of non-whitespace tokens kept per document.
MAX_TOKENS = 500

In [None]:
def load_tokenizer():
    """Load the spaCy English pipeline reduced to tokenization only.

    All pipeline components are excluded, including ``tok2vec``: its output is
    only consumed by the (also excluded) tagger/parser/NER, so leaving it in
    would run a neural network over every document for no benefit. The token
    boundaries, ``token.text`` and ``token.is_space`` come from the tokenizer
    itself and are unaffected.

    Returns:
        The loaded spaCy ``Language`` object; calling it on a string returns a
        ``Doc`` that can be iterated token by token.
    """
    nlp = spacy.load("en_core_web_sm", exclude=[
        "tok2vec", "ner", "tagger", "parser", "lemmatizer", "textcat",
        "attribute_ruler"
    ])
    # Raise spaCy's per-text character limit so that very long documents are
    # accepted instead of being rejected with a ValueError.
    nlp.max_length = 2500000
    return nlp

In [None]:
def read_collection(data_dir, filename):
    """Load the document id and text columns of the tab-separated collection.

    Args:
        data_dir: Directory containing the collection file.
        filename: Name of the header-less, tab-separated file.

    Returns:
        The fully materialised result of the dask read, with file columns
        0 and 3 exposed as ``docid`` and ``doc_text``.
    """
    source_path = os.path.join(data_dir, filename)
    # NOTE(review): default CSV quoting is in effect; if the text column can
    # contain stray double quotes, quoting=csv.QUOTE_NONE may be needed —
    # confirm against the data.
    lazy_frame = dd.read_csv(
        source_path,
        sep='\t',
        header=None,
        usecols=[0, 3],
        names=['docid', 'doc_text'],
    )
    # .compute() evaluates the lazy dask frame and loads every row into memory.
    return lazy_frame.compute()

In [None]:
def process_collection(collection, tokenizer, max_tokens):
    """Truncate every document to its first ``max_tokens`` non-whitespace tokens.

    Args:
        collection: DataFrame with ``docid`` and ``doc_text`` columns.
        tokenizer: Callable mapping a string to an iterable of token objects
            that expose ``.text`` and ``.is_space`` (e.g. a spaCy pipeline).
        max_tokens: Maximum number of non-whitespace tokens kept per document.

    Returns:
        A list with one ``[docid, truncated_text]`` pair per row, in row
        order. Kept tokens are joined with single spaces. Rows whose text is
        missing (NaN/None) produce an empty string rather than the literal
        text "nan".
    """
    passage_collection = []
    # Walk the two columns directly; building a row Series with .iloc[i]
    # (twice per document) is far slower on a large collection.
    rows = zip(collection['docid'], collection['doc_text'])
    for docid, doc_text in tqdm(rows, total=len(collection)):
        kept_tokens = []
        # A missing text field is read as NaN; str(NaN) would otherwise be
        # tokenized and stored as the passage "nan".
        if not pd.isna(doc_text):
            for token in tokenizer(str(doc_text)):
                if len(kept_tokens) == max_tokens:
                    break
                if not token.is_space:
                    kept_tokens.append(token.text)  # whitespace tokens do not count
        passage_collection.append([docid, ' '.join(kept_tokens)])
    return passage_collection


In [None]:
def write_to_csv(data, data_dir, filename):
    """Save the processed rows as a CSV file with a ``docid,doc_text`` header.

    Args:
        data: Iterable of rows, each an iterable of field values.
        data_dir: Directory in which the file is created.
        filename: Name of the CSV file; an existing file is overwritten.
    """
    header = ['docid', 'doc_text']
    destination = os.path.join(data_dir, filename)
    # newline='' lets the csv module control line endings itself.
    with open(destination, 'w', encoding="utf8", newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for record in data:
            csv_out.writerow(record)


In [None]:
# End-to-end run: load the tokenizer, read the collection from DATA_DIR,
# truncate each document to MAX_TOKENS tokens, and write the result back to
# DATA_DIR as OUTPUT_FILE.
tokenizer = load_tokenizer()
collection = read_collection(DATA_DIR, COLLECTION_FILE)
# Tokenizes every document in the collection; a tqdm progress bar is shown.
passage_collection = process_collection(collection, tokenizer, MAX_TOKENS)
write_to_csv(passage_collection, DATA_DIR, OUTPUT_FILE)