# Title Index with Pre-computed Phrases

**Important: DO NOT CLEAR THE OUTPUT OF THIS NOTEBOOK AFTER EXECUTION!!!**

This notebook builds a Title inverted index using:
- Pre-computed phrases from `gs://db204905756/phrases/strong_phrases.pkl`
- Pre-computed title doc lengths from `gs://db204905756/title_stemmed/title_doc_lengths.pickle`

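**Output folders:** posting lists are written to `title_postingsPhrases_gcp/`; the index pickle and its supporting files are written to `title_stemmed_phrases_idx/`.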

## Setup & Imports

In [None]:
# Install the Cloud Storage client (pinned) that the notebook uses to talk to GCS.
!pip install -q google-cloud-storage==1.43.0
# graphframes is star-imported further down.
# NOTE(review): this install is unpinned — consider pinning a version for reproducibility.
!pip install -q graphframes

In [None]:
# Check cluster status: list the Dataproc clusters in region us-central1.
!gcloud dataproc clusters list --region us-central1

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
from time import time
from pathlib import Path
import pickle
from google.cloud import storage
from contextlib import closing
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import hashlib

def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be read later.
nltk.download('stopwords')

In [None]:
# Check graphframes jar: list any graph* jars in Spark's jar directory.
!ls -l /usr/lib/spark/jars/graph*

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
# Display the `spark` session object. It is never created in this notebook,
# so it is presumably injected by the cluster's PySpark kernel — TODO confirm.
spark

In [None]:
# Bucket configuration
bucket_name = 'db204905756'
full_path = f"gs://{bucket_name}/"
paths = []  # gs:// URIs of every blob whose name contains "parquet"

client = storage.Client()
# Lists every blob in the bucket (no prefix filter is applied).
blobs = client.list_blobs(bucket_name)
for b in blobs:
    # Substring match on the whole blob name, not an extension check.
    # NOTE(review): this would also pick up any non-data blob whose path
    # happens to contain "parquet" — confirm that cannot occur in this bucket.
    if "parquet" in b.name:
        paths.append(full_path + b.name)

print(f"Found {len(paths)} parquet files")

## Stopwords & Stemmer Setup

In [None]:
# Initialize Porter Stemmer
STEMMER = PorterStemmer()

# NLTK's English stopword list, frozen so it can be shared safely.
english_stopwords = frozenset(stopwords.words('english'))
# Extra words this notebook treats as stopwords in addition to the English list.
corpus_stopwords = ["category", "references", "also", "external", "links", 
                    "may", "first", "see", "history", "people", "one", "two", 
                    "part", "thumb", "including", "second", "following", 
                    "many", "however", "would", "became"]

# Union of both lists; the result is a frozenset.
all_stopwords = english_stopwords.union(corpus_stopwords)
# Token pattern: one leading '#', '@' or word character, followed by 2 to 24
# repetitions of (optional quote/hyphen + word character). A match therefore
# contains at least three word-like characters.
RE_WORD = re.compile(r"""[\#\@\w](['"-]?\w){2,24}""", re.UNICODE)

print(f"Total stopwords: {len(all_stopwords)}")

## Load Data - TITLE

In [None]:
# Load parquet files
parquetFile = spark.read.parquet(*paths)

# Select TITLE (not body text!)
# Each RDD element is a Row of (title, id): index 0 is the title, index 1 the document id.
doc_title_pairs = parquetFile.select("title", "id").rdd

# count() is a Spark action, so this line runs a job on the cluster.
print(f"Total documents: {parquetFile.count():,}")

In [None]:
# Preview sample: take(5) brings five rows back to the driver.
sample = doc_title_pairs.take(5)
for s in sample:
    # Rows were selected as (title, id), so s[0] is the title and s[1] the id.
    print(f"ID: {s[1]}, Title: {s[0]}")

## Load Pre-computed Phrases from GCS

**No need to calculate PMI - we use existing phrases!**

In [None]:
# Download strong_phrases.pkl from GCS
print("Loading pre-computed phrases from GCS...")
!gsutil cp gs://db204905756/phrases/strong_phrases.pkl strong_phrases.pkl

# Load the phrases
with open('strong_phrases.pkl', 'rb') as f:
    strong_phrases = pickle.load(f)

print(f"✅ Loaded {len(strong_phrases):,} pre-computed phrases!")

# Show some examples
print("\nSample phrases:")
for i, phrase in enumerate(list(strong_phrases)[:10]):
    print(f"  {phrase[0]}_{phrase[1]}")

## Load Pre-computed Title Doc Lengths

**No need to recalculate - we use existing doc lengths!**

In [None]:
# Load existing title doc lengths
import builtins  # local import: needed to reach the real sum(), see below

print("Loading pre-computed title doc lengths from GCS...")

bucket = client.bucket(bucket_name)
blob = bucket.blob('title_stemmed/title_doc_lengths.pickle')
contents = blob.download_as_bytes()
# NOTE(review): pickle.loads can execute arbitrary code; only load pickles
# that come from a bucket you trust.
# NOTE(review): these lengths live under title_stemmed/; confirm they were
# computed with the same phrase-merging tokenizer used for this index, since
# merging bigrams into one token changes per-title token counts.
title_doc_lengths_dict = pickle.loads(contents)

print(f"✅ Loaded {len(title_doc_lengths_dict):,} title doc lengths")

# Calculate average
# `from pyspark.sql.functions import *` (executed in an earlier cell) rebinds
# the name `sum` to Spark's column aggregate, which cannot add Python numbers.
# Call the builtin explicitly so the average is computed on the driver.
num_titles = len(title_doc_lengths_dict)
# Guard against an empty mapping to avoid ZeroDivisionError.
avg_title_len = (builtins.sum(title_doc_lengths_dict.values()) / num_titles) if num_titles else 0.0
print(f"Average title length: {avg_title_len:.2f} tokens")

## Tokenization Function (with Stemming + Phrases)

In [None]:
def tokenize_with_phrases_stemmed(text, phrases_set, stopwords_set):
    """
    Turn `text` into a list of stemmed tokens, fusing known bigrams.

    Tokens are extracted with RE_WORD and lowercased; tokens found in
    `stopwords_set` are dropped (the check is made on the unstemmed token),
    and the rest are stemmed with STEMMER. Scanning left to right, any pair of
    adjacent stems that appears in `phrases_set` as a tuple is replaced by the
    single token "first_second"; a merged pair is consumed, so its second
    stem cannot start another phrase.

    Returns an empty list when `text` is None.
    """
    if text is None:
        return []

    # Extract, filter and stem in a single pass
    stems = []
    for match in RE_WORD.finditer(text):
        word = match.group().lower()
        if word in stopwords_set:
            continue
        stems.append(STEMMER.stem(word))

    total = len(stems)
    if total <= 1:
        return stems

    # Greedy left-to-right phrase merge
    merged = []
    last_index = total - 1
    pos = 0
    while pos < total:
        current = stems[pos]
        if pos < last_index and (current, stems[pos + 1]) in phrases_set:
            following = stems[pos + 1]
            merged.append(f"{current}_{following}")
            pos += 2
        else:
            merged.append(current)
            pos += 1

    return merged

# Test: run the tokenizer once on a sample title, using the loaded phrases
# and the combined stopword set, and print the result for visual inspection.
test_title = "United States President"
test_result = tokenize_with_phrases_stemmed(test_title, strong_phrases, all_stopwords)
print(f"Input: {test_title}")
print(f"Output: {test_result}")

## Custom InvertedIndex with Configurable Folder

We define our own write functions so that posting lists are written to the `title_postingsPhrases_gcp/` folder instead of the default `postings_gcp/` folder.

In [None]:
# Configuration
POSTINGS_FOLDER = "title_postingsPhrases_gcp"  # <-- Custom folder name!
# Maximum number of bytes per .bin file. 1999998 = 6 * 333333, i.e. a whole
# number of TUPLE_SIZE-byte entries.
BLOCK_SIZE = 1999998
# Bytes per posting entry: (doc_id << 16 | tf) is serialized big-endian into 6 bytes.
TUPLE_SIZE = 6
# Mask for the low 16 bits of an entry, which hold the term frequency.
TF_MASK = 2 ** 16 - 1

print(f"✅ Posting lists will be written to: gs://{bucket_name}/{POSTINGS_FOLDER}/")

In [None]:
class MultiFileWriter:
    """Sequential binary writer to multiple files - writes to custom folder.

    Data is appended to local files named ``{name}_{i:03}.bin`` under
    ``base_dir``, each holding at most ``BLOCK_SIZE`` bytes. When a write
    finds the current file full, that file is closed and uploaded to
    ``{folder_name}/`` in the bucket and the next file is started. The last
    (possibly partial) file is not uploaded automatically: the caller must
    call ``upload_to_gcp()`` for it.
    """
    def __init__(self, base_dir, name, bucket_name, folder_name):
        """
        Args:
            base_dir: Local directory in which the .bin files are created.
            name: Prefix of the generated file names.
            bucket_name: Name of the GCS bucket to upload to.
            folder_name: Folder (blob-name prefix) inside the bucket.
        """
        self._base_dir = Path(base_dir)
        self._name = name
        self._folder_name = folder_name
        # Lazy generator: a file is only created when next() is called on it.
        self._file_gen = (open(self._base_dir / f'{name}_{i:03}.bin', 'wb') 
                          for i in itertools.count())
        self._f = next(self._file_gen)
        self.client = storage.Client()
        self.bucket = self.client.bucket(bucket_name)
        
    def write(self, b):
        """Append the bytes `b`, rolling over to a new file when the current one is full.

        Returns:
            A list of (file_name, offset) tuples, one per chunk written, giving
            the file and the byte position at which each chunk starts.
        """
        locs = []
        while len(b) > 0:
            pos = self._f.tell()
            remaining = BLOCK_SIZE - pos
            if remaining == 0:  
                # Current file is full: close (flushes), upload, then start the next one.
                self._f.close()
                self.upload_to_gcp()                
                self._f = next(self._file_gen)
                pos, remaining = 0, BLOCK_SIZE
            self._f.write(b[:remaining])
            locs.append((self._f.name, pos))
            b = b[remaining:]
        return locs

    def close(self):
        """Close the current local file (does not upload it)."""
        self._f.close()
    
    def upload_to_gcp(self):
        """Upload the current local file to ``{folder_name}/{file_name}`` in the bucket."""
        # The file may still be open when callers upload the final block.
        # Flush first so bytes still sitting in Python's write buffer reach
        # the disk; otherwise the uploaded copy can be truncated.
        if not self._f.closed:
            self._f.flush()
        file_name = self._f.name
        # Use custom folder name!
        blob = self.bucket.blob(f"{self._folder_name}/{file_name}")
        blob.upload_from_filename(file_name)


def write_a_posting_list(b_w_pl, bucket_name, folder_name):
    """Write one bucket's posting lists to GCS with custom folder.

    Args:
        b_w_pl: Tuple ``(bucket_id, list_w_pl)`` where ``list_w_pl`` is an
            iterable of ``(token, posting_list)`` and each posting list is an
            iterable of ``(doc_id, tf)`` pairs.
        bucket_name: Name of the GCS bucket to upload to.
        folder_name: Folder (blob-name prefix) inside the bucket.

    Returns:
        The ``bucket_id`` that was written.

    Side effects:
        Creates ``{bucket_id}_NNN.bin`` files and
        ``{bucket_id}_posting_locs.pickle`` in the current directory and
        uploads them under ``folder_name/``.
    """
    posting_locs = defaultdict(list)
    bucket_id, list_w_pl = b_w_pl
    
    with closing(MultiFileWriter(".", bucket_id, bucket_name, folder_name)) as writer:
        for w, pl in list_w_pl: 
            # Each entry packs doc_id in the high bits and tf (masked to 16 bits)
            # in the low bits, serialized as TUPLE_SIZE big-endian bytes.
            b = b''.join([(doc_id << 16 | (tf & TF_MASK)).to_bytes(TUPLE_SIZE, 'big')
                          for doc_id, tf in pl])
            locs = writer.write(b)
            posting_locs[w].extend(locs)
        # Close the last file BEFORE uploading it. Uploading while it is still
        # open can miss bytes held in the write buffer and produce a truncated
        # blob. Closing twice is harmless (the `with` block closes again).
        writer.close()
        writer.upload_to_gcp()
        
        # Upload posting locations pickle to custom folder
        locs_file_name = f"{bucket_id}_posting_locs.pickle"
        with open(locs_file_name, "wb") as f:
            pickle.dump(dict(posting_locs), f)
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(f"{folder_name}/{locs_file_name}")
        blob.upload_from_filename(locs_file_name)
        
    return bucket_id

print("✅ Custom MultiFileWriter defined")

In [None]:
sc = spark.sparkContext

# Broadcast phrases and stopwords to all workers
# The broadcast handles are read via `.value` inside the word-count function.
strong_phrases_broadcast = sc.broadcast(strong_phrases)
all_stopwords_broadcast = sc.broadcast(all_stopwords)

print(f"✅ Broadcasted {len(strong_phrases):,} phrases to all workers")

## Build Inverted Index Functions

In [None]:
NUM_BUCKETS = 124

def token2bucket_id(token):
    """Return the bucket number for `token`: its hex hash read as an integer, modulo NUM_BUCKETS."""
    hash_as_int = int(_hash(token), 16)
    return hash_as_int % NUM_BUCKETS


def word_count_with_phrases_stemmed(title, doc_id):
    """
    Build the per-document postings for one title.

    The title is tokenized (stemming + phrase merging) using the broadcast
    phrase and stopword sets; the result is one ``(token, (doc_id, tf))``
    tuple per distinct token, where ``tf`` is the token's count in the title.
    """
    phrases = strong_phrases_broadcast.value
    stop_words = all_stopwords_broadcast.value
    term_freqs = Counter(tokenize_with_phrases_stemmed(title, phrases, stop_words))

    pairs = []
    for term, freq in term_freqs.items():
        pairs.append((term, (doc_id, freq)))
    return pairs


def reduce_word_counts(unsorted_pl):
    """Return the posting list as a new list ordered by its first element (doc_id)."""
    ordered = list(unsorted_pl)
    ordered.sort(key=lambda posting: posting[0])
    return ordered


def calculate_df(postings):
    """Map each (token, posting_list) element to (token, number of postings)."""
    def to_df_pair(entry):
        return (entry[0], len(entry[1]))

    return postings.map(to_df_pair)


def partition_postings_and_write(postings):
    """
    Group posting lists into hash buckets and write each bucket to GCS.

    Every (token, posting_list) element is keyed by token2bucket_id(token),
    grouped by that key, and each group is handed to write_a_posting_list,
    which writes into POSTINGS_FOLDER of the configured bucket. Returns an RDD
    of the bucket ids that were written; nothing runs until an action is
    invoked on it.
    """
    def key_by_bucket(token_and_postings):
        return (token2bucket_id(token_and_postings[0]), token_and_postings)

    def write_one_bucket(b_w_pl):
        bucket_id, word_posting_pairs = b_w_pl
        materialized = (bucket_id, list(word_posting_pairs))
        # Use custom folder!
        return write_a_posting_list(materialized, bucket_name, POSTINGS_FOLDER)

    grouped_by_bucket = postings.map(key_by_bucket).groupByKey()
    return grouped_by_bucket.map(write_one_bucket)

print(f"✅ Functions defined - posting lists will be written to: {POSTINGS_FOLDER}/")

## Build the Title Index

**Titles are short, so this step is much cheaper than indexing body text; expect roughly 30–60 minutes.**

In [None]:
%%time
print("Building inverted index with stemming and phrases for TITLES...")
t_start = time()

# Step 1: Word counts (stemmed + phrases)
# Emits (token, (doc_id, tf)) pairs for every title; x is a (title, id) row.
word_counts = doc_title_pairs.flatMap(lambda x: word_count_with_phrases_stemmed(x[0], x[1]))

# Step 2: Create posting lists
# Groups the pairs by token and sorts each token's posting list by doc_id.
postings = word_counts.groupByKey().mapValues(reduce_word_counts)

# Step 3: Filter rare terms (optional for titles, use lower threshold)
MIN_DF = 3  # Lower threshold for titles since they're short
# len(x[1]) is the posting-list length, i.e. the number of titles containing the token.
# NOTE(review): postings_filtered is not persisted although the write cell
# uses it again — consider whether caching it is worthwhile.
postings_filtered = postings.filter(lambda x: len(x[1]) >= MIN_DF)

# Step 4: Calculate df
w2df = calculate_df(postings_filtered)
# collectAsMap() is the action that actually runs the pipeline above and
# returns a {token: df} dict to the driver.
w2df_dict = w2df.collectAsMap()

print(f"\n✅ Total unique tokens (with df >= {MIN_DF}): {len(w2df_dict):,}")
print(f"Time: {(time() - t_start)/60:.2f} minutes")

In [None]:
%%time
# Write posting lists to GCS
print(f"Writing posting lists to {POSTINGS_FOLDER}/...")
# collect() is the action that triggers the per-bucket writes; the returned
# bucket ids are not needed, hence the throwaway name.
_ = partition_postings_and_write(postings_filtered).collect()
print("✅ Done writing posting lists!")

## Collect Posting Locations & Save Index

In [None]:
# Collect all posting list locations from our custom folder
super_posting_locs = defaultdict(list)

# The trailing "/" restricts the listing to this folder. A bare prefix would
# also match sibling folders whose names merely start with POSTINGS_FOLDER.
for blob in client.list_blobs(bucket_name, prefix=f"{POSTINGS_FOLDER}/"):
    # Only the per-bucket *_posting_locs.pickle files are of interest here;
    # the .bin files hold the posting data itself.
    if not blob.name.endswith("pickle"):
        continue
    # NOTE(review): pickle.load can execute arbitrary code; only read pickles
    # from a bucket you trust.
    with blob.open("rb") as f:
        posting_locs = pickle.load(f)
    # Merge this bucket's {token: [(file_name, offset), ...]} into the global map.
    for k, v in posting_locs.items():
        super_posting_locs[k].extend(v)

print(f"✅ Collected posting locations for {len(super_posting_locs):,} tokens")

In [None]:
# Create InvertedIndex class for saving
class InvertedIndex:
    """Minimal index container holding document frequencies and posting locations."""

    def __init__(self):
        # token -> document frequency
        self.df = dict()
        # token -> list of (file_name, offset) locations
        self.posting_locs = dict()

    def write_index(self, base_dir, name):
        """Pickle this whole object to ``{base_dir}/{name}.pkl``."""
        target = Path(base_dir, f'{name}.pkl')
        with target.open('wb') as out:
            pickle.dump(self, out)

# Create and save InvertedIndex
inverted = InvertedIndex()
# Plain dict copy of the collected locations (drops the defaultdict behaviour).
inverted.posting_locs = dict(super_posting_locs)
inverted.df = w2df_dict

# Save locally
# Writes ./index.pkl
inverted.write_index('.', 'index')

# Upload to GCS - to the index folder
index_src = "index.pkl"
index_dst = f'gs://{bucket_name}/title_stemmed_phrases_idx/index.pkl'
# $index_src / $index_dst are expanded by IPython from the Python variables above.
!gsutil cp $index_src $index_dst

print(f"\n✅ Index saved to {index_dst}")

In [None]:
# Copy title doc lengths to the new index folder
# (bucket-to-bucket copy; $bucket_name is expanded by IPython)
!gsutil cp gs://$bucket_name/title_stemmed/title_doc_lengths.pickle gs://$bucket_name/title_stemmed_phrases_idx/title_doc_lengths.pickle

# Copy phrases to the new index folder
!gsutil cp gs://$bucket_name/phrases/strong_phrases.pkl gs://$bucket_name/title_stemmed_phrases_idx/strong_phrases.pkl

print("✅ All supporting files copied!")

## Verify All Files

In [None]:
# List the index folder with sizes so the uploaded files can be checked by eye.
print("=" * 50)
print("Files in title_stemmed_phrases_idx/:")
print("=" * 50)
!gsutil ls -lh gs://$bucket_name/title_stemmed_phrases_idx/

In [None]:
# Count posting list files in the custom folder
print(f"\nPosting list files in {POSTINGS_FOLDER}/:")
!gsutil ls gs://$bucket_name/title_postingsPhrases_gcp/ | head -20
print("...")
!gsutil ls gs://$bucket_name/title_postingsPhrases_gcp/ | wc -l
print("total files")

## Summary

### Files Created:

| File | Location | Description |
|------|----------|-------------|
| index.pkl | title_stemmed_phrases_idx/ | Inverted index (posting locs + df) |
| title_doc_lengths.pickle | title_stemmed_phrases_idx/ | Document lengths for BM25 |
| strong_phrases.pkl | title_stemmed_phrases_idx/ | Set of detected phrases |
| *.bin | title_postingsPhrases_gcp/ | Binary posting list files |
| *_posting_locs.pickle | title_postingsPhrases_gcp/ | Posting location files |

In [None]:
# Final summary: vocabulary size comes from the df dict, the document count
# and average length from the pre-computed title doc lengths.
print("\n" + "="*50)
print("✅ Title Index with Phrases - COMPLETE!")
print("="*50)
print(f"\nVocabulary size: {len(w2df_dict):,}")
print(f"Documents: {len(title_doc_lengths_dict):,}")
print(f"Average title length: {avg_title_len:.2f}")
print(f"\nIndex location: gs://{bucket_name}/title_stemmed_phrases_idx/")
print(f"Posting lists: gs://{bucket_name}/{POSTINGS_FOLDER}/")