# Anchor Text Inverted Index

**Important: DO NOT CLEAR THE OUTPUT OF THIS NOTEBOOK AFTER EXECUTION!!!**

**Output folders:** posting lists are written to `anchor_postings_gcp/` and the index pickle to `anchor_index/` (neither overwrites the body index)

## Setup

In [None]:
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

In [None]:
!gcloud dataproc clusters list --region us-central1

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
from google.cloud import storage
from contextlib import closing

import hashlib
def _hash(s):
    """Return a short, stable hex digest for the string `s`.

    The text is encoded as UTF-8 and hashed with BLAKE2b using a 5-byte
    digest, so the result is always a 10-character hexadecimal string.
    """
    raw = bytes(s, 'utf8')
    digest = hashlib.blake2b(raw, digest_size=5)
    return digest.hexdigest()

nltk.download('stopwords')

In [None]:
!ls -l /usr/lib/spark/jars/graph*

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
spark

In [None]:
# ==================================
# BUCKET NAME
# ==================================
bucket_name = 'db204905756'

# Every blob whose name contains "parquet" is treated as an input shard and
# recorded as a full gs:// URI.
full_path = f"gs://{bucket_name}/"
client = storage.Client()
blobs = client.list_blobs(bucket_name)
paths = [full_path + blob.name for blob in blobs if "parquet" in blob.name]

print(f"Found {len(paths)} parquet files")

In [None]:
# Read all parquet shards into a single DataFrame, then keep only the two
# columns the anchor index needs, exposed as an RDD of rows.
parquetFile = spark.read.parquet(*paths)
doc_anchor_pairs = parquetFile.select("anchor_text", "id").rdd
total_docs = parquetFile.count()
print(f"Total documents: {total_docs:,}")

## Configuration & Custom Writer

**IMPORTANT:** We use a custom folder `anchor_postings_gcp/` to avoid overwriting the body index!

In [None]:
# ==================================
# CONFIGURATION
# ==================================
# Destination folder inside the bucket for the anchor posting files.
POSTINGS_FOLDER = "anchor_postings_gcp"
# Maximum number of bytes per .bin file (a whole multiple of TUPLE_SIZE).
BLOCK_SIZE = 1_999_998
# Number of bytes used to encode one (doc_id, tf) posting entry.
TUPLE_SIZE = 6
# Bit mask covering the low 16 bits, where tf is stored in an entry.
TF_MASK = (1 << 16) - 1

print(f"✅ Posting lists will be written to: gs://{bucket_name}/{POSTINGS_FOLDER}/")
print(f"   This will NOT overwrite your body index in postings_gcp/")

In [None]:
# Stopwords and setup
english_stopwords = frozenset(stopwords.words('english'))
# Extra corpus-specific words that are dropped along with the NLTK list.
corpus_stopwords = [
    'category', 'references', 'also', 'links', 'external',
    'first', 'see', 'new', 'two', 'list', 'may', 'one', 'district',
    'including', 'became', 'however', 'com', 'many', 'began',
    'make', 'made', 'part', 'would', 'people', 'second',
    'following', 'history', 'thumb',
]

all_stopwords = english_stopwords | frozenset(corpus_stopwords)
# Tokens are 3 to 25 characters: one leading char plus 2-24 further word chars,
# each optionally preceded by a quote or hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['"-]?\w){2,24}""", re.UNICODE)
STEMMER = PorterStemmer()
# Number of buckets that tokens are hashed into when writing posting files.
NUM_BUCKETS = 124

def token2bucket_id(token):
    """Map `token` to a bucket id in ``range(NUM_BUCKETS)`` via its hex hash."""
    hash_value = int(_hash(token), 16)
    return hash_value % NUM_BUCKETS

print(f"Total stopwords: {len(all_stopwords)}")

In [None]:
# ==================================
# CUSTOM MULTIFILEWRITER - writes to anchor_postings_gcp/
# ==================================

class MultiFileWriter:
    """Sequential binary writer that spills across numbered ``.bin`` files.

    Bytes are appended to ``<base_dir>/<name>_<NNN>.bin``. Once a file holds
    ``BLOCK_SIZE`` bytes it is closed and uploaded to ``<folder_name>/`` in
    the GCS bucket, and the next numbered file is opened.
    """

    def __init__(self, base_dir, name, bucket_name, folder_name):
        """Open the first output file and bind the destination bucket.

        Args:
            base_dir: Local directory in which the ``.bin`` files are created.
            name: Prefix of the generated file names.
            bucket_name: Name of the GCS bucket to upload to.
            folder_name: Folder (blob-name prefix) inside the bucket.
        """
        self._base_dir = Path(base_dir)
        self._name = name
        self._folder_name = folder_name
        # Lazy generator: a new file is only created when next() is called.
        self._file_gen = (open(self._base_dir / f'{name}_{i:03}.bin', 'wb')
                          for i in itertools.count())
        self._f = next(self._file_gen)
        self.client = storage.Client()
        self.bucket = self.client.bucket(bucket_name)

    def write(self, b):
        """Append the bytes `b`, rolling over to a new file when one is full.

        Args:
            b: Bytes to append.

        Returns:
            A list of ``(file_name, offset)`` pairs, one per file the data
            was written into, giving where each piece starts.
        """
        locs = []
        while len(b) > 0:
            pos = self._f.tell()
            remaining = BLOCK_SIZE - pos
            if remaining == 0:
                # Current file is full: finalize and upload it, then move on.
                self._f.close()
                self.upload_to_gcp()
                self._f = next(self._file_gen)
                pos, remaining = 0, BLOCK_SIZE
            self._f.write(b[:remaining])
            locs.append((self._f.name, pos))
            b = b[remaining:]
        return locs

    def close(self):
        """Close the current local file. This does not upload it."""
        self._f.close()

    def upload_to_gcp(self):
        """Upload the current file to ``<folder_name>/<file_name>``.

        The upload reads the file back from disk, so any bytes still held in
        the write buffer of an open file are flushed first; otherwise the
        uploaded copy could be missing the tail of the data.
        """
        if not self._f.closed:
            self._f.flush()
        file_name = self._f.name
        # Use custom folder name!
        blob = self.bucket.blob(f"{self._folder_name}/{file_name}")
        # The file name may be a Path object; pass a plain string to the client.
        blob.upload_from_filename(str(file_name))


def write_a_posting_list(b_w_pl, bucket_name, folder_name):
    """Write one bucket's posting lists to disk and upload them to GCS.

    Each posting ``(doc_id, tf)`` is packed into ``TUPLE_SIZE`` big-endian
    bytes as ``doc_id << 16 | tf``. The binary files and a pickle mapping each
    word to its ``(file_name, offset)`` locations are uploaded under
    ``folder_name`` in the bucket.

    Args:
        b_w_pl: Pair ``(bucket_id, list_w_pl)`` where ``list_w_pl`` is an
            iterable of ``(word, posting_list)`` pairs.
        bucket_name: Name of the destination GCS bucket.
        folder_name: Folder (blob-name prefix) inside the bucket.

    Returns:
        The ``bucket_id`` that was written.
    """
    bucket_id, list_w_pl = b_w_pl
    posting_locs = defaultdict(list)

    writer = MultiFileWriter(".", bucket_id, bucket_name, folder_name)
    with closing(writer):
        for w, pl in list_w_pl:
            # tf only has 16 bits; saturate at TF_MASK instead of letting a
            # large count wrap around to a small, wrong value.
            b = b''.join(
                (doc_id << 16 | min(tf, TF_MASK)).to_bytes(TUPLE_SIZE, 'big')
                for doc_id, tf in pl
            )
            posting_locs[w].extend(writer.write(b))

    # Upload the last (partially filled) file only after it has been closed,
    # so every buffered byte is on disk when the upload reads it back.
    writer.upload_to_gcp()

    # Upload posting locations pickle to custom folder
    locs_file = f"{bucket_id}_posting_locs.pickle"
    with open(locs_file, "wb") as f:
        pickle.dump(dict(posting_locs), f)
    blob = writer.bucket.blob(f"{folder_name}/{locs_file}")
    blob.upload_from_filename(locs_file)

    return bucket_id

print("✅ Custom MultiFileWriter defined")

## Index Building Functions

In [None]:
def partition_postings_and_write(postings):
    """Group postings by bucket id and write each bucket's files to GCS.

    Args:
        postings: RDD of ``(word, posting_list)`` pairs.

    Returns:
        An RDD with the value returned by `write_a_posting_list` for each
        bucket. Nothing is written until an action is run on it.
    """
    def key_by_bucket(word_and_pl):
        return token2bucket_id(word_and_pl[0]), word_and_pl

    def write_bucket(bucket_and_pairs):
        bucket_id, pairs = bucket_and_pairs
        materialized = (bucket_id, list(pairs))
        # Files go to the custom anchor folder, not the body index folder.
        return write_a_posting_list(materialized, bucket_name, POSTINGS_FOLDER)

    grouped = postings.map(key_by_bucket).groupByKey()
    return grouped.map(write_bucket)


def reduce_word_counts(unsorted_pl):
    """Return the posting entries as a new list ordered by their first item.

    The sort is stable, so entries with the same first item keep their
    original relative order.
    """
    ordered = list(unsorted_pl)
    ordered.sort(key=itemgetter(0))
    return ordered


def calculate_df(postings):
    """Map each ``(token, posting_list)`` pair to ``(token, len(posting_list))``.

    Args:
        postings: RDD of ``(token, posting_list)`` pairs.

    Returns:
        RDD of ``(token, df)`` pairs, where df is the posting list's length.
    """
    def to_df(entry):
        term, posting_list = entry[0], entry[1]
        return term, len(posting_list)

    return postings.map(to_df)


def tokenize(text):
    """Lower-case `text`, extract word tokens, drop stopwords, and stem.

    Returns an empty list when `text` is None. Stopwords are filtered on the
    unstemmed token; only the surviving tokens are stemmed.
    """
    if text is None:
        return []
    stems = []
    for match in RE_WORD.finditer(text.lower()):
        word = match.group()
        if word in all_stopwords:
            continue
        stems.append(STEMMER.stem(word))
    return stems


def stemmed_word_count(text, doc_id):
    """Return ``(token, (doc_id, count))`` for every distinct token in `text`.

    Tokens come from `tokenize`; `count` is how often the token occurs in
    this one text.
    """
    frequencies = Counter(tokenize(text))
    pairs = []
    for term, occurrences in frequencies.items():
        pairs.append((term, (doc_id, occurrences)))
    return pairs

print("✅ Functions defined")

## Process Anchor Text

In [None]:
# Flatten the per-row anchor lists into one (text, id) pair per anchor.
# Rows whose anchor list is empty or missing contribute nothing.
# NOTE(review): presumably `id` here is the id of the linked-to document;
# confirm against the parquet schema.
def _anchor_pairs(row):
    anchors = row[0]
    if not anchors:
        return []
    return [(anchor.text, anchor.id) for anchor in anchors]


new_pairs = doc_anchor_pairs.flatMap(_anchor_pairs)

print("Sample anchor texts:")
for sample in new_pairs.take(5):
    print(f"  Text: '{sample[0]}' -> Doc ID: {sample[1]}")

In [None]:
%%time
print("Building anchor text index...")

# One (token, (doc_id, count)) record per distinct token of each anchor text.
word_counts_anchor = new_pairs.flatMap(
    lambda text_and_id: stemmed_word_count(text_and_id[0], text_and_id[1])
)

# Gather all records of a token, then order them by doc id.
grouped_by_token = word_counts_anchor.groupByKey()
posting_anchor = grouped_by_token.mapValues(reduce_word_counts)

# Collapse a token's posting entries to one (doc_id, n) pair per doc id, where
# n is the number of entries carrying that doc id. The count stored inside
# each entry is ignored.
def new_count(posting):
    """Return ``[(doc_id, number_of_entries)]`` in first-seen doc id order."""
    per_doc = Counter()
    for entry in posting:
        per_doc[entry[0]] += 1
    return list(per_doc.items())

# Replace every token's posting list with its per-document entry counts.
postings = posting_anchor.mapValues(new_count)

# Cache to avoid recomputation
postings.cache()
print(f"✅ Postings cached")
sample_posting = postings.take(1)
print(f"Sample: {sample_posting}")

In [None]:
%%time
# Calculate df and collect
print("Calculating document frequencies...")
# df of a token = length of its posting list (see calculate_df).
w2df = calculate_df(postings)
# Bring the (token, df) pairs to the driver as a plain dict.
w2df_dict = w2df.collectAsMap()
print(f"✅ Vocabulary size: {len(w2df_dict):,}")

In [None]:
%%time
# Write posting lists to GCS
print(f"Writing posting lists to {POSTINGS_FOLDER}/...")
# collect() is the action that triggers the writes; the returned bucket ids
# are not needed afterwards.
_ = partition_postings_and_write(postings).collect()
print("✅ Done writing posting lists!")

## Collect Posting Locations & Save Index

In [None]:
# Collect posting locations from the ANCHOR folder (not postings_gcp!)
super_posting_locs = defaultdict(list)

# The trailing "/" restricts the listing to this exact folder; a bare prefix
# would also match sibling folders whose names merely start with the same text.
for blob in client.list_blobs(bucket_name, prefix=f"{POSTINGS_FOLDER}/"):
    if not blob.name.endswith("pickle"):
        continue
    with blob.open("rb") as f:
        # NOTE: unpickling can execute arbitrary code; only safe because
        # these files are expected to be the ones this notebook wrote.
        posting_locs = pickle.load(f)
    # Merge this bucket's word -> [(file_name, offset), ...] map into the total.
    for k, v in posting_locs.items():
        super_posting_locs[k].extend(v)

print(f"✅ Collected {len(super_posting_locs):,} posting locations")

In [None]:
# Create InvertedIndex class for saving
class InvertedIndex:
    """Minimal picklable container holding the index dictionaries.

    Both attributes start as empty dicts and are assigned by the caller.
    """

    def __init__(self):
        self.df = {}
        self.posting_locs = {}

    def write_index(self, base_dir, name):
        """Pickle this object to ``<base_dir>/<name>.pkl``."""
        target = Path(base_dir).joinpath(f'{name}.pkl')
        with target.open('wb') as out:
            pickle.dump(self, out)

# Create and save index
# NOTE(review): the pickle references the InvertedIndex class by module and
# name, so whatever loads anchor_index.pkl presumably needs a matching class
# importable under the same path - confirm against the reader.
inverted = InvertedIndex()
inverted.posting_locs = dict(super_posting_locs)
inverted.df = w2df_dict

# Save locally
inverted.write_index('.', 'anchor_index')

# Upload to GCS
index_src = "anchor_index.pkl"
index_dst = f'gs://{bucket_name}/anchor_index/{index_src}'
!gsutil cp $index_src $index_dst

print(f"\n✅ Index saved to {index_dst}")

## Verify Files

In [None]:
# List the uploaded index pickle with human-readable sizes.
print("=" * 50)
print("Files in anchor_index/:")
print("=" * 50)
!gsutil ls -lh gs://$bucket_name/anchor_index/

In [None]:
print(f"\nPosting list files in {POSTINGS_FOLDER}/:")
!gsutil ls gs://$bucket_name/anchor_postings_gcp/ | head -20
print("...")
!gsutil ls gs://$bucket_name/anchor_postings_gcp/ | wc -l
print("total files")

In [None]:
# Verify body index is UNTOUCHED
print("\n" + "=" * 50)
print("Verifying body index is still intact:")
print("=" * 50)
!gsutil ls gs://$bucket_name/postings_gcp/*.pkl

## Summary

### Files Created:

| File | Location | Description |
|------|----------|-------------|
| anchor_index.pkl | anchor_index/ | Inverted index (posting locs + df) |
| *.bin | anchor_postings_gcp/ | Binary posting list files |
| *_posting_locs.pickle | anchor_postings_gcp/ | Posting location files |

### Your other indexes are SAFE:
- `postings_gcp/` - Body index ✅
- `title_postingsPhrases_gcp/` - Title index ✅

In [None]:
# Final summary: vocabulary size and the two GCS locations written above.
print("\n" + "="*50)
print("✅ Anchor Text Index - COMPLETE!")
print("="*50)
print(f"\nVocabulary size: {len(w2df_dict):,}")
print(f"\nIndex location: gs://{bucket_name}/anchor_index/")
print(f"Posting lists: gs://{bucket_name}/{POSTINGS_FOLDER}/")