# Phrases Inverted Index - Body Text (with Stemming)

**מטרה:** בניית Inverted Index על גוף הטקסט (body) עם תמיכה בביטויים (phrases) באמצעות PMI + Porter Stemming

***Important*** DO NOT CLEAR THE OUTPUT OF THIS NOTEBOOK AFTER EXECUTION!!!

In [None]:
# Install the GCS client (pinned to 1.43.0) and graphframes; -q keeps pip output quiet.
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

In [None]:
# Check cluster status: list the Dataproc clusters in the us-central1 region
!gcloud dataproc clusters list --region us-central1

# Imports & Setup

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import math
import numpy as np
from google.cloud import storage
from contextlib import closing

import hashlib
def _hash(s):
    """Return a short, deterministic hex digest of the string *s*.

    The text is UTF-8 encoded and hashed with BLAKE2b using a 5-byte digest,
    so the result is always 10 hexadecimal characters.
    """
    hasher = hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5)
    return hasher.hexdigest()

# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads below
nltk.download('stopwords')

In [None]:
# Check graphframes jar: list any graph* files in Spark's jars directory
!ls -l /usr/lib/spark/jars/graph*

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
# Evaluate `spark` so the notebook displays it. The name is not created in this
# notebook; presumably the Dataproc kernel provides the session (verify).
spark

In [None]:
# ==================================
# IMPORTANT: Change bucket_name to your bucket!
# ==================================
bucket_name = 'db204905756'  # <-- change this to the name of your own bucket

# Every object path below is rooted at this gs:// prefix.
full_path = f"gs://{bucket_name}/"

client = storage.Client()
blobs = client.list_blobs(bucket_name)

# Keep every object whose name mentions "parquet", as a full gs:// URI.
paths = [full_path + blob.name for blob in blobs if "parquet" in blob.name]

print(f"Found {len(paths)} parquet files")

# Stopwords, Regex & Stemmer Setup

In [None]:
# One Porter stemmer instance shared by all tokenization helpers
STEMMER = PorterStemmer()

# NLTK's English stopwords, extended with the extra words listed below
english_stopwords = frozenset(stopwords.words('english'))
corpus_stopwords = [
    "category", "references", "also", "external", "links",
    "may", "first", "see", "history", "people", "one", "two",
    "part", "thumb", "including", "second", "following",
    "many", "however", "would", "became", "make", "made",
    "new", "list", "district", "com", "began",
]
all_stopwords = english_stopwords | frozenset(corpus_stopwords)

# A token starts with '#', '@' or a word character and is followed by 2 to 24
# further word characters, each optionally preceded by an apostrophe or hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

print(f"Total stopwords: {len(all_stopwords)}")
print(f"Stemmer initialized: {type(STEMMER).__name__}")

# Load Data - Body Text

In [None]:
# Load parquet files: every path collected in `paths` is read into one DataFrame
parquetFile = spark.read.parquet(*paths)

# Select BODY TEXT (not title!) together with the document id.
# Each element of the resulting RDD is a row whose fields are (text, id).
doc_text_pairs = parquetFile.select("text", "id").rdd

# count() is a Spark action, so printing the total runs a job over the data
print(f"Total documents: {parquetFile.count():,}")

In [None]:
# Peek at the data: fetch two rows and show the first one (rows are (text, id))
sample = doc_text_pairs.take(2)
first_row = sample[0]
print(f"Sample doc ID: {first_row[1]}")
print(f"Sample text (first 500 chars): {first_row[0][:500]}...")

# PMI-Based Phrase Detection (with Stemming)

## מה זה PMI?
PMI (Pointwise Mutual Information) מודד את הסיכוי ששתי מילים מופיעות יחד בהשוואה לסיכוי שהן מופיעות בנפרד:

$$PMI(word_1, word_2) = \log_2 \frac{P(word_1, word_2)}{P(word_1) \cdot P(word_2)}$$

In [None]:
def extract_unigrams_stemmed(text):
    """
    Tokenize *text* into lowercase, stopword-filtered, Porter-stemmed unigrams.

    Words are matched with RE_WORD; stopwords are checked against the
    lowercased word before it is stemmed. Returns an empty list when *text*
    is None.
    """
    if text is None:
        return []
    stems = []
    for match in RE_WORD.finditer(text):
        word = match.group().lower()
        if word in all_stopwords:
            continue
        stems.append(STEMMER.stem(word))
    return stems


def extract_bigrams_stemmed(text):
    """
    Return every pair of neighbouring stemmed tokens in *text* as (first, second) tuples.

    Pairs are formed after stopword removal, so two words separated only by
    stopwords count as neighbours. Fewer than two tokens yields an empty list.
    """
    stems = extract_unigrams_stemmed(text)
    return list(zip(stems, stems[1:]))


def calculate_pmi(unigram_counts, bigram_counts, total_unigrams, total_bigrams):
    """
    Calculate PMI for all bigrams

    PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )

    Args:
        unigram_counts: mapping word -> number of occurrences.
        bigram_counts: mapping (w1, w2) -> number of occurrences.
        total_unigrams: denominator used for the unigram probabilities.
        total_bigrams: denominator used for the bigram probabilities.

    Returns:
        dict mapping (w1, w2) -> PMI score. A bigram is left out when its own
        count is not positive or when either word has no positive unigram
        count. An empty dict is returned when either total is not positive.
    """
    # Without positive totals no probability can be formed (and dividing by
    # zero would raise), so there is nothing to score.
    if total_unigrams <= 0 or total_bigrams <= 0:
        return {}

    pmi_scores = {}

    for (w1, w2), bigram_count in bigram_counts.items():
        # log2(0) is undefined; a bigram that never occurred has no PMI.
        if bigram_count <= 0:
            continue

        p_w1 = unigram_counts.get(w1, 0) / total_unigrams
        p_w2 = unigram_counts.get(w2, 0) / total_unigrams

        if p_w1 > 0 and p_w2 > 0:
            p_bigram = bigram_count / total_bigrams
            pmi_scores[(w1, w2)] = math.log2(p_bigram / (p_w1 * p_w2))

    return pmi_scores


# Quick sanity check: show what the stemmer does to a few sample words
test_words = ["running", "cities", "better", "university", "played"]
print("Stemming examples:")
for sample_word in test_words:
    stem = STEMMER.stem(sample_word)
    print(f"  {sample_word} -> {stem}")

## Step 1: Count Unigrams (Stemmed)

In [None]:
%%time
print("Counting stemmed unigrams (this may take a while for body text)...")

unigram_rdd = doc_text_pairs.flatMap(lambda x: extract_unigrams_stemmed(x[0]))
unigram_counts_rdd = unigram_rdd.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Collect to memory
unigram_counts = dict(unigram_counts_rdd.collect())
total_unigrams = sum(unigram_counts.values())

print(f"Unique stemmed unigrams: {len(unigram_counts):,}")
print(f"Total unigram occurrences: {total_unigrams:,}")

## Step 2: Count Bigrams (Stemmed)

In [None]:
%%time
# For body text, we need higher minimum frequency to filter noise
MIN_BIGRAM_FREQ = 100  # Higher threshold for body text

print(f"Counting stemmed bigrams (min frequency: {MIN_BIGRAM_FREQ})...")

bigram_rdd = doc_text_pairs.flatMap(lambda x: extract_bigrams_stemmed(x[0]))
bigram_counts_rdd = bigram_rdd.map(lambda bg: (bg, 1)).reduceByKey(lambda a, b: a + b)

# Filter by minimum frequency
bigram_counts_filtered = bigram_counts_rdd.filter(lambda x: x[1] >= MIN_BIGRAM_FREQ)

# Collect to memory
bigram_counts = dict(bigram_counts_filtered.collect())
total_bigrams = sum(bigram_counts.values())

print(f"Bigrams with freq >= {MIN_BIGRAM_FREQ}: {len(bigram_counts):,}")
print(f"Total bigram occurrences: {total_bigrams:,}")

## Step 3: Calculate PMI and Filter Strong Phrases

In [None]:
%%time
# PMI thresholds for body text
PMI_THRESHOLD = 6.0  # Higher threshold for body text
MIN_PHRASE_FREQ = 50  # Minimum frequency for a phrase

print(f"Calculating PMI (threshold: {PMI_THRESHOLD}, min_freq: {MIN_PHRASE_FREQ})...")

# Calculate PMI
pmi_scores = calculate_pmi(unigram_counts, bigram_counts, total_unigrams, total_bigrams)

# Filter strong phrases
strong_phrases = set()
phrase_stats = []

for (w1, w2), pmi in pmi_scores.items():
    freq = bigram_counts.get((w1, w2), 0)
    if pmi >= PMI_THRESHOLD and freq >= MIN_PHRASE_FREQ:
        strong_phrases.add((w1, w2))
        phrase_stats.append(((w1, w2), pmi, freq))

print(f"Strong phrases found: {len(strong_phrases):,}")

In [None]:
# Show top phrases by PMI
phrase_stats_sorted = sorted(phrase_stats, key=lambda x: x[1], reverse=True)

print("\nTop 30 stemmed phrases by PMI:")
print("-" * 60)
for (w1, w2), pmi, freq in phrase_stats_sorted[:30]:
    # Pad the whole "w1_w2" label; padding only w2 leaves the numeric columns
    # shifted by the length of w1 on every row.
    phrase_label = f"{w1}_{w2}"
    print(f"{phrase_label:30s} PMI: {pmi:.2f}  Freq: {freq:,}")

In [None]:
# Show top phrases by frequency
phrase_stats_by_freq = sorted(phrase_stats, key=lambda x: x[2], reverse=True)

print("\nTop 30 stemmed phrases by Frequency:")
print("-" * 60)
for (w1, w2), pmi, freq in phrase_stats_by_freq[:30]:
    # Pad the whole "w1_w2" label; padding only w2 leaves the numeric columns
    # shifted by the length of w1 on every row.
    phrase_label = f"{w1}_{w2}"
    print(f"{phrase_label:30s} Freq: {freq:,}  PMI: {pmi:.2f}")

In [None]:
# Save strong phrases: pickle the set of (w1, w2) tuples to a local file
phrases_filename = 'strong_phrases_body_stemmed.pkl'
with open(phrases_filename, 'wb') as f:
    pickle.dump(strong_phrases, f)

# Upload to GCS, next to the other index artifacts.
# In the shell line, $name is replaced with the value of the Python variable.
phrases_dst = f'gs://{bucket_name}/body_stemmed_phrases_idx/{phrases_filename}'
!gsutil cp $phrases_filename $phrases_dst

print(f"Saved {len(strong_phrases):,} phrases to {phrases_dst}")

# Build Inverted Index with Phrases and Stemming

In [None]:
def tokenize_with_phrases_stemmed(text, phrases_set, stopwords_set):
    """
    Turn *text* into stemmed tokens, fusing known phrases into single tokens.

    Words are matched with RE_WORD, lowercased, dropped if they appear in
    *stopwords_set*, and stemmed. The stems are then scanned left to right:
    whenever two neighbouring stems form a pair found in *phrases_set*, both
    are replaced by one "first_second" token and the scan resumes after them.

    Example (with ("unit", "state") in phrases_set and "the" a stopword):
    "The United States" -> ["unit_state"]

    Returns an empty list when *text* is None.
    """
    if text is None:
        return []

    # Lowercase, filter stopwords, then stem
    tokens = []
    for match in RE_WORD.finditer(text):
        word = match.group().lower()
        if word not in stopwords_set:
            tokens.append(STEMMER.stem(word))

    # Nothing to pair up with zero or one token
    if len(tokens) <= 1:
        return tokens

    # Greedy left-to-right merge of recognised pairs
    merged = []
    last = len(tokens) - 1
    pos = 0
    while pos <= last:
        if pos < last and (tokens[pos], tokens[pos + 1]) in phrases_set:
            merged.append(tokens[pos] + "_" + tokens[pos + 1])
            pos += 2
        else:
            merged.append(tokens[pos])
            pos += 1

    return merged


# Test: run the tokenizer on one sentence with the detected phrases and the
# full stopword set, and print the input next to the resulting tokens
test_text = "The United States and New York City are located in North America"
test_result = tokenize_with_phrases_stemmed(test_text, strong_phrases, all_stopwords)
print(f"Input: {test_text}")
print(f"Output (stemmed + phrases): {test_result}")

In [None]:
# Load InvertedIndex module: switch to the directory that should hold it and
# list the file so a missing module is visible immediately
%cd -q /home/dataproc
!ls inverted_index_gcp.py

sc = spark.sparkContext
# Register the module file with the SparkContext so it is shipped with the job
sc.addFile("/home/dataproc/inverted_index_gcp.py")
# Put the SparkFiles root directory first on the import path of this process
sys.path.insert(0, SparkFiles.getRootDirectory())

from inverted_index_gcp import InvertedIndex

In [None]:
# Broadcast phrases and stopwords to all workers; the functions below read
# them back through the .value attribute of these broadcast handles
strong_phrases_broadcast = sc.broadcast(strong_phrases)
all_stopwords_broadcast = sc.broadcast(all_stopwords)

print(f"Broadcasted {len(strong_phrases):,} phrases to all workers")

In [None]:
NUM_BUCKETS = 124  # number of buckets the tokens are spread over
INDEX_DIR = "body_stemmed_phrases"  # folder that receives the posting list files

def token2bucket_id(token):
    """Return the bucket number (0 .. NUM_BUCKETS - 1) assigned to *token*.

    The token's hex digest from _hash is read as an integer and reduced
    modulo NUM_BUCKETS.
    """
    hex_digest = _hash(token)
    return int(hex_digest, 16) % NUM_BUCKETS


def word_count_with_phrases_stemmed(text, doc_id):
    """
    Build the per-document postings for *text*.

    The text is tokenized with the broadcast phrase and stopword sets, and
    each distinct token is returned as (token, (doc_id, tf)), where tf is the
    number of times the token occurs in this document.
    """
    term_frequencies = Counter(
        tokenize_with_phrases_stemmed(
            text,
            strong_phrases_broadcast.value,
            all_stopwords_broadcast.value,
        )
    )
    postings = []
    for term, tf in term_frequencies.items():
        postings.append((term, (doc_id, tf)))
    return postings


def reduce_word_counts(unsorted_pl):
    """Return the postings in *unsorted_pl* as a new list ordered by their first element (doc_id)."""
    ordered = list(unsorted_pl)
    ordered.sort(key=lambda posting: posting[0])
    return ordered


def calculate_df(postings):
    """Map each (token, posting_list) entry to (token, length of its posting list)."""
    def term_and_df(entry):
        return (entry[0], len(entry[1]))

    return postings.map(term_and_df)


def partition_postings_and_write(postings, base_dir):
    """
    Group posting lists by bucket and write each bucket through InvertedIndex.

    Every (token, posting_list) entry is keyed by token2bucket_id(token); the
    entries of one bucket are then handed together to
    InvertedIndex.write_a_posting_list along with *base_dir* and the
    module-level bucket_name. Returns the RDD of that call's results, one per
    bucket.
    """
    def flush_bucket(bucket):
        # bucket is (bucket_id, iterable of (token, posting_list) entries)
        return InvertedIndex.write_a_posting_list(
            (bucket[0], list(bucket[1])), base_dir, bucket_name
        )

    keyed_by_bucket = postings.map(lambda entry: (token2bucket_id(entry[0]), entry))
    return keyed_by_bucket.groupByKey().map(flush_bucket)

## Build the Index

In [None]:
%%time
print("Building inverted index with stemming and phrases for body text...")
print("This will take a while for the full corpus...")

# Step 1: Word counts (stemmed + phrases)
# Each row x is (text, id); the result is one (token, (doc_id, tf)) per token per doc.
word_counts = doc_text_pairs.flatMap(lambda x: word_count_with_phrases_stemmed(x[0], x[1]))

# Step 2: Create posting lists, one list per token, ordered by doc_id
postings = word_counts.groupByKey().mapValues(reduce_word_counts)

# Step 3: Filter rare terms (helps reduce index size)
MIN_DF = 5  # Minimum document frequency
# The length of a posting list is the number of documents containing the token.
postings_filtered = postings.filter(lambda x: len(x[1]) >= MIN_DF)

# Step 4: Calculate df and bring the token -> df mapping to the driver
w2df = calculate_df(postings_filtered)
w2df_dict = w2df.collectAsMap()

print(f"Total unique tokens (with df >= {MIN_DF}): {len(w2df_dict):,}")

In [None]:
%%time
# Write posting lists to GCS
print(f"Writing posting lists to {INDEX_DIR}...")
# collect() forces the write for every bucket to run; the returned values are not used.
_ = partition_postings_and_write(postings_filtered, INDEX_DIR).collect()
print("Done writing posting lists!")

In [None]:
# Collect all posting list locations
super_posting_locs = defaultdict(list)

# The trailing "/" limits the listing to objects inside the INDEX_DIR folder.
# The bare prefix "body_stemmed_phrases" also matches sibling folders whose
# names start the same way (e.g. "body_stemmed_phrases_idx/"); the document
# length pickle stored there is not a posting-location map and breaks the merge.
for blob in client.list_blobs(bucket_name, prefix=f"{INDEX_DIR}/"):
    if not blob.name.endswith("pickle"):
        continue
    with blob.open("rb") as f:
        posting_locs = pickle.load(f)
    # Merge this file's token -> locations entries into the combined map
    for k, v in posting_locs.items():
        super_posting_locs[k].extend(v)

print(f"Collected posting locations for {len(super_posting_locs):,} tokens")

In [None]:
# Create and save InvertedIndex: attach the merged posting locations and the
# token -> document frequency mapping to a fresh index object
inverted = InvertedIndex()
inverted.posting_locs = super_posting_locs
inverted.df = w2df_dict

# Save locally (the upload below expects the resulting file to be index.pkl)
inverted.write_index('.', 'index')

# Upload to GCS, next to the other index artifacts.
# In the shell line, $name is replaced with the value of the Python variable.
index_src = "index.pkl"
index_dst = f'gs://{bucket_name}/body_stemmed_phrases_idx/{index_src}'
!gsutil cp $index_src $index_dst

print(f"Index saved to {index_dst}")

In [None]:
# Verify index size: long listing (-l) with human-readable sizes (-h) of the uploaded index file
!gsutil ls -lh $index_dst

# Calculate Document Lengths

In [None]:
def calc_doc_length_stemmed(doc_id, text):
    """Return (doc_id, n), where n is the number of tokens the phrase-aware stemming tokenizer yields for *text*."""
    phrase_set = strong_phrases_broadcast.value
    stopword_set = all_stopwords_broadcast.value
    token_count = len(tokenize_with_phrases_stemmed(text, phrase_set, stopword_set))
    return (doc_id, token_count)

import builtins  # the bare name `max` is rebound by `from pyspark.sql.functions import *`

# Calculate lengths: each row x is (text, id), the helper takes (doc_id, text)
print("Calculating document lengths (stemmed)...")
doc_lengths_rdd = doc_text_pairs.map(lambda x: calc_doc_length_stemmed(x[1], x[0]))
doc_lengths_dict = doc_lengths_rdd.collectAsMap()

print(f"Calculated lengths for {len(doc_lengths_dict):,} documents")

# Stats
lengths = list(doc_lengths_dict.values())
print(f"Average doc length: {np.mean(lengths):.1f}")
print(f"Median doc length: {np.median(lengths):.1f}")
# Call Python's own max explicitly: after the star import earlier in this
# notebook, plain `max` is the Spark column aggregate and rejects a list.
print(f"Max doc length: {builtins.max(lengths):,}")

In [None]:
# Save document lengths
lengths_filename = 'body_doc_lengths_stemmed.pickle'

with open(lengths_filename, 'wb') as f:
    pickle.dump(doc_lengths_dict, f)

# Upload to GCS
lengths_dst = f'gs://{bucket_name}/body_stemmed_phrases_idx_Phrases/{lengths_filename}'
!gsutil cp $lengths_filename $lengths_dst

print(f"Document lengths saved to {lengths_dst}")

# Verify All Files

In [None]:
# List the index artifacts folder with human-readable sizes
print("Files in body_stemmed_phrases_idx/:")
!gsutil ls -lh gs://$bucket_name/body_stemmed_phrases_idx/

In [None]:
# Show the first 20 entries of the posting list folder, then the number of listed lines
print("\nPosting list files in body_stemmed_phrases/:")
!gsutil ls gs://$bucket_name/body_stemmed_phrases/ | head -20
print("...")
!gsutil ls gs://$bucket_name/body_stemmed_phrases/ | wc -l
print("total files")

# Summary

## Files Created:

| File | Location | Description |
|------|----------|-------------|
| `index.pkl` | `body_stemmed_phrases_idx/` | Inverted index (posting locs + df) |
| `body_doc_lengths_stemmed.pickle` | `body_stemmed_phrases_idx/` | Document lengths for BM25 |
| `strong_phrases_body_stemmed.pkl` | `body_stemmed_phrases_idx/` | Set of detected phrases (stemmed) |
| `*.bin` | `body_stemmed_phrases/` | Binary posting list files |

## Usage in search_frontend.py:

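```python
import pickle

from nltk.stem.porter import PorterStemmer

STEMMER = PorterStemmer()

# Load phrases
with open('strong_phrases_body_stemmed.pkl', 'rb') as f:
    strong_phrases = pickle.load(f)

# NOTE: tokenize_with_phrases_stemmed, RE_WORD and all_stopwords are defined in
# this notebook; copy them unchanged into search_frontend.py so that queries
# are tokenized exactly like the indexed documents.

# Process query with same tokenization (stemming + phrases)
def process_query(query):
    return tokenize_with_phrases_stemmed(query, strong_phrases, all_stopwords)
```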

## Important Notes:
- **Query must be processed the same way!** Use stemming + phrase detection
- Stemmed tokens like `unit_state` will match queries like "United States", "united states", etc.

In [None]:
# Final marker printed once the notebook has run to the end
print("✅ Body text index with STEMMING and phrases creation complete!")