# Body Text Inverted Index - NO STEMMING
**For `search_body()` endpoint**

**Important: DO NOT CLEAR THE OUTPUT OF THIS NOTEBOOK AFTER EXECUTION!!!**

Requirements:
- ❌ NO Stemming
- ❌ NO Phrases
- ✅ Body text
- ✅ TF-IDF ready

In [None]:
# Install the GCS client (pinned to 1.43.0) and graphframes; both are imported
# by later cells of this notebook.
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

In [None]:
# Check cluster status: list the Dataproc clusters in region us-central1.
!gcloud dataproc clusters list --region us-central1

## Imports & Setup

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import math
import numpy as np
from google.cloud import storage
from contextlib import closing

import hashlib
def _hash(s):
    """Return a short, stable hex digest of the string `s`.

    The string is UTF-8 encoded and hashed with BLAKE2b using a 5-byte
    digest, so the result is always 10 hexadecimal characters.
    """
    hasher = hashlib.blake2b(digest_size=5)
    hasher.update(bytes(s, encoding='utf8'))
    return hasher.hexdigest()

# Fetch the NLTK stopwords corpus; stopwords.words('english') is read from it
# when the stopword set is built later in the notebook.
nltk.download('stopwords')

In [None]:
# Check graphframes jar: list any graph* jars in Spark's jar directory.
!ls -l /usr/lib/spark/jars/graph*

In [None]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
# ==================================
# IMPORTANT: Change bucket_name to your bucket!
# ==================================
bucket_name = 'db204905756'  # <-- change this to your bucket's name

full_path = f"gs://{bucket_name}/"

# Collect the gs:// URI of every parquet object in the bucket.
# Objects are matched on the ".parquet" suffix rather than on the substring
# "parquet" anywhere in the name, so non-data objects (for example a
# "_SUCCESS" marker stored under a directory whose name contains "parquet")
# are not handed to spark.read.parquet.
client = storage.Client()
blobs = client.list_blobs(bucket_name)
paths = [full_path + blob.name for blob in blobs if blob.name.endswith(".parquet")]

print(f"Found {len(paths)} parquet files")

## Stopwords Setup (NO STEMMING!)

In [None]:
# ⚠️ NO STEMMER - This is intentional for search_body() requirement!

# Stopwords come from two sources: NLTK's English list and a hand-picked list
# of corpus-specific words. Both are merged into one frozenset for fast
# membership tests during tokenization.
english_stopwords = frozenset(stopwords.words('english'))
corpus_stopwords = ["category", "references", "also", "external", "links", 
                    "may", "first", "see", "history", "people", "one", "two", 
                    "part", "thumb", "including", "second", "following", 
                    "many", "however", "would", "became", "make", "made",
                    "new", "list", "district", "com", "began"]

all_stopwords = english_stopwords.union(corpus_stopwords)
# Token pattern: one leading character (#, @ or a word character) followed by
# 2 to 24 word characters, each optionally preceded by ', " or -. A match
# therefore contains between 3 and 25 word-like characters.
RE_WORD = re.compile(r"""[\#\@\w](['"-]?\w){2,24}""", re.UNICODE)

print(f"Total stopwords: {len(all_stopwords)}")
print("⚠️ NO STEMMING - as required for search_body()")

## Load Data - Body Text

In [None]:
# Read every parquet file discovered in the bucket into one DataFrame.
parquetFile = spark.read.parquet(*paths)

# Index the article BODY ("text"), not the title. Each row of the resulting
# RDD carries the text first and the document id second.
doc_text_pairs = parquetFile.select("text", "id").rdd

total_docs = parquetFile.count()
print(f"Total documents: {total_docs:,}")

In [None]:
# Peek at the first document to confirm the (text, id) layout of each row.
sample = doc_text_pairs.take(2)
first_text, first_id = sample[0][0], sample[0][1]
print(f"Sample doc ID: {first_id}")
print(f"Sample text (first 500 chars): {first_text[:500]}...")

## Tokenization Function (NO STEMMING!)

In [None]:
def tokenize_nostem(text, stopwords_set):
    """
    Split `text` into lowercase tokens, dropping stopwords and applying
    NO stemming.

    Tokens are the matches of the module-level RE_WORD pattern, lowercased.
    A token is kept only if it is not in `stopwords_set`. Order and
    duplicates are preserved.

    Args:
        text: The document text, or None.
        stopwords_set: Container of lowercase words to discard.

    Returns:
        A list of token strings; empty when `text` is None.
    """
    # A missing body yields no tokens rather than an error.
    if text is None:
        return []

    kept = []
    for match in RE_WORD.finditer(text):
        word = match.group().lower()
        # Filter only; the surface form is kept as-is (no stemming).
        if word not in stopwords_set:
            kept.append(word)
    return kept


# Smoke-test the tokenizer on one sentence and show input next to output.
test_text = "The United States of America has many running cities and universities"
tokens = tokenize_nostem(text=test_text, stopwords_set=all_stopwords)
print(f"Input: {test_text}", f"Output (NO stemming): {tokens}", sep="\n")
print("\n✅ Notice: 'running' stays as 'running', 'cities' stays as 'cities' (no stemming)")

## Load InvertedIndex Module

In [None]:
# Load InvertedIndex module
# Work from /home/dataproc and list inverted_index_gcp.py there, so a missing
# file shows up here rather than at import time.
%cd -q /home/dataproc
!ls inverted_index_gcp.py

sc = spark.sparkContext
# Register the module file with Spark so it is shipped along with the job.
sc.addFile("/home/dataproc/inverted_index_gcp.py")
# Put the directory holding files added via addFile at the front of the
# import path, so the import below resolves on the driver.
sys.path.insert(0, SparkFiles.getRootDirectory())

from inverted_index_gcp import InvertedIndex

In [None]:
# Share the stopword set with the workers as a Spark broadcast variable;
# word_count_nostem reads it back through `.value`.
all_stopwords_broadcast = sc.broadcast(all_stopwords)

print("✅ Broadcasted stopwords to all workers")

## Build Inverted Index Functions

In [None]:
# Number of buckets tokens are spread over; token2bucket_id reduces a token's
# hash modulo this value.
NUM_BUCKETS = 124
# Base directory handed to the posting-list writer and used as the prefix of
# the w2df pickle's filename.
INDEX_DIR = "body_nostem"  # ✅ NEW DIRECTORY - won't overwrite existing files!

def token2bucket_id(token):
    """Return the bucket number, in range(NUM_BUCKETS), assigned to `token`."""
    # _hash returns a hex string; read it as an integer before the modulus.
    hash_value = int(_hash(token), 16)
    return hash_value % NUM_BUCKETS


def word_count_nostem(text, doc_id):
    """
    Compute per-document term frequencies (NO STEMMING!).

    Args:
        text: Body text of one document.
        doc_id: Identifier of that document.

    Returns:
        A list of (token, (doc_id, tf)) pairs, one per distinct token that
        survives tokenization and stopword removal.
    """
    # The stopword set is read from the broadcast variable.
    term_freqs = Counter(tokenize_nostem(text, all_stopwords_broadcast.value))
    pairs = []
    for term, freq in term_freqs.items():
        pairs.append((term, (doc_id, freq)))
    return pairs


def reduce_word_counts(unsorted_pl):
    """Return the posting list as a new list ordered by its first field (doc_id).

    `unsorted_pl` is any iterable of (doc_id, tf) entries. Entries that share
    a doc_id keep their relative order because the sort is stable.
    """
    ordered = list(unsorted_pl)
    ordered.sort(key=itemgetter(0))
    return ordered


def calculate_df(postings):
    """Map each (token, posting_list) entry to (token, document frequency).

    The document frequency is the length of the token's posting list. The
    result is whatever `postings.map` returns for that mapping.
    """
    def _token_with_df(entry):
        return (entry[0], len(entry[1]))

    return postings.map(_token_with_df)


def partition_postings_and_write(postings, base_dir):
    """Group posting lists by bucket and write each bucket out.

    Every (token, posting_list) entry is keyed by token2bucket_id(token).
    Entries sharing a bucket are grouped, and each group is passed to
    InvertedIndex.write_a_posting_list together with `base_dir`.

    Args:
        postings: RDD of (token, posting_list) entries.
        base_dir: Directory argument forwarded to the writer.

    Returns:
        An RDD holding the writer's return value for each bucket. The call
        only builds the RDD; the caller has to run an action on it.
    """
    def _key_by_bucket(token_and_pl):
        return (token2bucket_id(token_and_pl[0]), token_and_pl)

    def _write_bucket(bucket_and_entries):
        bucket_id, grouped_entries = bucket_and_entries
        # Materialize the grouped iterable into a list before writing.
        payload = (bucket_id, list(grouped_entries))
        return InvertedIndex.write_a_posting_list(payload, base_dir)

    grouped_by_bucket = postings.map(_key_by_bucket).groupByKey()
    return grouped_by_bucket.map(_write_bucket)

# Announce the expected output location.
# NOTE(review): this assumes InvertedIndex.write_a_posting_list resolves
# base_dir inside the bucket; only INDEX_DIR is passed to it - confirm against
# inverted_index_gcp.py.
print(f"Index will be saved to: gs://{bucket_name}/{INDEX_DIR}/")

## Build the Index

In [None]:
%%time
print("Building inverted index for body text WITHOUT stemming...")
print("This will take a while for the full corpus...")

# Step 1: Word counts (NO stemming!)
# Each document (text at index 0, id at index 1) is expanded into
# (token, (doc_id, tf)) pairs.
word_counts = doc_text_pairs.flatMap(lambda x: word_count_nostem(x[0], x[1]))

# Step 2: Create posting lists
# All pairs of a token are gathered and sorted by doc_id.
postings = word_counts.groupByKey().mapValues(reduce_word_counts)

# Step 3: Filter rare terms (helps reduce index size)
# Tokens appearing in fewer than MIN_DF documents are dropped from the index.
MIN_DF = 50  # Minimum document frequency - same as stemmed version
postings_filtered = postings.filter(lambda x: len(x[1]) >= MIN_DF)

# Step 4: Calculate df
# collectAsMap brings the token -> df mapping to the driver as a dict.
# NOTE(review): postings_filtered is consumed again by the write step and is
# not persisted here, so Spark may recompute part of its lineage - consider
# persisting if the second pass is slow.
w2df = calculate_df(postings_filtered)
w2df_dict = w2df.collectAsMap()

print(f"Total unique tokens (with df >= {MIN_DF}): {len(w2df_dict):,}")

In [None]:
%%time
# Write posting lists to GCS
# partition_postings_and_write only builds an RDD; collect() is the action
# that runs the per-bucket writes. The collected values are not used.
# NOTE(review): only INDEX_DIR is passed as the base directory - confirm that
# InvertedIndex.write_a_posting_list targets the bucket rather than local disk.
print(f"Writing posting lists to {INDEX_DIR}...")
_ = partition_postings_and_write(postings_filtered, INDEX_DIR).collect()
print("✅ Done writing posting lists!")

## Save Document Frequency (DF) Dictionary

In [None]:
# Save w2df dictionary
# The token -> document-frequency dict is pickled to a local file named after
# INDEX_DIR.
w2df_filename = f"{INDEX_DIR}_w2df.pkl"

with open(w2df_filename, 'wb') as f:
    pickle.dump(w2df_dict, f)

print(f"✅ Saved w2df dictionary locally")

# Upload to GCS
# The pickle is copied under final_project/ in the bucket, not under
# INDEX_DIR. The success message below is printed unconditionally; the result
# of the gsutil command is not checked.
!gsutil cp {w2df_filename} gs://{bucket_name}/final_project/{w2df_filename}
print(f"✅ Uploaded w2df dictionary to GCS")

## Verify Index Creation

In [None]:
# Check created files in GCS
# Show the first ten objects under the index directory.
!gsutil ls gs://{bucket_name}/{INDEX_DIR}/ | head -10

# Count total files
# Only objects matching *.pickle are counted; the first captured output line
# (from wc -l) is parsed as the count.
# NOTE(review): if the writer stores the postings themselves under another
# extension, this number covers only the .pickle objects - confirm that the
# "posting list files" wording in the message matches what is counted.
result = !gsutil ls gs://{bucket_name}/{INDEX_DIR}/*.pickle | wc -l
num_files = int(result[0])
print(f"\n✅ Successfully created {num_files} posting list files!")

## Summary Statistics

In [None]:
# Recap of the run: corpus size, vocabulary size and output settings.
print("=== Index Creation Summary ===")
print(
    f"Total documents processed: {parquetFile.count():,}",
    f"Total unique tokens (df >= {MIN_DF}): {len(w2df_dict):,}",
    f"Index directory: {INDEX_DIR}",
    f"Number of bucket files: {NUM_BUCKETS}",
    sep="\n",
)
print("\n⚠️ STEMMING: NO (as required for search_body)")

# Look up a few unstemmed surface forms. Finding inflected words such as
# 'running' or 'cities' in the vocabulary shows the tokens were NOT stemmed.
print("\nSample tokens (verify NO stemming):")
sample_tokens = [
    'united', 'states', 'running', 'cities', 'universities',
    'played', 'better', 'america', 'texas', 'country',
]
for token in sample_tokens:
    if token not in w2df_dict:
        print(f"  ❌ '{token}': not found (might be filtered by MIN_DF)")
        continue
    print(f"  ✅ '{token}': {w2df_dict[token]:,} documents")

## ✅ Index Creation Complete!

The inverted index has been successfully created with:
- ❌ **NO Stemming** (as required for `search_body()`)
- ❌ **NO Phrases**
- ✅ Body text indexed
- ✅ TF stored in posting lists (ready for TF-IDF)

**Files created:**
- `gs://{bucket_name}/body_nostem/` - Posting lists
- `gs://{bucket_name}/final_project/body_nostem_w2df.pkl` - Document frequencies

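**Note:** Document lengths (token counts per document) were already computed for the stemmed version and can be reused here: stemming rewrites tokens but does not add or remove any, so the counts match as long as both versions use the same tokenizer and stopword list.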

**Important:** Do not clear this notebook's output!