<a href="https://colab.research.google.com/github/CandaceCooley/Assignment-3/blob/main/TF_query_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [2]:
import os
import re
import shutil
import sys

from pyspark import SparkContext, SparkConf

In [3]:
# Paths — UPDATE if needed
# tf_index_path:   location of the saved TF index, read with sc.textFile.
# query_file_path: text file holding the raw query.
# output_dir:      directory the ranked results are written to with saveAsTextFile.
tf_index_path = "TF_index"
query_file_path = "query.txt"
output_dir = "Task1Output"

In [4]:
# Configure a Spark application named "TF_Query" on the "local" master.
conf = SparkConf().setAppName("TF_Query").setMaster("local")

# getOrCreate returns the already-running context if there is one, so
# re-executing this cell no longer fails the way a second
# SparkContext(conf=conf) call does. Note: when a context is reused, `conf`
# is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Common English function words that are removed from the query text.
stop_words = {
    "the", "is", "in", "and", "to", "of", "a", "with",
    "on", "for", "as", "at", "by", "an", "this", "that",
    "from", "are", "be", "or", "was", "it",
}

# Wrapped in a Spark broadcast variable; preprocess_query reads it through .value.
broadcast_stopwords = sc.broadcast(stop_words)

In [6]:
def preprocess_query(text):
    """Turn raw query text into a list of lowercase search terms.

    The text is lowercased, URLs are removed, every character other than
    a-z or whitespace is replaced by a space, and the result is split on
    whitespace. Stop words (from ``broadcast_stopwords``) and tokens of a
    single character are discarded.

    Args:
        text: The raw query string.

    Returns:
        The surviving tokens, in their original order.
    """
    without_urls = re.sub(r"(https?://\S+|www\.\S+)", "", text.lower())
    letters_only = re.sub(r"[^a-z\s]", " ", without_urls)

    terms = []
    for token in letters_only.split():
        if token in broadcast_stopwords.value:
            continue
        if len(token) <= 1:
            continue
        terms.append(token)
    return terms

In [7]:
# Load the saved index from the TF_index directory as an RDD of text lines;
# each line is decoded into (word, postings) by parse_line below.
index_rdd = sc.textFile(tf_index_path)

# Convert each index line back into a (word, postings) pair.
# Example: "pirate@123#1.3+456#2.0" → ("pirate", [("123", 1.3), ("456", 2.0)])
def parse_line(line):
    """Parse one line of the saved TF index.

    The expected format is ``word@doc#weight+doc#weight+...``.

    Args:
        line: One line of text from the index.

    Returns:
        ``(word, postings)`` where ``postings`` is a list of
        ``(doc_id, weight)`` tuples; ``doc_id`` is kept as a string and
        ``weight`` is converted to ``float``. A malformed line yields
        ``("", [])`` so the caller can drop it by filtering on the empty word.
    """
    try:
        word, postings_str = line.split('@')
        postings = []
        for entry in postings_str.split('+'):
            fields = entry.split('#')  # split once instead of once per field
            postings.append((fields[0], float(fields[1])))
        return (word, postings)
    except (ValueError, IndexError):
        # ValueError: the line does not contain exactly one '@', or a weight
        #             is not a valid number.
        # IndexError: a posting has no '#'.
        # Only these parsing failures are treated as "malformed line"; other
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return ("", [])

# Decode every line, then drop the ("", []) placeholders that parse_line
# returns for malformed lines.
parsed_lines = index_rdd.map(parse_line)
parsed_index = parsed_lines.filter(lambda entry: entry[0] != "")

In [8]:
# Read the whole query file. The encoding is given explicitly so the result
# does not depend on the platform's default locale.
with open(query_file_path, 'r', encoding='utf-8') as f:
    raw_query = f.read()

# Normalise the raw text into the list of terms to look up in the index.
query_words = preprocess_query(raw_query)


In [9]:
# Number of top-ranked documents to keep.
TOP_K = 10

# Filter index RDD: only keep words present in the query.
# A set gives constant-time membership tests; with the original list every
# index entry was compared against every query word.
query_terms = set(query_words)
filtered = parsed_index.filter(lambda x: x[0] in query_terms)

# Flatten postings: convert to (docID, freq) pairs
doc_freq_pairs = filtered.flatMap(lambda x: x[1])

# Aggregate scores per document
doc_scores = doc_freq_pairs.reduceByKey(lambda a, b: a + b)

# Take the TOP_K highest-scoring documents without sorting the whole RDD.
# The key orders by score descending and breaks ties by docID ascending, so
# documents with equal scores come out in a deterministic order.
top_docs = doc_scores.takeOrdered(TOP_K, key=lambda x: (-x[1], x[0]))

In [10]:
# saveAsTextFile refuses to write into a directory that already exists, so
# remove the output of a previous run first; otherwise re-running the
# notebook fails at this cell. (Only applies when output_dir is a local
# directory; for any other kind of path the check is simply False.)
if os.path.isdir(output_dir):
    shutil.rmtree(output_dir)

# Convert to RDD and save. A single partition keeps the ranked results
# together, in order, in one part file.
output_rdd = sc.parallelize(top_docs, 1)
output_rdd.saveAsTextFile(output_dir)