In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import math
import nltk
# Fetch the NLTK data packages used below. 'stopwords' backs
# stopwords.words('english'); 'punkt_tab' is presumably the tokenizer data
# that word_tokenize relies on (confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Toy corpus: four short documents, one string per document.
text = [
    "John likes soccer. John plays soccer every afternoon after school.",
    "Mary reads books. Mary reads books in the library every evening.",
    "The cat chased the cat around the yard until the cat tired.",
    "He likes football and she likes football more than any other sport."
]
# English stop-word list, stored as a set; remove_stopwords checks lowercased
# tokens against it.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """
    Tokenize a string and drop stop words and punctuation-only tokens.

    text: raw document string
    returns: list of the remaining tokens, in order, with their original
             casing preserved

    Stop words are matched case-insensitively against the module-level
    `stop_words` set. A token is treated as punctuation when every one of
    its characters is in string.punctuation.
    """
    kept = []
    for word in word_tokenize(text):
        if word.lower() in stop_words:
            continue
        # `word in string.punctuation` would be a substring test against one
        # long string, which lets multi-character punctuation tokens such as
        # "..." or "''" slip through; check character by character instead.
        if all(char in string.punctuation for char in word):
            continue
        kept.append(word)
    return kept

# Filter every raw document, then index the token lists by 1-based document id.
tokenized_doc = list(map(remove_stopwords, text))

docs = dict(enumerate(tokenized_doc, start=1))

tokenized_doc

[['John',
  'likes',
  'soccer',
  'John',
  'plays',
  'soccer',
  'every',
  'afternoon',
  'school'],
 ['Mary',
  'reads',
  'books',
  'Mary',
  'reads',
  'books',
  'library',
  'every',
  'evening'],
 ['cat', 'chased', 'cat', 'around', 'yard', 'cat', 'tired'],
 ['likes', 'football', 'likes', 'football', 'sport']]

In [None]:
def boolean_retrieval(query, docs):
    """
    Evaluate a flat boolean query against every document.

    The query is a whitespace-separated sequence of terms joined by AND / OR,
    where any term may be preceded by NOT (e.g. "apple AND NOT banana").
    Connectives are applied strictly left to right: there is no operator
    precedence and no parenthesis support. Matching is case-insensitive; both
    the query and the document tokens are lowercased before comparison.

    query: string like "apple AND NOT banana"
    docs: dict {doc_id: [tokenized words]}
    returns: set of matching doc_ids (empty set for an empty query)
    raises: ValueError if the query is malformed (two terms without a
            connective, a connective without a following term, or a NOT
            without an operand)
    """
    # Parse the query once into (connective, negated, term) clauses instead of
    # re-scanning the raw words for every document. The first clause has
    # connective None.
    clauses = []
    connective = None
    negated = False
    expecting_term = True
    for word in query.lower().split():
        if word == "not":
            if not expecting_term:
                raise ValueError("expected AND/OR before 'NOT' in query")
            negated = not negated  # "NOT NOT x" cancels out
        elif word in ("and", "or"):
            if expecting_term:
                raise ValueError(f"expected a term before '{word.upper()}' in query")
            connective = word
            expecting_term = True
        else:
            if not expecting_term:
                raise ValueError(f"expected AND/OR before term '{word}' in query")
            clauses.append((connective, negated, word))
            connective = None
            negated = False
            expecting_term = False

    # A dangling connective or NOT at the end of the query has no operand.
    if expecting_term and (clauses or negated):
        raise ValueError("query ends with an operator that has no operand")

    results = set()
    for doc_id, tokens in docs.items():
        # Lowercase the document side too, so it is comparable with the
        # lowercased query terms.
        vocabulary = {token.lower() for token in tokens}

        match = None
        for connective, negated, term in clauses:
            current = (term in vocabulary) != negated
            if connective is None:
                match = current
            elif connective == "and":
                match = match and current
            else:  # "or"
                match = match or current

        if match:
            results.add(doc_id)

    return results


In [None]:
# Sample queries covering AND, OR and AND NOT.
queries = [
    "school AND soccer",
    "books OR cat",
    "cat AND NOT books",
    "football AND likes"
]
# Run each query against the corpus and print the matching document ids,
# followed by a separator line.
for q in queries:
    print(f"Query: {q}")
    print("Matched Docs:", boolean_retrieval(q, docs))
    print("-" * 40)

Query: school AND soccer
Matched Docs: {1}
----------------------------------------
Query: books OR cat
Matched Docs: {2, 3}
----------------------------------------
Query: cat AND NOT books
Matched Docs: {3}
----------------------------------------
Query: football AND likes
Matched Docs: {4}
----------------------------------------


**KL Divergence**

In [None]:
def kl_divergence(p, q, epsilon=1e-10):
    """
    KL(P || Q) = sum_x P(x) * log(P(x)/Q(x))   (natural log)

    p: query model (dict of term probabilities)
    q: document model (dict of term probabilities)
    epsilon: probability substituted for a term that is missing from q or
             has probability 0 there, so the ratio stays finite
    returns: the divergence as a float; only terms present in p contribute
    """
    divergence = 0.0
    for term, p_val in p.items():
        if p_val == 0:
            # By convention 0 * log(0 / q) contributes nothing; evaluating it
            # directly would raise a math domain error.
            continue
        q_val = q.get(term, epsilon)  # smoothing if term missing
        if q_val == 0:
            # An explicit zero in q needs the same smoothing as a missing
            # term, otherwise the division below fails.
            q_val = epsilon
        divergence += p_val * math.log(p_val / q_val)
    return divergence

In [None]:
# Query terms for the KL-divergence ranking; note that "game" does not occur
# in any of the four documents, so it always falls back to the epsilon smoothing.
query = ["soccer", "football", "school", "game"]

In [None]:
def get_prob_vector(tokens):
    """
    Build a unigram probability model from a token sequence.

    tokens: iterable of terms
    returns: dict mapping each distinct term to its relative frequency
             (occurrences of the term divided by the total token count)
    """
    frequencies = Counter(tokens)
    n_tokens = sum(frequencies.values())
    probabilities = {}
    for term, occurrences in frequencies.items():
        probabilities[term] = occurrences / n_tokens
    return probabilities

In [None]:
# Turn the query and every filtered document into a term-probability model
query_model = get_prob_vector(query)
doc_models = list(map(get_prob_vector, tokenized_doc))
doc_models

[{'John': 0.2222222222222222,
  'likes': 0.1111111111111111,
  'soccer': 0.2222222222222222,
  'plays': 0.1111111111111111,
  'every': 0.1111111111111111,
  'afternoon': 0.1111111111111111,
  'school': 0.1111111111111111},
 {'Mary': 0.2222222222222222,
  'reads': 0.2222222222222222,
  'books': 0.2222222222222222,
  'library': 0.1111111111111111,
  'every': 0.1111111111111111,
  'evening': 0.1111111111111111},
 {'cat': 0.42857142857142855,
  'chased': 0.14285714285714285,
  'around': 0.14285714285714285,
  'yard': 0.14285714285714285,
  'tired': 0.14285714285714285},
 {'likes': 0.4, 'football': 0.4, 'sport': 0.2}]

In [None]:
# Score each document model against the query model, keeping the document's
# 0-based position alongside its divergence
kl_scores = [
    (position, kl_divergence(query_model, model))
    for position, model in enumerate(doc_models)
]

# Order by divergence, ascending (smaller = more relevant)
ranked_docs = sorted(kl_scores, key=lambda pair: pair[1])
ranked_docs

[(0, 11.05195659737846),
 (3, 16.11216651930399),
 (1, 21.639556568820566),
 (2, 21.639556568820566)]

In [None]:
# Print the query followed by the documents in ranked order (lowest divergence first).
print("Query:", query)
print("\nKL Divergence Scores (lower = more relevant):")
# ranked_docs stores 0-based positions into `text`; add 1 when printing so the
# labels line up with the 1-based document ids used elsewhere in the notebook.
for doc_id, score in ranked_docs:
    print(f"Doc {doc_id+1}: {score:.4f} -> {text[doc_id]}")

Query: ['soccer', 'football', 'school', 'game']

KL Divergence Scores (lower = more relevant):
Doc 1: 11.0520 -> John likes soccer. John plays soccer every afternoon after school.
Doc 4: 16.1122 -> He likes football and she likes football more than any other sport.
Doc 2: 21.6396 -> Mary reads books. Mary reads books in the library every evening.
Doc 3: 21.6396 -> The cat chased the cat around the yard until the cat tired.
