In [10]:
from collections import Counter, defaultdict
import math
import numpy as np

In [3]:
# Training dataset: each entry pairs a tokenised document with its class label.
# The pairs are unzipped into two parallel lists, `docs` and `labels`.
docs, labels = map(list, zip(
    (["john", "plays", "football"], "Sports"),
    (["library", "books", "read"], "Library"),
    (["mary", "likes", "tennis"], "Sports"),
    (["books", "novels", "library"], "Library"),
))

**Naive Bayes**

In [4]:
# Calculate priors and likelihoods
# Pool the tokens of every document into one bag of words per class.
class_docs = defaultdict(list)
for doc, label in zip(docs, labels):
    class_docs[label].extend(doc)

# Priors: P(c) = (number of documents labelled c) / (number of documents).
# Counter keeps labels in first-seen order, so the class order of `priors`
# (and therefore tie-breaking in anything that iterates it) is the same on
# every run, unlike iterating a set of strings.
priors = {c: count / len(labels) for c, count in Counter(labels).items()}

# Likelihoods with Laplace smoothing:
#   P(w | c) = (count of w in class c + 1) / (tokens in class c + |vocab|)
# Every vocabulary word gets an entry for every class, so no probability is 0.
likelihoods = {}
vocab = set(word for doc in docs for word in doc)
for c, words in class_docs.items():
    word_counts = Counter(words)
    total_words = len(words)
    likelihoods[c] = {
        w: (word_counts[w] + 1) / (total_words + len(vocab))
        for w in vocab
    }

In [7]:

# Predict
def predict_nb(doc):
    """Return the class with the highest Naive Bayes log-score for `doc`.

    The score of a class is log(prior) plus the sum of log(likelihood) over
    the words of `doc`. Words that have no entry in the class's likelihood
    table are skipped.

    Parameters:
        doc: iterable of word tokens.

    Returns:
        The key of `priors` with the maximal score; on a tie, the class that
        comes first in `priors`.
    """
    scores = {}
    for c in priors:
        class_likelihoods = likelihoods[c]
        score = math.log(priors[c])
        for w in doc:
            # Check the trained table itself, not the notebook-level `vocab`:
            # later cells rebind `vocab` (adding words such as "match" that
            # were never trained on), which would otherwise cause a KeyError.
            p = class_likelihoods.get(w)
            if p is not None:
                score += math.log(p)
        scores[c] = score
    return max(scores, key=scores.get)

# Demo: classify two sample documents and print one line per prediction.
for caption, tokens in (
    ("Doc: ['football', 'match'] ->", ["football", "match"]),
    ("Doc: ['read', 'books'] ->", ["read", "books"]),
):
    print(caption, predict_nb(tokens))

Doc: ['football', 'match'] -> Sports
Doc: ['read', 'books'] -> Library


**Decision Tree**

In [9]:
def decision_tree_predict(doc):
    """Classify `doc` with a fixed two-level keyword rule.

    The sports keywords are tested first, so a document containing keywords
    from both groups is labelled "Sports".

    Parameters:
        doc: container of word tokens (anything supporting `in`).

    Returns:
        "Sports" if `doc` contains "football" or "tennis"; otherwise
        "Library" if it contains "books" or "library"; otherwise "Unknown".
    """
    def contains_any(*keywords):
        # Membership is tested in the given order and stops at the first hit.
        return any(keyword in doc for keyword in keywords)

    if contains_any("football", "tennis"):
        return "Sports"
    if contains_any("books", "library"):
        return "Library"
    return "Unknown"

# Demo: run the keyword rule on two sample documents.
for caption, tokens in (
    ("Doc: ['football', 'game'] ->", ["football", "game"]),
    ("Doc: ['books', 'library'] ->", ["books", "library"]),
):
    print(caption, decision_tree_predict(tokens))


Doc: ['football', 'game'] -> Sports
Doc: ['books', 'library'] -> Library


**KNN**

In [11]:
# Build vocabulary
# Sorting fixes the column order of the document vectors. Iterating a bare
# set of strings yields a different order from one kernel start to the next,
# which made the vectors in `X` non-reproducible.
vocab = sorted(set(word for doc in docs for word in doc))
# Map each vocabulary word to its column position.
word_index = {w: i for i, w in enumerate(vocab)}

# Vectorize docs
def vectorize(doc):
    """Turn a token list into a term-count vector over `vocab`.

    Parameters:
        doc: iterable of word tokens.

    Returns:
        A 1-D numpy array with one entry per word of `vocab`, holding how
        often that word occurs in `doc`. Tokens missing from `word_index`
        are ignored.
    """
    counts = [0 for _ in vocab]
    for token in doc:
        position = word_index.get(token)
        if position is not None:
            counts[position] += 1
    return np.array(counts)

# One term-count vector per training document, in the same order as `docs`.
X = list(map(vectorize, docs))

def cosine_sim(v1, v2):
    """Cosine similarity of two vectors, guarded against zero norms.

    A small constant (1e-6) is added to the product of the norms, so an
    all-zero vector gives 0.0 instead of a division by zero. As a result the
    similarity of a vector with itself is slightly below 1.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / (norm_product + 1e-6)

In [13]:
def knn_predict(query, k=3):
    """Predict the label of `query` by majority vote of its k nearest documents.

    Nearness is the cosine similarity between the vectorised query and each
    training vector in `X`.

    Parameters:
        query: iterable of word tokens.
        k: number of most similar training documents that vote (default 3).

    Returns:
        The most frequent label among the k most similar documents. When
        several labels are equally frequent, the one whose first voter ranks
        highest wins.
    """
    q_vec = vectorize(query)
    sims = [(cosine_sim(q_vec, x), label) for x, label in zip(X, labels)]
    # Rank on similarity only. Sorting the raw (similarity, label) tuples
    # would break similarity ties by label text, biasing the vote towards
    # alphabetically later labels and failing on non-comparable labels.
    # The sort is stable, so tied documents keep their training order.
    sims.sort(key=lambda pair: pair[0], reverse=True)
    top_k = [label for _, label in sims[:k]]
    return Counter(top_k).most_common(1)[0][0]

# Demo: classify two sample documents with the default k.
for caption, tokens in (
    ("Doc: ['tennis', 'game'] ->", ["tennis", "game"]),
    ("Doc: ['novels', 'library'] ->", ["novels", "library"]),
):
    print(caption, knn_predict(tokens))


Doc: ['tennis', 'game'] -> Sports
Doc: ['novels', 'library'] -> Library


**Rocchio**

In [21]:
# Query to be refined with relevance feedback.
query = ["football", "match"]

# Feedback sets, given as positions in `docs`:
# docs[0] and docs[2] are judged relevant, docs[1] and docs[3] non-relevant.
relevant_ids = [0, 2]
non_relevant_ids = [1, 3]

# Vocabulary = every corpus term plus every query term, in sorted order.
# NOTE: this rebinds the notebook-level name `vocab` used by earlier sections.
vocab = sorted({term for tokens in docs for term in tokens}.union(query))
# Column position of each vocabulary term.
idx = dict(zip(vocab, range(len(vocab))))

print("Vocab:", vocab)

Vocab: ['books', 'football', 'john', 'library', 'likes', 'mary', 'match', 'novels', 'plays', 'read', 'tennis']


In [22]:
# Vectorize
def vectorize(words):
    """Turn a token list into a float term-count vector over `vocab`.

    NOTE: this redefines the `vectorize` declared in the KNN section; once
    this cell has run, that name refers to this version.

    Parameters:
        words: iterable of word tokens; each must be a key of `idx`.

    Returns:
        A 1-D float numpy array with one entry per word of `vocab`, holding
        the number of occurrences of that word in `words`.

    Raises:
        KeyError: if a token has no entry in `idx`.
    """
    vec = np.zeros(len(vocab))
    term_counts = Counter(words)
    for term in term_counts:
        vec[idx[term]] = term_counts[term]
    return vec

# Project the corpus and the query into the Rocchio term space.
# NOTE: `X` is rebound here; it no longer holds the KNN section's vectors.
X = list(map(vectorize, docs))
Q = vectorize(query)

In [24]:
# Rocchio weights: alpha scales the original query, beta the centroid of the
# relevant documents, gamma the centroid of the non-relevant documents.
alpha, beta, gamma = 1.0, 0.75, 0.15

# Centroid = element-wise mean of the document vectors in each feedback set.
relevant_vec = np.array([X[doc_id] for doc_id in relevant_ids]).mean(axis=0)
non_relevant_vec = np.array([X[doc_id] for doc_id in non_relevant_ids]).mean(axis=0)

# Move the query towards the relevant centroid and away from the other one.
Q_new = alpha * Q + beta * relevant_vec - gamma * non_relevant_vec

# Five highest-weighted terms of the updated query, best first.
# NOTE(review): several terms can share exactly the same weight; their relative
# order then comes from argsort's default sort, which is presumably not
# guaranteed to be stable. Confirm before relying on the order of tied terms.
top_idx = np.argsort(Q_new)[::-1][:5]
expanded_terms = [vocab[position] for position in top_idx]

for caption, value in (
    ("\nOriginal Query:", query),
    ("Original Vector:", Q),
    ("New Query Vector:", np.round(Q_new, 2)),
    ("Expanded Query Terms:", expanded_terms),
):
    print(caption, value)



Original Query: ['football', 'match']
Original Vector: [0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
New Query Vector: [-0.15  1.38  0.38 -0.15  0.38  0.38  1.   -0.08  0.38 -0.08  0.38]
Expanded Query Terms: ['football', 'match', 'tennis', 'john', 'plays']
