TF-IDF and BM25 implemenations

In [5]:
import math
import numpy as np

In [10]:
doc1 = 'hi there I wanted to say tha I am a good person who is very nice'
doc2 = 'there is a big bear in the forest'
doc3 = 'there is a big wolf in the jungle'

In [11]:
q = 'bear'
docs = [doc1, doc2, doc3]

def tf_idf(query, doc):
    words = doc.split()
    tf = words.count(query) / len(words)
    idf = math.log(len(docs) / sum([1 for doc in docs if query in doc]))
    return round(tf * idf, 3)


print(tf_idf(q, doc1)) 
print(tf_idf(q, doc2))
print(tf_idf('is', doc2))
print(tf_idf(q, doc3))

0.0
0.137
0.0
0.0


In [14]:
vocab = set(doc1.split() + doc2.split() + doc3.split())
print(vocab)
print(len(vocab))

{'a', 'person', 'jungle', 'good', 'big', 'wolf', 'wanted', 'who', 'bear', 'say', 'forest', 'in', 'is', 'tha', 'to', 'nice', 'very', 'there', 'hi', 'am', 'I', 'the'}
22


In [15]:
vec_doc1 = np.array([tf_idf(word, doc1) for word in vocab])
vec_doc2 = np.array([tf_idf(word, doc2) for word in vocab])
vec_doc3 = np.array([tf_idf(word, doc3) for word in vocab])

print(vec_doc1, vec_doc1.shape)
print(vec_doc2, vec_doc2.shape)
print(vec_doc3, vec_doc3.shape)

[0.    0.069 0.    0.069 0.    0.    0.069 0.069 0.    0.069 0.    0.
 0.    0.069 0.069 0.069 0.069 0.    0.069 0.069 0.137 0.   ] (22,)
[0.    0.    0.    0.    0.051 0.    0.    0.    0.137 0.    0.137 0.051
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ] (22,)
[0.    0.    0.137 0.    0.051 0.137 0.    0.    0.    0.    0.    0.051
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ] (22,)


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Stack the document vectors into a matrix
tf_idf_matrix = np.vstack([vec_doc1, vec_doc2, vec_doc3])

# Compute the cosine similarities
cosine_similarities = cosine_similarity(tf_idf_matrix)

print(cosine_similarities)

[[1.         0.         0.        ]
 [0.         1.         0.12171268]
 [0.         0.12171268 1.        ]]


In [19]:
# BM25 parameters
k1 = 1.5
b = 0.75

# Compute average document length
avg_doc_len = sum(len(doc.split()) for doc in docs) / len(docs)

def bm25(query, doc, k1=k1, b=b):
    words = doc.split()
    doc_len = len(words)
    score = 0
    for term in query.split():
        tf = words.count(term) / doc_len
        idf = math.log((len(docs) - sum([1 for d in docs if term in d]) + 0.5) / (sum([1 for d in docs if term in d]) + 0.5) + 1)
        score += idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len))))
    return round(score, 3)

# Example queries
queries = ['bear', 'big forest', 'nice person']

# Compute BM25 scores for each query against each document
bm25_scores = {query: [bm25(query, doc) for doc in docs] for query in queries}

print(bm25_scores)

{'bear': [0.0, 0.228, 0.0], 'big forest': [0.0, 0.337, 0.109], 'nice person': [0.144, 0.0, 0.0]}
