In [10]:
import math

documents = [
    "Python functions are defined using the def keyword and can return values.",
    "Object oriented programming in Python uses classes and methods.",
    "Lists are mutable sequences that support indexing and slicing.",
    "Dictionaries store key value pairs with unique keys.",
    "Generators use yield to produce values lazily.",
    "Decorators wrap functions and modify behavior.",
    "Exceptions are handled using try and except blocks.",
    "Context managers work with the with statement.",
    "Asyncio enables asynchronous programming with coroutines.",
    "The Global Interpreter Lock affects multithreading in Python.",
    "NumPy provides fast multidimensional array operations.",
    "Pandas offers DataFrame structures for data analysis.",
    "Virtual environments isolate project dependencies.",
    "Machine learning workflows often use Python libraries.",
    "Deep learning frameworks include PyTorch and TensorFlow.",
    "Python supports dynamic typing and duck typing.",
    "Type hints improve readability and tooling support.",
    "Lambda functions create anonymous functions.",
    "List comprehensions provide concise list construction.",
    "Set operations include union intersection and difference.",
    "Python strings are immutable sequences of characters.",
    "File handling uses open read write and close operations.",
    "JSON parsing is common in Python applications.",
    "Web development frameworks include Django and Flask.",
    "APIs are frequently implemented using FastAPI.",
    "Concurrency can be achieved with threading or asyncio.",
    "Multiprocessing bypasses the Global Interpreter Lock.",
    "Python is widely used in data science and AI.",
    "Scientific computing relies heavily on NumPy.",
    "Visualization libraries include matplotlib and seaborn.",
    "Natural language processing uses tokenization and embeddings.",
    "BM25 is a ranking function used in information retrieval.",
    "TF IDF and BM25 are classical retrieval algorithms.",
    "Neural retrieval uses dense vector embeddings.",
    "Search engines rank documents based on relevance.",
    "Tokenization splits text into smaller units.",
    "Stopwords are often removed during preprocessing.",
    "Stemming reduces words to their base forms.",
    "Evaluation metrics include precision recall and MAP.",
    "Ranking quality depends on term frequency and IDF.",
    "Python loops include for loops and while loops.",
    "Conditional logic uses if elif and else statements.",
    "Modules help organize Python code.",
    "Packages contain multiple Python modules.",
    "Dependency management uses pip or poetry.",
    "Unit tests validate program correctness.",
    "Performance optimization may use Cython or Rust.",
    "Memory efficiency is improved with generators.",
    "Data pipelines often process large datasets.",
    "Logging helps monitor application behavior."
]

In [None]:
# Computing global stats

N = len(documents)
avgl = sum(len(d.split(" ")) for d in documents) / N
k = 1.2
b = 0.75

print("Average document length: ", avgl)
print("Number of documents: ", N)
print("k: ", k)
print("b: ", b)

all_words = [[word for word in d.split(" ")] for d in documents]
all_words = [word for sublist in all_words for word in sublist]
all_words = list(set(all_words))

# List containing all words with space and dot removed
all_words = [word.replace(".", "") for word in all_words if word != " "]

# Compute index to see which document contains which word
word_to_doc_index = dict()

for word in all_words:
    for document_id in range(len(documents)):
        d = documents[document_id]

        if word in d:
            word_to_doc_id_set = word_to_doc_index.get(word, set())
            word_to_doc_id_set.add(document_id)
            word_to_doc_index[word] = word_to_doc_id_set

Average document length:  6.94
Number of documents:  50
k:  1.2
b:  0.75


In [33]:
def word_to_occurence_count(word):
    return len(word_to_doc_index.get(word, set()))


def inverse_document_frequency(word):
    word_count = word_to_occurence_count(word)
    numerator = N - word_count + 0.5
    denominator = word_count + 0.5

    return math.log(numerator / denominator + 1)


def frequency_in_document(word, document):
    return [d.replace(".", "") for d in document.split(" ")].count(word.replace(".", ""))


def score(query, document):
    split_query = query.split(" ")

    def _individual_score(q):
        f = frequency_in_document(q, document)
        idf = inverse_document_frequency(q)

        return f * idf / (f + k * (1 - b + b * (len(document.split(" ")) / avgl)))


    individual_scores = [_individual_score(q) for q in split_query if q in document]

    return (k + 1) * sum(individual_scores)


def search(query):
    candidate_tuples = [(d, score(query, d)) for d in documents]

    return [x[0] for x in sorted(candidate_tuples, key=lambda x: x[1], reverse=True)]


In [34]:
print(search("python list"))

['List comprehensions provide concise list construction.', 'Python functions are defined using the def keyword and can return values.', 'Object oriented programming in Python uses classes and methods.', 'Lists are mutable sequences that support indexing and slicing.', 'Dictionaries store key value pairs with unique keys.', 'Generators use yield to produce values lazily.', 'Decorators wrap functions and modify behavior.', 'Exceptions are handled using try and except blocks.', 'Context managers work with the with statement.', 'Asyncio enables asynchronous programming with coroutines.', 'The Global Interpreter Lock affects multithreading in Python.', 'NumPy provides fast multidimensional array operations.', 'Pandas offers DataFrame structures for data analysis.', 'Virtual environments isolate project dependencies.', 'Machine learning workflows often use Python libraries.', 'Deep learning frameworks include PyTorch and TensorFlow.', 'Python supports dynamic typing and duck typing.', 'Type 