In [1]:
import numpy as np

In [7]:
# term frequency, add type hints and doc string
def tf(term: str, document: str) -> int:
    """Computes the raw term frequency of a term in a document.

    The frequency is the number of non-overlapping occurrences of the term
    as a substring of the document (``str.count`` semantics), so matching is
    case-sensitive and is not restricted to whole words.

    params:
        term: the term to count
        document: the document to search in
    returns:
        the number of occurrences of the term; 0 when the term is empty
    """
    # str.count("") returns len(document) + 1, which is meaningless as a
    # frequency, so an empty term is reported as never occurring.
    if not term:
        return 0
    return document.count(term)

# document frequency, add type hints and doc string
def df(term: str, corpus: list) -> float:
    """Computes the document frequency of a term in a corpus.

    params:
        term: the term to look for
        corpus: the collection of documents to scan
    returns:
        the number of documents for which ``term in document`` is true
    """
    # Tally matching documents directly instead of materialising a list.
    return sum(1 for document in corpus if term in document)

# inverse document frequency, add type hints and doc string
"""
The IDF quantifies how rare or common a term is across the collection of documents. 
It is used to weigh the importance of terms in information retrieval and text analysis tasks, 
such as search engines or document ranking.
"""
def idf(term: str, corpus: list) -> float:
    """Computes the inverse document frequency of a term in a corpus.

    The value is ``log(N / df)``, where ``N`` is the number of documents in
    the corpus and ``df`` is the number of documents containing the term.
    A higher idf means the term is rarer across the corpus.

    params:
        term: the term to compute the idf for
        corpus: the collection of documents
    returns:
        the natural-log idf of the term; 0.0 when no document contains the
        term (which includes the empty-corpus case)
    """
    num_docs_with_term = df(term, corpus)
    # Without this guard the division below raises ZeroDivisionError for a
    # term that appears nowhere (or for an empty corpus). Such a term carries
    # no weight, so it is reported as 0.0.
    if num_docs_with_term == 0:
        return 0.0
    # float() turns numpy's scalar into the plain float the signature promises.
    return float(np.log(len(corpus) / num_docs_with_term))

# term frequency-inverse document frequency, add type hints and doc string
"""
If, for a given term and document, both the term frequency and the inverse document frequency are high,
then the term is common in the document but rare in the corpus, so its tf-idf is high.
Conversely, if the term frequency is low and the inverse document frequency is low,
then the term is rare in the document but common in the corpus, so its tf-idf is low.
"""
def tf_idf(term: str, document: str, corpus: list) -> float:
    """Computes the tf-idf weight of a term for one document of a corpus.

    params:
        term: the term to weigh
        document: the document in which the term frequency is measured
        corpus: the collection of documents used for the inverse document
            frequency
    returns:
        the term frequency multiplied by the inverse document frequency
    """
    term_frequency = tf(term, document)
    inverse_document_frequency = idf(term, corpus)
    return term_frequency * inverse_document_frequency

# Smoke test on a three-document toy corpus.
corpus = ["I like apples", "I hate apples", "I love the apples"]

# "apples" occurs once in the first document and appears in all three
# documents, so its idf is log(3 / 3) = 0.
print(tf(term="apples", document=corpus[0]))
print(df(term="apples", corpus=corpus))
print(idf(term="apples", corpus=corpus))
# "the" occurs once in the third document and in no other document, so its
# tf-idf is 1 * log(3 / 1).
print(tf_idf(term="the", document=corpus[2], corpus=corpus))

1
3
0.0
1.0986122886681098
