### IDF: Measures how rare a word is across all documents. Rare words are more important.

In [1]:
import math 
from collections import Counter

def calculate_term_frequency(document):
    # Convert document to lowercase and split into words
    words = document.lower().split()
    # Count total words
    total_words = len(words)
    # Count frequency of each word
    words_count = Counter(words)
    # Calculate term frequency for each word
    return {word: count/total_words for word, count in words_count.items()}

def calculate_idf(documents):
    # Total number of documents
    N = len(documents)
    # Count documents containing each word
    word_doc_count = {}
    for doc in documents:
        unique_words = set(doc.lower().split())
        for word in unique_words:
            word_doc_count[word] = word_doc_count.get(word, 0) +1 
    # calculate the IDF for each word 
    return {word: math.log(N/count) for word, count in word_doc_count.items()}


def calculate_tfidf(documents):
    # Calculate IDF scores
    idf_scores = calculate_idf(documents=documents)
    # Calculate TF-IDF for each document
    tfidf_documents = []

    for doc in documents:
        # calculate the term frequency for current document
        tf_scores = calculate_term_frequency(doc)
        # Calcuate TF-IDF for each word 
        tfidf_score = {
            word: tf * idf_scores[word]
            for word, tf in tf_scores.items()
        }
        tfidf_documents.append(idf_scores)
    
    return tfidf_documents


# Example usage
if __name__ == "__main__":
    # Sample documents
    documents = [
        "The cat sat on the mat",
        "The dog chased the cat",
        "The mat was sitting there"
    ]
    
    # Calculate TF-IDF scores
    tfidf_scores = calculate_tfidf(documents)
    
    # Print results
    for i, doc_scores in enumerate(tfidf_scores):
        print(f"\nDocument {i + 1} TF-IDF scores:")
        # Sort words by TF-IDF score in descending order
        sorted_words = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            print(f"{word}: {score:.4f}")


Document 1 TF-IDF scores:
sat: 1.0986
on: 1.0986
chased: 1.0986
dog: 1.0986
was: 1.0986
there: 1.0986
sitting: 1.0986
cat: 0.4055
mat: 0.4055
the: 0.0000

Document 2 TF-IDF scores:
sat: 1.0986
on: 1.0986
chased: 1.0986
dog: 1.0986
was: 1.0986
there: 1.0986
sitting: 1.0986
cat: 0.4055
mat: 0.4055
the: 0.0000

Document 3 TF-IDF scores:
sat: 1.0986
on: 1.0986
chased: 1.0986
dog: 1.0986
was: 1.0986
there: 1.0986
sitting: 1.0986
cat: 0.4055
mat: 0.4055
the: 0.0000


In [2]:
# Import libraries we need
import numpy as np  # For math calculations
import pandas as pd  # For creating a table to display results
from collections import Counter  # To count words easily
import math  # For logarithm function

# Our sample text documents (like short sentences)
text_documents = [
    "people watch campusx",  # Document 1
    "campusx watch campusx",  # Document 2
    "people write comment",   # Document 3
    "campusx write comment"   # Document 4
]

# Step 1: Create a list of all unique words (vocabulary)
def build_word_list(documents):
    # Create an empty set to store unique words (sets don't allow duplicates)
    unique_words = set()
    
    # Go through each document
    for doc in documents:
        # Split the document into words (e.g., "people watch" -> ["people", "watch"])
        words = doc.split()
        # Add each word to the set
        for word in words:
            unique_words.add(word)
    
    # Convert set to a sorted list for consistency
    word_list = sorted(list(unique_words))
    return word_list

# Create the vocabulary and print it
vocabulary = build_word_list(text_documents)
print("List of unique words (vocabulary):", vocabulary)

# Step 2: Calculate Term Frequency (TF)
# TF = (Number of times a word appears in a document) / (Total words in that document)
def calculate_term_frequency(document, word_list):
    # Split document into words
    words = document.split()
    # Count how many times each word appears
    word_counts = Counter(words)
    # Total number of words in the document
    total_words = len(words)
    
    # Create a dictionary to store TF values
    tf_scores = {}
    # Calculate TF for each word in the vocabulary
    for word in word_list:
        # If word isn't in document, Counter returns 0
        tf_scores[word] = word_counts[word] / total_words
    
    return tf_scores

# Calculate TF for each document
term_frequency_list = []
for doc_number, doc in enumerate(text_documents, 1):
    tf_scores = calculate_term_frequency(doc, vocabulary)
    term_frequency_list.append(tf_scores)
    print(f"Term Frequency for Document {doc_number}:", tf_scores)

# Step 3: Calculate Inverse Document Frequency (IDF)
# IDF = log(Total number of documents / Number of documents containing the word)
def calculate_inverse_document_frequency(documents, word_list):
    # Total number of documents
    total_documents = len(documents)
    # Create a dictionary to store IDF values
    idf_scores = {}
    
    # Calculate IDF for each word
    for word in word_list:
        # Count how many documents contain the word
        documents_with_word = 0
        for doc in documents:
            if word in doc.split():
                documents_with_word += 1
        # Calculate IDF using the formula
        idf_scores[word] = math.log(total_documents / documents_with_word)
    
    return idf_scores

# Calculate IDF and print it
idf_scores = calculate_inverse_document_frequency(text_documents, vocabulary)
print("\nInverse Document Frequency (IDF) scores:", idf_scores)

# Step 4: Calculate TF-IDF
# TF-IDF = Term Frequency * Inverse Document Frequency
def calculate_tf_idf(term_frequency_list, idf_scores):
    tf_idf_scores_list = []
    
    # Go through each document's TF scores
    for tf_scores in term_frequency_list:
        tf_idf_scores = {}
        # Multiply TF by IDF for each word
        for word, tf_value in tf_scores.items():
            tf_idf_scores[word] = tf_value * idf_scores[word]
        tf_idf_scores_list.append(tf_idf_scores)
    
    return tf_idf_scores_list

# Calculate TF-IDF
tf_idf_scores_list = calculate_tf_idf(term_frequency_list, idf_scores)

# Print TF-IDF for each document
for doc_number, tf_idf_scores in enumerate(tf_idf_scores_list, 1):
    print(f"\nTF-IDF scores for Document {doc_number}:", tf_idf_scores)

# Step 5: Create a matrix (table) of TF-IDF scores
def create_tf_idf_table(tf_idf_scores_list, word_list):
    tf_idf_table = []
    
    # Convert each document's TF-IDF scores into a list
    for tf_idf_scores in tf_idf_scores_list:
        document_vector = []
        for word in word_list:
            document_vector.append(tf_idf_scores[word])
        tf_idf_table.append(document_vector)
    
    return tf_idf_table

# Create the TF-IDF matrix
tf_idf_table = create_tf_idf_table(tf_idf_scores_list, vocabulary)

# Step 6: Display the TF-IDF matrix as a nice table
tf_idf_dataframe = pd.DataFrame(tf_idf_table, columns=vocabulary)
tf_idf_dataframe.index = [f"Document {i+1}" for i in range(len(text_documents))]
print("\nTF-IDF Table:")
print(tf_idf_dataframe)

# Step 7: Show each document as a vector (list of numbers)
print("\nDocument Vectors (TF-IDF scores):")
for doc_number, vector in enumerate(tf_idf_table, 1):
    print(f"Document {doc_number}: {vector}")

# Step 8: Calculate Cosine Similarity to compare documents
def calculate_cosine_similarity(vector1, vector2):
    # Calculate dot product (multiply corresponding numbers and sum them)
    dot_product = 0
    for value1, value2 in zip(vector1, vector2):
        dot_product += value1 * value2
    
    # Calculate the length (norm) of each vector
    norm_vector1 = math.sqrt(sum(value * value for value in vector1))
    norm_vector2 = math.sqrt(sum(value * value for value in vector2))
    
    # If either vector has zero length, similarity is 0
    if norm_vector1 == 0 or norm_vector2 == 0:
        return 0
    
    # Calculate cosine similarity
    return dot_product / (norm_vector1 * norm_vector2)

# Calculate and print similarities between all pairs of documents
print("\nDocument Similarities (Cosine Similarity):")
for i in range(len(text_documents)):
    for j in range(i + 1, len(text_documents)):
        similarity = calculate_cosine_similarity(tf_idf_table[i], tf_idf_table[j])
        print(f"Similarity between Document {i+1} and Document {j+1}: {similarity:.4f}")

List of unique words (vocabulary): ['campusx', 'comment', 'people', 'watch', 'write']
Term Frequency for Document 1: {'campusx': 0.3333333333333333, 'comment': 0.0, 'people': 0.3333333333333333, 'watch': 0.3333333333333333, 'write': 0.0}
Term Frequency for Document 2: {'campusx': 0.6666666666666666, 'comment': 0.0, 'people': 0.0, 'watch': 0.3333333333333333, 'write': 0.0}
Term Frequency for Document 3: {'campusx': 0.0, 'comment': 0.3333333333333333, 'people': 0.3333333333333333, 'watch': 0.0, 'write': 0.3333333333333333}
Term Frequency for Document 4: {'campusx': 0.3333333333333333, 'comment': 0.3333333333333333, 'people': 0.0, 'watch': 0.0, 'write': 0.3333333333333333}

Inverse Document Frequency (IDF) scores: {'campusx': 0.28768207245178085, 'comment': 0.6931471805599453, 'people': 0.6931471805599453, 'watch': 0.6931471805599453, 'write': 0.6931471805599453}

TF-IDF scores for Document 1: {'campusx': 0.09589402415059362, 'comment': 0.0, 'people': 0.23104906018664842, 'watch': 0.23104