In [8]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = ["The cat sat on the mat", "The dog jumped over the fence", "The cat and the dog are friends"]

# Step 1: Tokenize the documents
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]

# Step 2: Compute TF (Term Frequency) - no. of times a word occurs / total number of words
def compute_tf(tokens):
    tf = {} # word: term frequency
    total_words = len(tokens)
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1 / total_words
    return tf

tf_matrices = [compute_tf(doc) for doc in tokenized_docs]

# Step 3: Compute IDF (Inverse Document Frequency) - log (total no. of documents / no. of docs in which term appears)
def compute_idf(docs):
    idf = {} # word: IDF Scores
    total_docs = len(docs)
    all_words = set(word for doc in docs for word in doc)
    for word in all_words:
        doc_count = sum(1 for doc in docs if word in doc)
        idf[word] = np.log(total_docs / (1 + doc_count))
    return idf

idf = compute_idf(tokenized_docs)

# Step 4: Compute TF-IDF
def compute_tfidf(tf_matrix, idf):
    tfidf = {} # word: TF-IDF Scores
    for word, tf in tf_matrix.items():
        tfidf[word] = tf*idf[word]
    return tfidf

tfidf_matrices = [compute_tfidf(tf, idf) for tf in tf_matrices]

# Step 5: Convert TF-IDF into matrix format (row represent documents and columns represents words)
def vectorize(documents, idf):
    vocab = list(idf.keys()) # no. of unique words
    matrix = np.zeros((len(documents), len(vocab)))
    for i, doc in enumerate(documents):
        for j, word in enumerate(vocab):
            matrix[i, j] = tfidf_matrices[i].get(word, 0)
    return matrix

tfidf_matrix = vectorize(documents, idf)

print("TF-IDF Matrix (Using NLTK and Python):\n", tfidf_matrix)

TF-IDF Matrix (Using NLTK and Python):
 [[ 0.          0.          0.06757752  0.06757752 -0.09589402  0.06757752
   0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.         -0.09589402  0.
   0.06757752  0.          0.          0.          0.06757752  0.06757752]
 [ 0.          0.05792359  0.          0.         -0.08219488  0.
   0.          0.05792359  0.05792359  0.          0.          0.        ]]


In [9]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The cat sat on the mat",
    "The dog jumped over the fence",
    "The cat and the dog are friends"
]

# Step 1: Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer = nltk.word_tokenize)

# Step 2: Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert the sparse matrix to dense array for better visibility
dense_tfidf_matrix = tfidf_matrix.toarray()

print("TF-IDF Matrix (Using TfidfVectorizer):\n", dense_tfidf_matrix, "\n")

TF-IDF Matrix (Using TfidfVectorizer):
 [[0.         0.         0.34101521 0.         0.         0.
  0.         0.44839402 0.44839402 0.         0.44839402 0.52965746]
 [0.         0.         0.         0.34101521 0.44839402 0.
  0.44839402 0.         0.         0.44839402 0.         0.52965746]
 [0.42439575 0.42439575 0.32276391 0.32276391 0.         0.42439575
  0.         0.         0.         0.         0.         0.50130994]] 





In [10]:
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The cat sat on the mat",
    "The dog jumped over the fence",
    "The cat and the dog are friends"
]

# Initialize spaCy
nlp = spacy.load("en_core_web_sm")

# Tokenize and lemmatize the documents using spaCy
tokenized_docs = []
for doc in documents:
    tokens = [token.lemma_ for token in nlp(doc)]
    tokenized_docs.append(" ".join(tokens))

# Use TfidfVectorizer for TF-IDF matrix extraction
vectorizer = TfidfVectorizer()

# Fit and transform the documents to compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(tokenized_docs)

# Convert the sparse matrix to dense array for better visibility
dense_tfidf_matrix = tfidf_matrix.toarray()

print("TF-IDF Matrix (Using spaCy and scikit-learn):\n", dense_tfidf_matrix)

TF-IDF Matrix (Using spaCy and scikit-learn):
 [[0.         0.         0.34101521 0.         0.         0.
  0.         0.44839402 0.44839402 0.         0.44839402 0.52965746]
 [0.         0.         0.         0.34101521 0.44839402 0.
  0.44839402 0.         0.         0.44839402 0.         0.52965746]
 [0.42439575 0.42439575 0.32276391 0.32276391 0.         0.42439575
  0.         0.         0.         0.         0.         0.50130994]]
