Name: Aparna Iyer

PRN: 22070126017

Division: AI-ML A1

Batch: 2022-2026

**Implementation of Count Vectorizer and TF-IDF Vectorizer from scratch**

a. Count Vectorizer

In [1]:
import numpy as np
from collections import defaultdict

# Step 1: Tokenization
def tokenize(text):
    """Return the whitespace-separated tokens of ``text`` in lower case."""
    lowered = text.lower()
    return lowered.split()

# Step 2: Build Vocabulary
def build_vocabulary(corpus):
    """Return the distinct tokens found in ``corpus`` as a sorted list.

    Every document is split with ``tokenize``; duplicates across and within
    documents collapse to a single vocabulary entry.
    """
    distinct_tokens = set().union(*map(tokenize, corpus))
    return sorted(distinct_tokens)

# Step 3: Create Document-Term Matrix
def count_vectorizer(corpus):
    """Build a document-term count matrix for ``corpus``.

    Returns a tuple ``(matrix, vocabulary)`` where ``vocabulary`` is the
    sorted list of distinct tokens and ``matrix[i][j]`` is the number of
    times ``vocabulary[j]`` occurs in ``corpus[i]``. The matrix is an
    integer NumPy array of shape ``(len(corpus), len(vocabulary))``.
    """
    vocabulary = build_vocabulary(corpus)
    column_of = dict(zip(vocabulary, range(len(vocabulary))))
    counts = np.zeros((len(corpus), len(vocabulary)), dtype=int)

    for row, text in enumerate(corpus):
        for word in tokenize(text):
            column = column_of.get(word)
            # Words without a column are skipped rather than raising.
            if column is not None:
                counts[row, column] += 1

    return counts, vocabulary

# Example usage
# Two short documents that share most of their words.
corpus = [
    "This is a sample document",
    "This document is another example document",
]
dtm, vocab = count_vectorizer(corpus)

# Show the sorted vocabulary, then one row of counts per document.
print("Vocabulary:", vocab)
print("Document-Term Matrix:\n", dtm)


Vocabulary: ['a', 'another', 'document', 'example', 'is', 'sample', 'this']
Document-Term Matrix:
 [[1 0 1 0 1 1 1]
 [0 1 2 1 1 0 1]]


b. TF-IDF Vectorizer

In [2]:
import numpy as np
from collections import defaultdict

# Step 1: Tokenization
def tokenize(text):
    """Lowercase ``text`` and split it on runs of whitespace."""
    normalized = text.lower()
    tokens = normalized.split()
    return tokens

# Step 2: Build Vocabulary
def build_vocabulary(corpus):
    """Collect every distinct token in ``corpus`` and return them sorted."""
    unique_tokens = {
        token
        for document in corpus
        for token in tokenize(document)
    }
    return sorted(unique_tokens)

# Step 3: Compute Term Frequency (TF)
def compute_tf(document, vocab_index):
    """Return the term-frequency vector of ``document``.

    ``vocab_index`` maps each vocabulary word to its column. Entry ``j`` of
    the result is the number of occurrences of word ``j`` divided by the
    total number of tokens in the document. Words missing from
    ``vocab_index`` are ignored, and a document with no tokens yields an
    all-zero vector.
    """
    words = tokenize(document)
    n_words = len(words)

    # Tally occurrences first so each ratio is computed in a single division.
    occurrences = {}
    for word in words:
        occurrences[word] = occurrences.get(word, 0) + 1

    frequencies = np.zeros(len(vocab_index))
    for word, n_occurrences in occurrences.items():
        column = vocab_index.get(word)
        if column is not None:
            frequencies[column] = n_occurrences / n_words
    return frequencies

# Step 4: Compute Inverse Document Frequency (IDF)
def compute_idf(corpus, vocab_index):
    """Return the smoothed inverse document frequency of every vocabulary word.

    For a word that appears in ``df`` of the ``N`` documents the value is
    ``ln((N + 1) / (df + 1)) + 1``, stored at the column given by
    ``vocab_index``.

    Args:
        corpus: Sequence of document strings.
        vocab_index: Mapping from word to its column in the result.

    Returns:
        A float NumPy array of length ``len(vocab_index)``.
    """
    num_docs = len(corpus)

    # Tokenize each document exactly once and count, per word, how many
    # documents contain it. The set removes repeats inside a document so a
    # word is counted at most once per document.
    document_frequency = defaultdict(int)
    for document in corpus:
        for token in set(tokenize(document)):
            document_frequency[token] += 1

    idf = np.zeros(len(vocab_index))
    for token, index in vocab_index.items():
        # .get() keeps lookups of unseen words from inserting new keys.
        count = document_frequency.get(token, 0)
        idf[index] = np.log((num_docs + 1) / (count + 1)) + 1
    return idf

# Step 5: Compute TF-IDF
def tfidf_vectorizer(corpus):
    """Build the TF-IDF matrix for ``corpus``.

    Returns a tuple ``(matrix, vocabulary)``. ``vocabulary`` is the sorted
    list of distinct tokens; row ``i`` of ``matrix`` is the element-wise
    product of the term-frequency vector of ``corpus[i]`` and the IDF vector
    computed over the whole corpus.
    """
    vocabulary = build_vocabulary(corpus)
    column_of = dict(zip(vocabulary, range(len(vocabulary))))

    # IDF depends only on the corpus, so compute it once up front.
    idf_weights = compute_idf(corpus, column_of)

    weighted = np.zeros((len(corpus), len(vocabulary)))
    for row, text in enumerate(corpus):
        term_frequencies = compute_tf(text, column_of)
        weighted[row] = term_frequencies * idf_weights

    return weighted, vocabulary

# Example usage
# Same two-document corpus as in the count-vectorizer example.
corpus = [
    "This is a sample document",
    "This document is another example document",
]
tfidf_matrix, vocab = tfidf_vectorizer(corpus)

# Show the sorted vocabulary, then one row of TF-IDF weights per document.
print("Vocabulary:", vocab)
print("TF-IDF Matrix:\n", tfidf_matrix)


Vocabulary: ['a', 'another', 'document', 'example', 'is', 'sample', 'this']
TF-IDF Matrix:
 [[0.28109302 0.         0.2        0.         0.2        0.28109302
  0.2       ]
 [0.         0.23424418 0.33333333 0.23424418 0.16666667 0.
  0.16666667]]


**Count Vectorizer and TF-IDF Vectorizer from the sklearn library**

a. CountVectorizer from sklearn

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Example corpus
corpus = ["This is a sample document", "This document is another example document"]

# Initialize the CountVectorizer.
# The instance gets its own name so that it does not rebind `count_vectorizer`,
# the from-scratch function defined earlier in this notebook; reusing that name
# would make any later call to the function fail with a TypeError.
sk_count_vectorizer = CountVectorizer()

# Fit and transform the corpus
X = sk_count_vectorizer.fit_transform(corpus)

# Get the vocabulary (feature names).
# With the default token_pattern only tokens of two or more alphanumeric
# characters are kept, which is why the single-letter word "a" does not appear
# here although the whitespace-based tokenizer above keeps it.
vocabulary = sk_count_vectorizer.get_feature_names_out()

# Convert the sparse result to a dense array for better readability
document_term_matrix = X.toarray()

print("Vocabulary:", vocabulary)
print("Document-Term Matrix:\n", document_term_matrix)


Vocabulary: ['another' 'document' 'example' 'is' 'sample' 'this']
Document-Term Matrix:
 [[0 1 0 1 1 1]
 [1 2 1 1 0 1]]


b. TfidfVectorizer from sklearn

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example corpus
corpus = ["This is a sample document", "This document is another example document"]

# Initialize the TfidfVectorizer.
# The instance gets its own name so that it does not rebind `tfidf_vectorizer`,
# the from-scratch function defined earlier in this notebook; reusing that name
# would make any later call to the function fail with a TypeError.
sk_tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus.
# The values differ from the from-scratch matrix because, with default
# settings, sklearn uses raw term counts as TF (not counts divided by document
# length), L2-normalizes every row (norm="l2"), and drops single-character
# tokens such as "a". The smoothed IDF formula, ln((1 + n) / (1 + df)) + 1,
# is the same in both.
X = sk_tfidf_vectorizer.fit_transform(corpus)

# Get the vocabulary (feature names)
vocabulary = sk_tfidf_vectorizer.get_feature_names_out()

# Convert the sparse result to a dense array for better readability
tfidf_matrix = X.toarray()

print("Vocabulary:", vocabulary)
print("TF-IDF Matrix:\n", tfidf_matrix)


Vocabulary: ['another' 'document' 'example' 'is' 'sample' 'this']
TF-IDF Matrix:
 [[0.         0.44832087 0.         0.44832087 0.63009934 0.44832087]
 [0.44554752 0.63402146 0.44554752 0.31701073 0.         0.31701073]]
