In [5]:
import math
from collections import Counter

class SimpleVector:
    def __init__(self, v_max=10, lowercase=True):
        self.v_max = v_max
        self.lowercase = lowercase
        self.vocabulary = []
        self.idf_scores = {}
        
    def fit(self, documents):
        
        # lowercase the documents 
        if self.lowercase:
            documents = [doc.lower() for doc in documents]
            
        # split documents into words
        tokenized_docs = [doc.split() for doc in documents]

        # count word frequencies across all documents
        word_counts = Counter(word for doc in tokenized_docs for word in doc)
        
        # build vocabulary (sorted by frequency, then alphabetically)
        self.vocabulary = [word for word, _ in sorted(word_counts.items(), key=lambda x: (-x[1], x[0]))]
        
        # limit vocabulary to v_max most frequent terms
        if self.v_max:
            self.vocabulary = self.vocabulary[:self.v_max]
            
            
        # calculate IDF scores
        num_docs = len(documents)
        doc_freq = {word: 0 for word in self.vocabulary}

        for doc in tokenized_docs:
            for word in set(doc):  # Count each word only once per document
                if word in self.vocabulary:
                    doc_freq[word] += 1
                    
        self.idf_scores = {
            word: math.log(1 + num_docs / (1 + freq)) + 1
            for word, freq in doc_freq.items()
        }
        
    def transform(self, documents):
        
        
        if self.lowercase:
            documents = [doc.lower() for doc in documents]

        # split documents into words
        tokenized_docs = [doc.split() for doc in documents]

        # create TF-IDF vectors
        tfidf_vectors = []
        for doc in tokenized_docs:
            word_counts = Counter(doc)
            doc_length = len(doc)
            tfidf_vector = []

            for word in self.vocabulary:
                tf = word_counts[word] / doc_length if doc_length > 0 else 0
                idf = self.idf_scores.get(word, 0)
                tfidf_vector.append(tf * idf)

            tfidf_vectors.append(tfidf_vector)

        return tfidf_vectors

In [6]:
corpus = [
          'The cat is playing with a fat baby',
          'The fat baby is fight a pitbull',
          'The pitbull is bites the cat and is flying',
          'A pilot is flying with a fat baby along with a cat and a pitbull'
        ]

# initialize the vectorizer
vectorizer = SimpleVector(v_max=10, lowercase=True)

# fit on the corpus
vectorizer.fit(corpus)

# transform the corpus into TF-IDF vectors
tfidf_vectors = vectorizer.transform(corpus)

# print the vocabulary and TF-IDF vectors
print("Vocabulary:", vectorizer.vocabulary)
print("TF-IDF Vectors:")
for vector in tfidf_vectors:
    print(vector)

Vocabulary: ['a', 'is', 'the', 'baby', 'cat', 'fat', 'pitbull', 'with', 'and', 'flying']
TF-IDF Vectors:
[0.21164339756999317, 0.19847333311276488, 0.21164339756999317, 0.21164339756999317, 0.21164339756999317, 0.21164339756999317, 0.0, 0.23091223254840043, 0.0, 0.0]
[0.24187816865142076, 0.22682666641458843, 0.24187816865142076, 0.24187816865142076, 0.0, 0.24187816865142076, 0.24187816865142076, 0.0, 0.0, 0.0]
[0.0, 0.35284148108935975, 0.3762549290133212, 0.0, 0.1881274645066606, 0.0, 0.1881274645066606, 0.0, 0.20525531782080036, 0.20525531782080036]
[0.45150591481598545, 0.10585244432680793, 0.0, 0.11287647870399636, 0.11287647870399636, 0.11287647870399636, 0.11287647870399636, 0.24630638138496044, 0.12315319069248022, 0.12315319069248022]


In [7]:
# now with lowercase False

import math
from collections import Counter

class SimpleVector:
    def __init__(self, v_max=10, lowercase=False):
        self.v_max = v_max
        self.lowercase = lowercase
        self.vocabulary = []
        self.idf_scores = {}
        
    def fit(self, documents):
        
        # lowercase the documents 
        if self.lowercase:
            documents = [doc.lower() for doc in documents]
            
        # split documents into words
        tokenized_docs = [doc.split() for doc in documents]

        # count word frequencies across all documents
        word_counts = Counter(word for doc in tokenized_docs for word in doc)
        
        # build vocabulary (sorted by frequency, then alphabetically)
        self.vocabulary = [word for word, _ in sorted(word_counts.items(), key=lambda x: (-x[1], x[0]))]
        
        # limit vocabulary to v_max most frequent terms
        if self.v_max:
            self.vocabulary = self.vocabulary[:self.v_max]
            
            
        # calculate IDF scores
        num_docs = len(documents)
        doc_freq = {word: 0 for word in self.vocabulary}

        for doc in tokenized_docs:
            for word in set(doc):  # Count each word only once per document
                if word in self.vocabulary:
                    doc_freq[word] += 1
                    
        self.idf_scores = {
            word: math.log(1 + num_docs / (1 + freq)) + 1
            for word, freq in doc_freq.items()
        }
        
    def transform(self, documents):
        
        
        if self.lowercase:
            documents = [doc.lower() for doc in documents]

        # split documents into words
        tokenized_docs = [doc.split() for doc in documents]

        # create TF-IDF vectors
        tfidf_vectors = []
        for doc in tokenized_docs:
            word_counts = Counter(doc)
            doc_length = len(doc)
            tfidf_vector = []

            for word in self.vocabulary:
                tf = word_counts[word] / doc_length if doc_length > 0 else 0
                idf = self.idf_scores.get(word, 0)
                tfidf_vector.append(tf * idf)

            tfidf_vectors.append(tfidf_vector)

        return tfidf_vectors

In [8]:
corpus =   [
          'The cat is playing with a fat baby',
          'The fat baby is fight a pitbull',
          'The pitbull is bites the cat and is flying',
          'A pilot is flying with a fat baby along with a cat and a pitbull'
        ]
# initialize the vectorizer
vectorizer = SimpleVector(v_max=10, lowercase=False)

# fit on the corpus
vectorizer.fit(corpus)

# transform the corpus into TF-IDF vectors
tfidf_vectors = vectorizer.transform(corpus)

# print the vocabulary and TF-IDF vectors
print("Vocabulary:", vectorizer.vocabulary)
print("TF-IDF Vectors:")
for vector in tfidf_vectors:
    print(vector)

Vocabulary: ['a', 'is', 'The', 'baby', 'cat', 'fat', 'pitbull', 'with', 'and', 'flying']
TF-IDF Vectors:
[0.21164339756999317, 0.19847333311276488, 0.21164339756999317, 0.21164339756999317, 0.21164339756999317, 0.21164339756999317, 0.0, 0.23091223254840043, 0.0, 0.0]
[0.24187816865142076, 0.22682666641458843, 0.24187816865142076, 0.24187816865142076, 0.0, 0.24187816865142076, 0.24187816865142076, 0.0, 0.0, 0.0]
[0.0, 0.35284148108935975, 0.1881274645066606, 0.0, 0.1881274645066606, 0.0, 0.1881274645066606, 0.0, 0.20525531782080036, 0.20525531782080036]
[0.3386294361119891, 0.10585244432680793, 0.0, 0.11287647870399636, 0.11287647870399636, 0.11287647870399636, 0.11287647870399636, 0.24630638138496044, 0.12315319069248022, 0.12315319069248022]


In [9]:
# part 2 extension with word removal

class StopWordVector(SimpleVector):

    def __init__(self, v_max=10, lowercase=True, remove_stopwords=False, stopwords=None):
        """
        Initialize with optional stop-word removal.
        """
        super().__init__(v_max, lowercase)
        self.remove_stopwords = remove_stopwords
        self.stopwords = stopwords or ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as",
    "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else",
    "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how",
    "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
    "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other",
    "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the",
    "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we",
    "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you",
    "your"]

    def fit(self, documents):
        
        if self.lowercase:
            documents = [doc.lower() for doc in documents]

        # tokenize remove stop words
        tokenized_docs = [doc.split() for doc in documents]
        if self.remove_stopwords:
            tokenized_docs = [[word for word in doc if word not in self.stopwords] for doc in tokenized_docs]

        # build vocabulary and calculate frequencies
        all_words = [word for doc in tokenized_docs for word in doc]
        word_counts = Counter(all_words)

        self.vocabulary = sorted(word_counts, key=lambda x: (-word_counts[x], x))
        if self.v_max:
            self.vocabulary = self.vocabulary[:self.v_max]

        # calculate IDF scores
        num_docs = len(documents)
        doc_freq = {word: sum(1 for doc in tokenized_docs if word in doc) for word in self.vocabulary}
        self.idf_scores = {word: math.log(1 + num_docs / (1 + freq)) + 1 for word, freq in doc_freq.items()}

    

In [10]:
corpus =   [
          'The cat is playing with a fat baby',
          'The fat baby is fight a pitbull',
          'The pitbull is bites the cat and is flying',
          'A pilot is flying with a fat baby along with a cat and a pitbull'
        ]

vectorizer = StopWordVector(v_max=10, lowercase=True, remove_stopwords=True)
vectorizer.fit(corpus)
tfidf_vectors = vectorizer.transform(corpus)

print("Vocabulary:", vectorizer.vocabulary)
print("TF-IDF Vectors:")
for vector in tfidf_vectors:
    print(vector)

Vocabulary: ['baby', 'cat', 'fat', 'pitbull', 'flying', 'along', 'bites', 'fight', 'pilot', 'playing']
TF-IDF Vectors:
[0.21164339756999317, 0.21164339756999317, 0.21164339756999317, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26232653608351375]
[0.24187816865142076, 0.0, 0.24187816865142076, 0.24187816865142076, 0.0, 0.0, 0.0, 0.2998017555240157, 0.0, 0.0]
[0.0, 0.1881274645066606, 0.0, 0.1881274645066606, 0.20525531782080036, 0.0, 0.23317914318534555, 0.0, 0.0, 0.0]
[0.11287647870399636, 0.11287647870399636, 0.11287647870399636, 0.11287647870399636, 0.12315319069248022, 0.13990748591120733, 0.0, 0.0, 0.13990748591120733, 0.0]
