<h1>Machine Learning - Laboratory 3 - Bag of words</h1>

<p><b>Note</b> - Run All scripts before<p>

<h3>Exc. 1 - Test Bag of Words method</h3>

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

# Example text
documents = [
    "To jest pierwszy dokument.",
    "Dokument numer dwa.",
    "Ostatni dokument w tym zbiorze."
]

# Initialize CountVectorizer object
vectorizer = CountVectorizer()

# Processing and transformation of documents with Bag of Words method
X = vectorizer.fit_transform(documents)

# Displaying the dictionary (unique words)
print("Słownik (unikalne słowa): ", vectorizer.get_feature_names_out())

# Displaying Bag of Words matricies
print("Macierz Bag of words:")
print(X.toarray())

Słownik (unikalne słowa):  ['dokument' 'dwa' 'jest' 'numer' 'ostatni' 'pierwszy' 'to' 'tym' 'zbiorze']
Macierz Bag of words:
[[1 0 1 0 0 1 1 0 0]
 [1 1 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 1 1]]


<h3>Exc. 2 - Implement your own Bag of words</h3>

In [61]:
import re

# Define my bag of words functions 
def my_bag_of_words(documents):

    # Tokenization and cleanup of text (helper function)
    def preprocess_text(text):
        # Deletion of punctuation marks
        text = re.sub(r'[^\w\s]', '', text.lower())
        # Separate words by the space
        tokens = text.split(" ")
        
        return tokens

    # Create a dictionary and counting words 
    all_words = []

    for document in documents:
        tokens = preprocess_text(document)
        words = {}
        for token in tokens:
            if token in words:
                words[token] += 1
            else: 
                words[token] = 1 
        all_words.append(words)

    # Create vocalbulary list (unique words)
    vocabulary = list(set([item for sublist in all_words for item in sublist.keys()]))

    # Create Bag of worrds Matrix
    bow_matrix = []
    for document in all_words:
        bow_vector = [(lambda x: document[x] if (x in document) else 0)(x) for x in vocabulary]
        bow_matrix.append(bow_vector)

    return vocabulary, bow_matrix


vocabulary, bow_matrix = my_bag_of_words(documents)
# Display dictionary
print ("Unikalne słowa: ",vocabulary)

# Display bag od words matrix
print("Macierz Bag of Words:") 
for bow_vector in bow_matrix: 
    print(bow_vector)
    

Unikalne słowa:  ['numer', 'to', 'zbiorze', 'dokument', 'tym', 'w', 'ostatni', 'jest', 'dwa', 'pierwszy']
Macierz Bag of Words:
[0, 1, 0, 1, 0, 0, 0, 1, 0, 1]
[1, 0, 0, 1, 0, 0, 0, 0, 1, 0]
[0, 0, 1, 1, 1, 1, 1, 0, 0, 0]


<h3>Bonus Exc. - Test on different dataset<h3>

In [62]:
from sklearn.datasets import fetch_20newsgroups

texts = [
    "The quick brown fox jumps over the lazy dog",
    "Life is a highway.",
    "She is an early bird.",
    "Waltz, bad nymph, for quick jigs vex.",
    "How vexingly quick daft zebras jump!"    
]

new_vocab, new_bow_matrix = my_bag_of_words(texts)
# Display dictionary
print ("Unikalne słowa: ", new_vocab)

# Display bag od words matrix
print("Macierz Bag of Words:") 
for bow_vector in new_bow_matrix: 
    print(bow_vector)

Unikalne słowa:  ['how', 'the', 'fox', 'daft', 'a', 'early', 'jump', 'quick', 'for', 'bad', 'life', 'bird', 'dog', 'she', 'waltz', 'zebras', 'is', 'over', 'jigs', 'lazy', 'highway', 'vex', 'vexingly', 'brown', 'jumps', 'an', 'nymph']
Macierz Bag of Words:
[0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0]
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]
[1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
