In [1]:
# Vectorizing Text
# - Bag of words
# - TF-IDF

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # Transform text into a matrix of token counts. It breaks down text into tokens, counts the occurrences of each token, and creates a matrix of these counts.
from sklearn.feature_extraction.text import TfidfVectorizer # Transform text into a matrix of TF-IDF scores instead of raw token counts.

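# Example (default settings: text is lowercased and tokens must be at least 2 characters long):
# Text: "I love programming in Python"
# Tokens: ["love", "programming", "in", "python"]  ("I" is dropped because it is a single character)
# Vocabulary (sorted alphabetically): ["in", "love", "programming", "python"]
# Count Vectorizer: [[1, 1, 1, 1]]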

In [3]:
# Toy corpus: six unrelated sentences, each one treated as a separate document.
# NOTE: the first sentence intentionally keeps its original leading space.
data = [
    " Most shark attacks occur about 10 feet from the beach since that is where the people are",
    "the efficiency with which he paired the socks in the drawer was quite admirable",
    "carol drank the blood as if she were a vampire",
    "giving directions that the mountains are to the west only works when you can see them",
    "the sign said there was road work ahead so he decided to speed up",
    "the gruff old man sat in the back of the bait shop grumbling to himself as he scooped out a handful of worms",
]

In [None]:
# Default vectorizer: every entry of the resulting matrix is a raw token count.
# Tip: CountVectorizer(binary=True) would instead record only presence/absence
# (1 if the word appears in the document, otherwise 0).
countvec = CountVectorizer()

# Learn the vocabulary (the unique words in the corpus) and, in the same pass,
# convert the documents into a matrix of token counts.
countvec_fit = countvec.fit_transform(raw_documents=data)

In [8]:
# Lay the count matrix out as a labelled table: one row per document, one
# column per vocabulary word, and each cell holding that word's count.
bag_of_words = pd.DataFrame(
    data=countvec_fit.toarray(),
    columns=countvec.get_feature_names_out(),
)

print(bag_of_words)

   10  about  admirable  ahead  are  as  ...  which  with  work  works  worms  you
0   1      1          0      0    1   0  ...      0     0     0      0      0    0
1   0      0          1      0    0   0  ...      1     1     0      0      0    0
2   0      0          0      0    0   1  ...      0     0     0      0      0    0
3   0      0          0      0    1   0  ...      0     0     0      1      0    1
4   0      0          0      1    0   0  ...      0     0     1      0      0    0
5   0      0          0      0    0   1  ...      0     0     0      0      1    0

[6 rows x 71 columns]


In [10]:
# Limitations of Bag of Words: It only counts how many times each word occurs in a document; the order of the words and their relative importance are lost.
# Example: "I love programming in Python" and "In Python I love programming" will be represented as the same vector.
# This is where TF-IDF comes in: it addresses the importance problem (word order is still ignored).

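# TF-IDF (Term Frequency-Inverse Document Frequency)
# - TF: Term Frequency - Measures how often a word appears in a document.
# - IDF: Inverse Document Frequency - Measures how rare a word is across all documents in the corpus (rarer words get a higher weight).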


In [11]:
# TfidfVectorizer is imported together with the other imports at the top of the
# notebook, so this section does not need its own import statement.
#
# The TF-IDF section deliberately reuses the `data` corpus defined in the
# Bag-of-Words section rather than pasting a second copy of the literal: a
# duplicated corpus can silently drift, and the two document-term matrices are
# only comparable when both vectorizers are fitted on identical documents.
assert len(data) > 0, "run the cell that defines the `data` corpus first"

In [12]:
# TF-IDF == term frequency-inverse document frequency
# - Scores how important a word is to a document, relative to a corpus of documents.
# - TF: how often a word appears in a document.
# - IDF: how rare the word is across all documents.
# -- Common words get a lower score, whereas rare words get a higher score, as they carry more weight in terms of what the document is about.

# Learn the vocabulary and IDF weights from the corpus and, in the same pass,
# convert the documents into a matrix of TF-IDF scores.
tfidf = TfidfVectorizer()
tfidf_fit = tfidf.fit_transform(raw_documents=data)

# Rows are documents, columns are vocabulary words, entries are TF-IDF scores.
tfidf_df = pd.DataFrame(
    data=tfidf_fit.toarray(),
    columns=tfidf.get_feature_names_out(),
)

print(tfidf_df)







         10     about  admirable     ahead  ...      work    works    worms      you
0  0.257061  0.257061   0.000000  0.000000  ...  0.000000  0.00000  0.00000  0.00000
1  0.000000  0.000000   0.293641  0.000000  ...  0.000000  0.00000  0.00000  0.00000
2  0.000000  0.000000   0.000000  0.000000  ...  0.000000  0.00000  0.00000  0.00000
3  0.000000  0.000000   0.000000  0.000000  ...  0.000000  0.27104  0.00000  0.27104
4  0.000000  0.000000   0.000000  0.290766  ...  0.290766  0.00000  0.00000  0.00000
5  0.000000  0.000000   0.000000  0.000000  ...  0.000000  0.00000  0.21782  0.00000

[6 rows x 71 columns]
