In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short, overlapping sentences used as a toy corpus
sentences = [
    'The cat sat on the mat.',
    'The cat sat near the mat.',
    'The cat played with the ball.',
]

# Learn the vocabulary and build the document-term count matrix in one step
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

# Column labels (vocabulary) and the dense count matrix, one row per sentence
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()

print('Feature names:', vocabulary, sep='\n')
print('Bag of Words Representation:', counts, sep='\n')

Feature names:
['ball' 'cat' 'mat' 'near' 'on' 'played' 'sat' 'the' 'with']
Bag of Words Representation:
[[0 1 1 0 1 0 1 2 0]
 [0 1 1 1 0 0 1 2 0]
 [1 1 0 0 0 1 0 2 1]]


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three sentences that share several words
sentences = [
    'Machine learning is fascinating.',
    'Deep learning is a subset of machine learning.',
    'We use Python for machine learning.',
]

# Fit the vocabulary and count word occurrences per sentence
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

# Vocabulary (matrix columns) and the dense count matrix (one row per sentence)
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()

print('Feature names:', vocabulary, sep='\n')
print('Bag of Words Representation:', counts, sep='\n')

Feature names:
['deep' 'fascinating' 'for' 'is' 'learning' 'machine' 'of' 'python'
 'subset' 'use' 'we']
Bag of Words Representation:
[[0 1 0 1 1 1 0 0 0 0 0]
 [1 0 0 1 2 1 1 0 1 0 0]
 [0 0 1 0 1 1 0 1 0 1 1]]


In [3]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import movie_reviews

# The corpus has to be available locally before its files can be read
nltk.download('movie_reviews', quiet=True)

# Keep the example small: read only the first 100 review files
first_fileids = movie_reviews.fileids()[:100]
reviews = list(map(movie_reviews.raw, first_fileids))

# Build the bag-of-words matrix for the selected reviews
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(reviews)

# Show only the tail of the vocabulary
feature_names = vectorizer.get_feature_names_out()
print("Last ten feature names: ", feature_names[-10:])

Last ten feature names:  ['zingers' 'zipper' 'zombie' 'zombified' 'zone' 'zoologist' 'zoom' 'zwick'
 'zwigoff' 'zzzzzzz']


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Three sentences using different forms of "sing" (sing / singing / sang)
sentences = [
    "I love to sing",
    "Singing in the rain is my favorite",
    "She sang the whole night at the concert",
]

# Fit the vocabulary and count word occurrences per sentence
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

# Vocabulary (matrix columns) and the dense count matrix (one row per sentence)
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()

print('Feature names:', vocabulary, sep='\n')
print('Bag of Words Representation:', counts, sep='\n')

Feature names:
['at' 'concert' 'favorite' 'in' 'is' 'love' 'my' 'night' 'rain' 'sang'
 'she' 'sing' 'singing' 'the' 'to' 'whole']
Bag of Words Representation:
[[0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0]
 [0 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0]
 [1 1 0 0 0 0 0 1 0 1 1 0 0 2 0 1]]


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources this cell relies on so that it also runs on a
# fresh environment: the stop-word lists, plus the Punkt tokenizer data that
# word_tokenize needs ('punkt' on older NLTK releases, 'punkt_tab' on newer
# ones). Resources that are already installed and up to date are skipped.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Load stop words from NLTK (as a set for fast membership tests) and
# initialize a stemmer
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Define function for text cleaning and stemming
def clean_text(text):
    """Normalize a raw string into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, strip e-mail addresses and URLs,
    replace punctuation/special characters and digits with spaces, collapse
    whitespace, tokenize, drop stop words and stem what is left.

    Relies on the module-level ``stop_words``, ``stemmer`` and
    ``word_tokenize`` names.

    Args:
        text: The raw input string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    text = text.lower()  # Convert text to lower case
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove email addresses
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Remove punctuation and special characters. The underscore is listed
    # explicitly because \W treats it as a word character and would keep it.
    text = re.sub(r'[\W_]', ' ', text)
    text = re.sub(r'\d', ' ', text)  # Remove digits
    text = re.sub(r'\s\s+', ' ', text)  # Remove extra spaces

    tokenized_text = word_tokenize(text)
    # Stop words are filtered before stemming, so the lookup uses the
    # original (lower-cased) token rather than its stem.
    filtered_text = [stemmer.stem(word) for word in tokenized_text if word not in stop_words]

    return " ".join(filtered_text)

original_sentences = [
    'It is a lovely day, isn\'t it?',
    'The sun is shining brightly!',
    'I love the taste of lemonade on a sunny day.',
]

# Clean and stem every sentence before vectorizing
preprocessed_sentences = list(map(clean_text, original_sentences))

# Fit the vocabulary on the cleaned text and count token occurrences
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_sentences)

# Vocabulary (matrix columns) and the dense count matrix (one row per sentence)
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()

print('Feature names:', vocabulary, sep='\n')
print('Bag of Words Representation:', counts, sep='\n')

Feature names:
['brightli' 'day' 'lemonad' 'love' 'shine' 'sun' 'sunni' 'tast']
Bag of Words Representation:
[[0 1 0 1 0 0 0 0]
 [1 0 0 0 1 1 0 0]
 [0 1 1 1 0 0 1 1]]


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Small corpus of four near-identical documents
sentences = [
    'This is the first document.',
    'This document is the second document.',
    'And this document is the third one.',
    'Is this the first document here?',
]

# Fit on the whole corpus (fit returns the fitted vectorizer itself)
vectorizer = TfidfVectorizer().fit(sentences)

# Transform only the second document using the fitted vectorizer
second_document = sentences[1]
vector = vectorizer.transform([second_document])

# Report the matrix shape, the vocabulary and the dense TF-IDF row
report = (
    ('Shape:', vector.shape),
    ('Features:', vectorizer.get_feature_names_out()),
    ('Array:', vector.toarray()),
)
for label, value in report:
    print(label, value)

Shape: (1, 10)
Features: ['and' 'document' 'first' 'here' 'is' 'one' 'second' 'the' 'third' 'this']
Array: [[0.         0.61221452 0.         0.         0.30610726 0.
  0.5865905  0.30610726 0.         0.30610726]]


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    'Expedition to Mars',
    'NASA launched an expedition to Mars',
    'Mars expedition was successful',
]

# Fit the TF-IDF vocabulary and vectorize the sentences in a single step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)

# Each entry pairs a printed label with the value shown next to it:
#   - the matrix shape and its dense form
#   - X.data:    the stored values of the sparse matrix
#   - X.indices: the index array that goes with those stored values
#   - X.indptr:  where each row starts in the data and indices arrays
report = (
    ('Shape:', X.shape),
    ('Array:', X.toarray()),
    ("Data of Sparse Matrix ", X.data),
    ("Indices of non-zero elements of Sparse Matrix ", X.indices),
    ("Pointer to start of each row in indices and data ", X.indptr),
)
for label, value in report:
    print(label, value)

Shape: (3, 8)
Array: [[0.         0.52284231 0.         0.52284231 0.         0.
  0.67325467 0.        ]
 [0.48359121 0.28561676 0.48359121 0.28561676 0.48359121 0.
  0.36778358 0.        ]
 [0.         0.35959372 0.         0.35959372 0.         0.6088451
  0.         0.6088451 ]]
Data of Sparse Matrix  [0.52284231 0.67325467 0.52284231 0.48359121 0.48359121 0.48359121
 0.28561676 0.36778358 0.28561676 0.6088451  0.6088451  0.35959372
 0.35959372]
Indices of non-zero elements of Sparse Matrix  [3 6 1 0 2 4 3 6 1 5 7 3 1]
Pointer to start of each row in indices and data  [ 0  3  9 13]


In [5]:
import numpy as np
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the corpus is available locally so this cell also works on a
# fresh kernel/environment; an already-installed corpus is not fetched again.
nltk.download('movie_reviews', quiet=True)

# Read the raw text of every review in the corpus
reviews = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]

# Fit the TF-IDF vocabulary on all reviews and vectorize them
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(reviews)

# Only the last five vocabulary entries are printed
print('Feature Names:', vectorizer.get_feature_names_out()[-5:])

Feature Names: ['zweibel' 'zwick' 'zwigoff' 'zycie' 'zzzzzzz']
