In [1]:
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings; it is fitted on the cleaned corpus further down.
vectorizer = TfidfVectorizer() 

In [2]:
# Sample text (three sentences) used as the toy corpus for the TF-IDF demo.
# Adjacent string literals are concatenated by Python, so this is one string.
paragraph = (
    "A time may come soon, when none will return. "
    "Then there will be need of valour without renown, for none shall "
    "remember the deeds that are done in the last defense of your homes. "
    "Yet the deeds will not be less valiant because they are unpraised."
)

In [3]:
# Stemmer used by the preprocessing loop below (ps.stem is applied to each kept word).
ps = PorterStemmer()
# Lemmatizer instance; it is not referenced in the cells visible here.
wordnet = WordNetLemmatizer()

In [4]:
# Preprocess the paragraph into one cleaned, stemmed string per sentence.
# NOTE: sent_tokenize and stopwords.words need the corresponding NLTK data
# packages to be installed already (see nltk.download).

# Build the English stop-word set once, outside the loop, so the word list is
# not re-fetched and re-hashed for every single token.
stop_words = set(stopwords.words('english'))

sentences = nltk.sent_tokenize(paragraph)
corpus = []

for sentence in sentences:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()

    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(token) for token in tokens if token not in stop_words]

    # TfidfVectorizer expects each document as a single string.
    corpus.append(' '.join(stems))

In [5]:
# On a vectorizer, fit_transform() learns the vocabulary and IDF weights from
# `corpus` and returns the document-term matrix in one step; it takes no target.
# The result is a sparse matrix, so .toarray() converts it to a dense NumPy
# array for inspection.
matrix = vectorizer.fit_transform(corpus).toarray()

Notice that we don't have a target here: TF-IDF vectorization only needs the text itself. In a sentiment-analysis task, we would also have a target (the sentiment label) as `y`.

In [6]:
# One row per sentence in `corpus`, one column per term in the learned vocabulary.
matrix.shape

(3, 21)

In [7]:
# Display the dense TF-IDF matrix; a zero means the term does not occur in that sentence.
matrix

array([[0.42339448, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.42339448, 0.        , 0.32200242,
        0.        , 0.        , 0.42339448, 0.        , 0.42339448,
        0.42339448, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.22769009, 0.29938511, 0.29938511, 0.29938511,
        0.29938511, 0.        , 0.        , 0.29938511, 0.22769009,
        0.29938511, 0.29938511, 0.        , 0.29938511, 0.        ,
        0.        , 0.        , 0.        , 0.29938511, 0.29938511,
        0.        ],
       [0.        , 0.35543247, 0.        , 0.        , 0.        ,
        0.        , 0.46735098, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46735098, 0.46735098, 0.        , 0.        ,
        0.46735098]])

In [8]:
# Vocabulary terms learned by the vectorizer, in the same order as the columns of `matrix`.
words = vectorizer.get_feature_names_out()

In [9]:
# Label the matrix columns with their terms for readability.
# NOTE: despite the name, the values are TF-IDF weights, not raw bag-of-words counts.
bagOfWords = pd.DataFrame(matrix, columns=words)
bagOfWords

Unnamed: 0,come,deed,defens,done,home,last,less,may,need,none,...,renown,return,shall,soon,time,unprais,valiant,valour,without,yet
0,0.423394,0.0,0.0,0.0,0.0,0.0,0.0,0.423394,0.0,0.322002,...,0.0,0.423394,0.0,0.423394,0.423394,0.0,0.0,0.0,0.0,0.0
1,0.0,0.22769,0.299385,0.299385,0.299385,0.299385,0.0,0.0,0.299385,0.22769,...,0.299385,0.0,0.299385,0.0,0.0,0.0,0.0,0.299385,0.299385,0.0
2,0.0,0.355432,0.0,0.0,0.0,0.0,0.467351,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.467351,0.467351,0.0,0.0,0.467351
