In [1]:
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with default settings; it is fitted on the
# preprocessed sentences in a later cell.
vectorizer = TfidfVectorizer() 

In [2]:
# Sample text used as the input document; a later cell splits it into sentences.
paragraph = "And viddy films I would. Where I was taken to, Brothers, was like no cine I ever viddied before. I was bound up in a strait jacket and my Gulliver was strapped to a headrest with like wires running away from it. Then they clamped like lid locks on my eyes so that I could not shut them no matter how hard I tried. It seemed a bit crazy to me but I let them get on with it. If I was to be a free young malchick again in a fortnight's time I would put up with much in the meantime, O my Brothers. … So far the first film, was a very good professional piece of cine. Like it was done in Hollywood. The sounds were real horror show, you could slooshie the screams and moans very realistic. You could even get the heavy breathing and panting of the tolchcoking malchicks at the same time. And then what do you know, soon our dear old friend the red red vino on tap. The same in all places, like it was put out by the same big firm, began to flow. It was beautiful. It's funny how the colors of the real world only seem really real when you viddy them on the screen. Now all the time I was watching this, I was beginning to get very aware of like not feeling all that well. And this this I put down to all the rich food and vitamins. But I tried to forget this, concentrating on the next film which jumped right away on a young devotchka who was being given the old in-out, in-out. First by one malchick, then another, then another. When it came to the sixth or seventh malchick leering and smecking and going into it, I began to feel really sick. But I could not shut me glassies and even if I tried to move my glassballs about, I still not get out of the line of fire of this picture. I'm going to be sick! Get something for me to be sick in!"

In [3]:
# Porter stemmer, used in the preprocessing cell to reduce each word to its stem.
ps = PorterStemmer()
# WordNet lemmatizer; not referenced by any cell visible here
# NOTE(review): confirm whether a later cell uses it, otherwise it can be dropped.
wordnet = WordNetLemmatizer()

In [4]:
# Split the paragraph into sentences; each sentence is treated as one document.
# NOTE(review): sent_tokenize and the stop-word list rely on NLTK data packages
# being installed (via nltk.download) -- confirm for a fresh environment.
sentences = nltk.sent_tokenize(paragraph)

# Build the stop-word set once. It is identical for every word, so constructing
# it inside the comprehension repeated the same work for each token.
stop_words = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', sentence)

    # Lower-case, then split on whitespace into individual words.
    review = review.lower().split()

    # Drop stop words and reduce each remaining word to its Porter stem.
    review = [ps.stem(word) for word in review if word not in stop_words]

    # Store the cleaned sentence as a single space-separated string.
    corpus.append(' '.join(review))

In [5]:
# fit_transform() learns the vocabulary and IDF weights from `corpus` and returns
# the TF-IDF document-term matrix in a single step; note that only the text is
# passed (no target). .toarray() converts the result to a dense NumPy array.
matrix = vectorizer.fit_transform(corpus).toarray()

Notice that we don't have a target here. In sentiment analysis, we would also have a target (the sentiment label), conventionally named `y`.

In [6]:
# Dimensions of the TF-IDF matrix: one row per entry in `corpus`, one column per vocabulary term.
matrix.shape

(22, 110)

In [7]:
# Display the dense TF-IDF array.
matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.6155706 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.29112776, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [8]:
# Vocabulary terms learned by the fitted vectorizer; used below as column labels for `matrix`.
words = vectorizer.get_feature_names_out()

In [9]:
# Wrap the matrix in a DataFrame so each column is labelled with its term.
# Despite the variable name, the values are the TF-IDF weights produced by
# `vectorizer`, not raw word counts.
bagOfWords = pd.DataFrame(matrix, columns=words)
bagOfWords

Unnamed: 0,anoth,awar,away,beauti,began,begin,big,bit,bound,breath,...,tri,viddi,vino,vitamin,watch,well,wire,world,would,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.557258,0.0,0.0,0.0,0.0,0.0,0.0,0.615571,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.372408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.291128,0.0,0.0,0.0,0.0,0.0,0.329997,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.329997,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.276229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.485536,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.309044,0.309044
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37914,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
