Both BoW and TF-IDF have a very serious chance of overfitting; Word2Vec can be used to overcome this.

Word2Vec represents each word as a vector of dimension 32 or greater, rather than as a single number. This preserves semantic information and the relations between words.
Interestingly, this allows operations such as `King - Man + Woman`, which yield the result `Queen`, because the model simply computes with the word vectors.

In [1]:
import nltk
#%pip install gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re

# Sample course review; this raw text is the input that gets preprocessed and tokenized.
# The line break and indentation inside the literal are part of the string.
review = """Good teacher, I took this as a second block class on accident. Lots of reading in the class, not super necessary for the lectures. 
            But a fun teacher who's excited to share English with students, normal amount of homework assignments and a small 5 page paper due at finals. 8/10"""

In [2]:
# Preprocessing text: normalize the raw review, then turn it into a list of
# tokenized sentences (one list of word tokens per sentence) with English
# stopwords removed. Punctuation tokens are intentionally left in place.

text = re.sub(r'\[[0-9]*\]', ' ', review)       # strip bracketed numeric references such as "[12]"
text = re.sub(r'\s+', ' ', text)                # collapse whitespace runs (incl. newlines) into one space
text = text.lower()                             # make lowercase to avoid repeat entries
text = re.sub(r'\d', ' ', text)                 # replace every digit with a space
text = re.sub(r'\s+', ' ', text)                # re-collapse the gaps left by digit removal

# Load the stopword list once and keep it as a set: calling
# stopwords.words("english") per token repeats the lookup for every word,
# and set membership is O(1) instead of a linear scan.
english_stopwords = set(stopwords.words("english"))

# Sentence-split, word-tokenize each sentence, and drop stopwords in one pass.
sentences = [
    [word for word in nltk.word_tokenize(sentence) if word not in english_stopwords]
    for sentence in nltk.sent_tokenize(text)
]

sentences

[['good', 'teacher', ',', 'took', 'second', 'block', 'class', 'accident', '.'],
 ['lots', 'reading', 'class', ',', 'super', 'necessary', 'lectures', '.'],
 ['fun',
  'teacher',
  "'s",
  'excited',
  'share',
  'english',
  'students',
  ',',
  'normal',
  'amount',
  'homework',
  'assignments',
  'small',
  'page',
  'paper',
  'due',
  'finals',
  '.'],
 ['/']]

In [3]:
# Training Word2Vec model
# min_count is the minimum number of occurrences a word needs in order to be
# vectorized; with min_count=1 no word is dropped for being rare.
model = Word2Vec(sentences, min_count=1)

# All lookups below go through the trained word vectors, so bind them once.
keyed_vectors = model.wv

# The old `model.wv.vocab` attribute was deprecated; `key_to_index` is its replacement.
words = keyed_vectors.key_to_index
#words

# Vector learned for a single word.
vector = keyed_vectors['second']
#vector

# The "count" attribute stored alongside the vector for "teacher".
attr = keyed_vectors.get_vecattr("teacher", "count")
#attr

# Words most similar to "class", with their similarity scores.
similar = keyed_vectors.most_similar("class")
similar

[('lectures', 0.17826791107654572),
 ('finals', 0.13126598298549652),
 ('block', 0.07497556507587433),
 ('.', 0.0679679811000824),
 ('assignments', 0.04814743623137474),
 ('good', 0.04732733219861984),
 ('took', 0.04157735034823418),
 ('reading', 0.04130808264017105),
 ('excited', 0.04115622118115425),
 ('amount', 0.04080428555607796)]