**Bag Of Words**

In [None]:
sentences = [
    "This a fantastic movie of three prisoners who become famous. One of the actors is george clooney and...",
    "Kind of drawn in by the erotic scenes, only to realize this was one of the most amateurish and unbel...",
    "Some films just simply should not be remade. This is one of them. In and of itself it is not a bad f..."
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

bw = vectorizer.fit_transform(sentences)

In [None]:
bw

<3x41 sparse matrix of type '<class 'numpy.int64'>'
	with 52 stored elements in Compressed Sparse Row format>

**3x41 = 3 sentences , 41 vocabulary**

In [None]:
vectorizer.vocabulary_

{'this': 35,
 'fantastic': 11,
 'movie': 21,
 'of': 23,
 'three': 36,
 'prisoners': 26,
 'who': 40,
 'become': 5,
 'famous': 10,
 'one': 24,
 'the': 33,
 'actors': 0,
 'is': 15,
 'george': 13,
 'clooney': 7,
 'and': 2,
 'kind': 19,
 'drawn': 8,
 'in': 14,
 'by': 6,
 'erotic': 9,
 'scenes': 29,
 'only': 25,
 'to': 37,
 'realize': 27,
 'was': 39,
 'most': 20,
 'amateurish': 1,
 'unbel': 38,
 'some': 32,
 'films': 12,
 'just': 18,
 'simply': 31,
 'should': 30,
 'not': 22,
 'be': 4,
 'remade': 28,
 'them': 34,
 'itself': 17,
 'it': 16,
 'bad': 3}

**Tokenization**

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
token = Tokenizer(num_words = 20000)

In [8]:
token.fit_on_texts(sentences)
sequence = token.texts_to_sequences(sentences)

In [9]:
sequence

[[2, 7, 10, 11, 1, 12, 13, 14, 15, 16, 3, 1, 4, 17, 5, 18, 19, 6],
 [20, 1, 21, 8, 22, 4, 23, 24, 25, 26, 27, 2, 28, 3, 1, 4, 29, 30, 6, 31],
 [32,
  33,
  34,
  35,
  36,
  9,
  37,
  38,
  2,
  5,
  3,
  1,
  39,
  8,
  6,
  1,
  40,
  41,
  5,
  9,
  7,
  42,
  43]]

In [10]:
token.word_index

{'of': 1,
 'this': 2,
 'one': 3,
 'the': 4,
 'is': 5,
 'and': 6,
 'a': 7,
 'in': 8,
 'not': 9,
 'fantastic': 10,
 'movie': 11,
 'three': 12,
 'prisoners': 13,
 'who': 14,
 'become': 15,
 'famous': 16,
 'actors': 17,
 'george': 18,
 'clooney': 19,
 'kind': 20,
 'drawn': 21,
 'by': 22,
 'erotic': 23,
 'scenes': 24,
 'only': 25,
 'to': 26,
 'realize': 27,
 'was': 28,
 'most': 29,
 'amateurish': 30,
 'unbel': 31,
 'some': 32,
 'films': 33,
 'just': 34,
 'simply': 35,
 'should': 36,
 'be': 37,
 'remade': 38,
 'them': 39,
 'itself': 40,
 'it': 41,
 'bad': 42,
 'f': 43}

**Stop Words Removal**

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
from nltk.corpus import stopwords

s = set(stopwords.words("english"))
s

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [17]:
for sen in sentences:
  print([word for word in sen.split() if word not in s])

['This', 'fantastic', 'movie', 'three', 'prisoners', 'become', 'famous.', 'One', 'actors', 'george', 'clooney', 'and...']
['Kind', 'drawn', 'erotic', 'scenes,', 'realize', 'one', 'amateurish', 'unbel...']
['Some', 'films', 'simply', 'remade.', 'This', 'one', 'them.', 'In', 'bad', 'f...']


In [19]:
import string
punc = set(string.punctuation)
punc

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}

**Stemming**

In [23]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem("News")

'new'

**Lemmatization**

In [24]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [26]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("went","v")

'go'