In [1]:
from nltk.corpus import movie_reviews

In [2]:
# Show the category labels the corpus defines (the recorded output lists 'neg' and 'pos').
movie_reviews.categories()

['neg', 'pos']

In [4]:
# Peek at the token sequence of a single review: the file id at index 5 of the full id list.
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
# Pair each review's token sequence with the category it is filed under,
# walking the corpus one category at a time.
data = [
    (movie_reviews.words(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
data[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
# Number of (tokens, category) pairs held in `data`.
len(data)

2000

In [7]:
import random

# Shuffle with a dedicated, fixed-seed generator so that Restart & Run All
# produces the same ordering every time, without touching the global
# `random` state. The shuffle is in place: re-running only this cell
# reshuffles the already-shuffled list, so rebuild `data` first if the
# canonical order is needed.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(data)
data[0:5]

[(['numerous', 'comparisons', 'can', 'be', 'made', ...], 'neg'),
 (['the', 'yet', '-', 'to', '-', 'be', '-', 'released', ...], 'neg'),
 (['the', 'dream', 'team', 'is', 'a', 'thoroughly', ...], 'pos'),
 (['fit', 'for', 'a', 'ghoul', "'", 's', 'night', 'out', ...], 'neg'),
 (['the', 'makers', 'of', 'jurassic', 'park', '&', 'the', ...], 'pos')]

In [8]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance; clean_review calls its lemmatize() method for every kept word.
lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    """Translate a POS tag string into one of the WordNet POS constants.

    The decision is made on the tag's leading letter:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including an empty string) falls
    back to wordnet.NOUN.

    Parameters:
        tag: POS tag string, e.g. the 'RBR' that pos_tag returns for "better".

    Returns:
        The matching wordnet.ADJ / VERB / NOUN / ADV constant.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return simple_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [10]:
from nltk import pos_tag
# Quick check of the tagger on a lone word: the recorded output tags "better"
# as 'RBR', which get_simple_pos maps to wordnet.ADV (tags starting with 'R').
w = "better"
pos_tag([w])

[('better', 'RBR')]

In [1]:
from nltk.corpus import stopwords
import string
# Tokens to discard: every English stop word plus each individual
# punctuation character from string.punctuation.
punctuations = list(string.punctuation)
stops = set(stopwords.words('english')).union(punctuations)


In [12]:
def clean_review(words):
    """Drop stop words/punctuation from a review and lemmatize what remains.

    Relies on the notebook-level names `stops`, `pos_tag`, `lemmatizer` and
    `get_simple_pos`.

    Parameters:
        words: iterable of token strings for one review.

    Returns:
        A list of lowercase lemmas, in the original token order, for every
        token whose lowercase form is not in `stops`. Empty input gives [].
    """
    tokens = list(words)
    if not tokens:
        return []

    output_words = []
    # Tag the complete sequence in one call: the tagger then sees each word
    # in context (stop words included), and is invoked once per review
    # instead of once per word.
    for word, tag in pos_tag(tokens):
        lowered = word.lower()
        if lowered in stops:
            continue
        # Lemmatize the lowercase form; a capitalised token passed as-is may
        # come back unreduced (presumably because WordNet lookups are
        # lowercase -- confirm against the NLTK docs).
        output_words.append(lemmatizer.lemmatize(lowered, pos=get_simple_pos(tag)))
    return output_words

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Labels only, in the same order as the entries of `data`.
categories = [label for _tokens, label in data]

In [16]:
# Re-assemble each review into one space-separated string, in the same order
# as `data`.
# NOTE(review): this joins the tokens exactly as stored in `data`; no visible
# cell applies clean_review to them first -- confirm that is intended.
text_documents = [" ".join(tokens) for tokens, _label in data]

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Pin the random state so the train/test partition is identical on every
# Restart & Run All; the split proportions are left at the library defaults.
x_train, x_test, y_train, y_test = train_test_split(
    text_documents, categories, random_state=42
)

In [19]:
# Fit the vectorizer on the training documents only, capping the vocabulary
# at 2000 features, and turn each training document into a count vector.
count_vec = CountVectorizer(max_features = 2000)
x_train_features = count_vec.fit_transform(x_train)
# Dense rendering for display only; the result is not assigned, so
# x_train_features is left as returned by fit_transform.
x_train_features.todense()

matrix([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 3, 1, 0],
        [0, 9, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Inspect the vocabulary terms that became the feature columns.
count_vec.get_feature_names_out()

array(['000', '10', '100', ..., 'york', 'young', 'zero'], dtype=object)

In [21]:
# Transform only (no refit): the test documents are encoded with the
# vocabulary already fitted on the training documents.
x_test_features = count_vec.transform(x_test)

In [22]:
# Display the repr of the test feature matrix (shape and stored-element summary).
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 86694 stored elements in Compressed Sparse Row format>