### Import minimal packages for text processing

In [1]:
from nltk import sent_tokenize, word_tokenize
import pandas as pd

### Load the data with pandas

In [2]:
def _read_reviews(path, label):
    """Load one review per line from ``path`` into a labelled DataFrame.

    Parameters
    ----------
    path : str
        Text file to read; each line becomes one row.
    label : str
        Value stored in the ``type`` column for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (the line with trailing whitespace stripped)
        and ``type`` (``label``).
    """
    # The context manager closes the file as soon as the lines are read;
    # a bare ``open(...)`` leaves the handle open until garbage collection.
    with open(path) as handle:
        reviews = [line.rstrip() for line in handle]
    frame = pd.DataFrame(reviews, columns=['review'])
    frame['type'] = label
    return frame


pos = _read_reviews("../data/rt-polaritydata/rt-polarity.pos", 'positive')
neg = _read_reviews("../data/rt-polaritydata/rt-polarity.neg", 'negative')

In [3]:
# Fixed seed so the shuffle, and therefore every row-order-dependent output
# in this notebook, is identical after Restart Kernel -> Run All.
SHUFFLE_SEED = 42

all_reviews = (
    pd.concat([pos, neg])
    # frac=1 returns every row exactly once, in shuffled order.
    .sample(frac=1, random_state=SHUFFLE_SEED)
    # Without drop=True the pre-shuffle row labels are kept in an 'index' column.
    .reset_index()
)

### Break sentences up into words

In [4]:
# Tokenize only the first review as a quick look at what word_tokenize produces.
first_review = all_reviews['review'].iloc[0]
word_tokenize(first_review)

['passionate',
 ',',
 'irrational',
 ',',
 'long-suffering',
 'but',
 'cruel',
 'as',
 'a',
 'tarantula',
 ',',
 'helga',
 'figures',
 'prominently',
 'in',
 'this',
 'movie',
 ',',
 'and',
 'helps',
 'keep',
 'the',
 'proceedings',
 'as',
 'funny',
 'for',
 'grown-ups',
 'as',
 'for',
 'rugrats',
 '.']

In [5]:
# Tokenize every review; word_tokenize is passed directly as the callable.
all_reviews['review'].apply(word_tokenize)

0        [passionate, ,, irrational, ,, long-suffering,...
1        [apart, from, its, own, considerable, achievem...
2        [at, times, ,, however, ,, dogtown, and, z-boy...
3        [kids, five, and, up, will, be, delighted, wit...
4        [anyway, ,, for, one, reason, or, another, ,, ...
                               ...                        
10657    [a, wretched, movie, that, reduces, the, secon...
10658    [the, holes, in, this, film, remain, agape, --...
10659    [the, entire, film, is, one, big, excuse, to, ...
10660    [the, ill-conceived, modern-day, ending, falls...
10661    [too, sincere, to, exploit, its, subjects, and...
Name: review, Length: 10662, dtype: object

### Lemmatization

In [6]:
from nltk.stem.wordnet import WordNetLemmatizer

In [7]:
wnl = WordNetLemmatizer()


def lemmatize_tokens(text):
    """Tokenize ``text`` and return the list of lemmatized tokens."""
    return list(map(wnl.lemmatize, word_tokenize(text)))


all_reviews['review'].apply(lemmatize_tokens)

0        [passionate, ,, irrational, ,, long-suffering,...
1        [apart, from, it, own, considerable, achieveme...
2        [at, time, ,, however, ,, dogtown, and, z-boys...
3        [kid, five, and, up, will, be, delighted, with...
4        [anyway, ,, for, one, reason, or, another, ,, ...
                               ...                        
10657    [a, wretched, movie, that, reduces, the, secon...
10658    [the, hole, in, this, film, remain, agape, --,...
10659    [the, entire, film, is, one, big, excuse, to, ...
10660    [the, ill-conceived, modern-day, ending, fall,...
10661    [too, sincere, to, exploit, it, subject, and, ...
Name: review, Length: 10662, dtype: object

In [8]:
from nltk.corpus import stopwords
from nltk import download as nltk_download

In [9]:
# Make sure the stop-word corpus is available locally before reading it.
nltk_download('stopwords')

# Stored as a set so the membership tests in the filters below are cheap.
english_stopwords = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/constantine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def lemmatize_without_stopwords(text):
    """Tokenize ``text``, skip tokens in ``english_stopwords``, lemmatize the rest."""
    # The stop-word check is applied to the raw token, before lemmatization.
    kept = (token for token in word_tokenize(text)
            if token not in english_stopwords)
    return [wnl.lemmatize(token) for token in kept]


all_reviews['review'].apply(lemmatize_without_stopwords)

0        [passionate, ,, irrational, ,, long-suffering,...
1        [apart, considerable, achievement, ,, metropol...
2        [time, ,, however, ,, dogtown, z-boys, lapse, ...
3        [kid, five, delighted, fast, ,, funny, ,, even...
4        [anyway, ,, one, reason, another, ,, crush, tu...
                               ...                        
10657    [wretched, movie, reduces, second, world, war,...
10658    [hole, film, remain, agape, --, hole, punched,...
10659    [entire, film, one, big, excuse, play, one, le...
10660    [ill-conceived, modern-day, ending, fall, flat...
10661    [sincere, exploit, subject, honest, manipulate...
Name: review, Length: 10662, dtype: object

In [11]:
def drop_stopwords(text):
    """Tokenize ``text`` and return the tokens not in ``english_stopwords``."""
    # No lemmatization here, so this output can be compared with the lemmatized one.
    return list(filter(lambda token: token not in english_stopwords,
                       word_tokenize(text)))


all_reviews['review'].apply(drop_stopwords)

0        [passionate, ,, irrational, ,, long-suffering,...
1        [apart, considerable, achievement, ,, metropol...
2        [times, ,, however, ,, dogtown, z-boys, lapses...
3        [kids, five, delighted, fast, ,, funny, ,, eve...
4        [anyway, ,, one, reason, another, ,, crush, tu...
                               ...                        
10657    [wretched, movie, reduces, second, world, war,...
10658    [holes, film, remain, agape, --, holes, punche...
10659    [entire, film, one, big, excuse, play, one, le...
10660    [ill-conceived, modern-day, ending, falls, fla...
10661    [sincere, exploit, subjects, honest, manipulat...
Name: review, Length: 10662, dtype: object