In [1]:
import pandas as pd

# Load the IMDB review dataset and list the columns the file provides.
df = pd.read_csv("../data/imdb_dataset.csv")
print(df.columns)

# Keep only the review text and its label, drop rows missing either value,
# and work with the first 1000 remaining rows.
df = df.loc[:, ['review', 'sentiment']].dropna().iloc[:1000]
print(len(df))
df.head()

Index(['review', 'sentiment'], dtype='object')
1000


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
# Fetch the NLTK data packages (same order as listed) and import spaCy.
import nltk

for package in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(package)

import spacy


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aakashshrestha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aakashshrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aakashshrestha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aakashshrestha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
#tokenization
import nltk
from nltk.tokenize import word_tokenize

# Split every raw review into NLTK word tokens. The text is tokenized as-is,
# so markup such as "<br />" shows up as separate "<", "br", "/" tokens
# (see row 1 of the output).
df['tokens'] = df['review'].apply(word_tokenize)
df.head(10)

Unnamed: 0,review,sentiment,tokens
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ..."
5,"Probably my all-time favorite movie, a story o...",positive,"[Probably, my, all-time, favorite, movie, ,, a..."
6,I sure would like to see a resurrection of a u...,positive,"[I, sure, would, like, to, see, a, resurrectio..."
7,"This show was an amazing, fresh & innovative i...",negative,"[This, show, was, an, amazing, ,, fresh, &, in..."
8,Encouraged by the positive comments about this...,negative,"[Encouraged, by, the, positive, comments, abou..."
9,If you like original gut wrenching laughter yo...,positive,"[If, you, like, original, gut, wrenching, laug..."


In [4]:
#case folding
def lowercase_tokens(tokens):
    """Return a new list holding the lower-cased form of each token."""
    return [token.lower() for token in tokens]

df['tokens_lower'] = df['tokens'].apply(lowercase_tokens)
df.head()

Unnamed: 0,review,sentiment,tokens,tokens_lower
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione...","[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, ., <, br, /...","[a, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,...","[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li...","[basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ...","[petter, mattei, 's, ``, love, in, the, time, ..."


In [5]:
#stop word removal
# Import the corpus reader under an alias so that the `stopwords` set built
# below does not overwrite it; one name then always refers to one kind of
# object, and the reader stays reachable as `nltk_stopwords`.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set for fast membership tests.
stopwords = set(nltk_stopwords.words('english'))
print(len(stopwords))

# Keep a token only if it is purely alphabetic and not a stop word. The
# isalpha() check also discards punctuation and tokens such as "'s" or
# "all-time", not just stop words.
df['tokens_nostopwords'] = df['tokens_lower'].apply(
    lambda tokens: [word for word in tokens if word.isalpha() and word not in stopwords]
)
df['tokens_nostopwords'].head(20)

198


0     [one, reviewers, mentioned, watching, oz, epis...
1     [wonderful, little, production, br, br, filmin...
2     [thought, wonderful, way, spend, time, hot, su...
3     [basically, family, little, boy, jake, thinks,...
4     [petter, mattei, love, time, money, visually, ...
5     [probably, favorite, movie, story, selflessnes...
6     [sure, would, like, see, resurrection, dated, ...
7     [show, amazing, fresh, innovative, idea, first...
8     [encouraged, positive, comments, film, looking...
9     [like, original, gut, wrenching, laughter, lik...
10    [phil, alien, one, quirky, films, humour, base...
11    [saw, movie, came, recall, scariest, scene, bi...
12    [im, big, fan, boll, work, many, enjoyed, movi...
13    [cast, played, br, br, shakespeare, br, br, ap...
14    [fantastic, movie, three, prisoners, become, f...
15    [kind, drawn, erotic, scenes, realize, one, am...
16    [films, simply, remade, one, bad, film, fails,...
17    [movie, made, one, top, awful, movies, hor

In [6]:
#stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of each token, in the original order."""
    return [stemmer.stem(token) for token in tokens]


# Stem the stop-word-filtered tokens of every review.
df['tokens_stemmed'] = df['tokens_nostopwords'].apply(stem_tokens)
df['tokens_stemmed'].head(10)

0    [one, review, mention, watch, oz, episod, hook...
1    [wonder, littl, product, br, br, film, techniq...
2    [thought, wonder, way, spend, time, hot, summe...
3    [basic, famili, littl, boy, jake, think, zombi...
4    [petter, mattei, love, time, money, visual, st...
5    [probabl, favorit, movi, stori, selfless, sacr...
6    [sure, would, like, see, resurrect, date, seah...
7    [show, amaz, fresh, innov, idea, first, air, f...
8    [encourag, posit, comment, film, look, forward...
9    [like, origin, gut, wrench, laughter, like, mo...
Name: tokens_stemmed, dtype: object

In [7]:
#Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize the stop-word-filtered tokens of every review. lemmatize() is
# called without a `pos` argument; in the output, plural nouns are reduced
# ("reviewers" -> "reviewer") while forms like "mentioned" and "watching"
# come through unchanged.
df['tokens_lemmatized'] = df['tokens_nostopwords'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
df.head(10)

Unnamed: 0,review,sentiment,tokens,tokens_lower,tokens_nostopwords,tokens_stemmed,tokens_lemmatized
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione...","[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook...","[one, reviewer, mentioned, watching, oz, episo..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, ., <, br, /...","[a, wonderful, little, production, ., <, br, /...","[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq...","[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,...","[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there, 's, a, family, where, a, li...","[basically, there, 's, a, family, where, a, li...","[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi...","[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei, 's, ``, Love, in, the, Time, ...","[petter, mattei, 's, ``, love, in, the, time, ...","[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visually, ..."
5,"Probably my all-time favorite movie, a story o...",positive,"[Probably, my, all-time, favorite, movie, ,, a...","[probably, my, all-time, favorite, movie, ,, a...","[probably, favorite, movie, story, selflessnes...","[probabl, favorit, movi, stori, selfless, sacr...","[probably, favorite, movie, story, selflessnes..."
6,I sure would like to see a resurrection of a u...,positive,"[I, sure, would, like, to, see, a, resurrectio...","[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah...","[sure, would, like, see, resurrection, dated, ..."
7,"This show was an amazing, fresh & innovative i...",negative,"[This, show, was, an, amazing, ,, fresh, &, in...","[this, show, was, an, amazing, ,, fresh, &, in...","[show, amazing, fresh, innovative, idea, first...","[show, amaz, fresh, innov, idea, first, air, f...","[show, amazing, fresh, innovative, idea, first..."
8,Encouraged by the positive comments about this...,negative,"[Encouraged, by, the, positive, comments, abou...","[encouraged, by, the, positive, comments, abou...","[encouraged, positive, comments, film, looking...","[encourag, posit, comment, film, look, forward...","[encouraged, positive, comment, film, looking,..."
9,If you like original gut wrenching laughter yo...,positive,"[If, you, like, original, gut, wrenching, laug...","[if, you, like, original, gut, wrenching, laug...","[like, original, gut, wrenching, laughter, lik...","[like, origin, gut, wrench, laughter, like, mo...","[like, original, gut, wrenching, laughter, lik..."
