In [282]:
import pandas as pd
import numpy as np

In [283]:
df = pd.read_csv('./data/imdb_master.csv',encoding="ISO-8859-1")

In [284]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [285]:
#remove columns that are not required
df.drop(['Unnamed: 0','file'], axis=1, inplace=True)
df.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [286]:
#split dataframe into train and test sets

train = df[df['type']== 'train']
print('No. of observations in train set: ' + str(len(train)))

test = df[df['type']== 'test']
print('No. of observations in test set: ' + str(len(test)))


No. of observations in train set: 75000
No. of observations in test set: 25000


In [287]:
# remove unlabelled samples

train = train[train['label'] != 'unsup']
test = test[test['label'] != 'unsup']

print('No. of observations in train set after removing unlabelled samples: ' + str(len(train)))
print('No. of observations in test set after removing unlabelled samples: ' + str(len(test)))

No. of observations in train set after removing unlabelled samples: 25000
No. of observations in test set after removing unlabelled samples: 25000


### Text preprocessing

In [288]:
train.head()

Unnamed: 0,type,review,label
25000,train,Story of a man who has unnatural feelings for ...,neg
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg
25002,train,This film lacked something I couldn't put my f...,neg
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg
25004,train,When I was little my parents took me along to ...,neg


In [289]:
train['label'].value_counts()

neg    12500
pos    12500
Name: label, dtype: int64

In [290]:
#lower casing
def lower_case(text):
    return text.lower()
train['review'] = train['review'].apply(lambda x: lower_case(x))
train.head()

Unnamed: 0,type,review,label
25000,train,story of a man who has unnatural feelings for ...,neg
25001,train,airport '77 starts as a brand new luxury 747 p...,neg
25002,train,this film lacked something i couldn't put my f...,neg
25003,train,"sorry everyone,,, i know this is supposed to b...",neg
25004,train,when i was little my parents took me along to ...,neg


In [291]:
train.iloc[24993,1]

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.<br /><br />i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.<br /><br />it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.<br /><br />hence, for the children, a 9/10 from me."

In [292]:
import re

def remove_htmltags(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

remove_htmltags(train.iloc[24993,1])

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a 9/10 from me."

In [293]:
train['review'] = train['review'].apply(lambda x: remove_htmltags(x))
train.iloc[24993,1]

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a 9/10 from me."

In [299]:
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

train['review'] = train['review'].apply(lambda x: remove_numbers(x))
train.iloc[24993,1]


"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a / from me."

In [300]:
import string

def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p

train['review'] = train['review'].apply(lambda x: remove_punctuation(x))
train.head()

Unnamed: 0,type,review,label
25000,train,story of a man who has unnatural feelings for ...,neg
25001,train,airport starts as a brand new luxury plane i...,neg
25002,train,this film lacked something i couldnt put my fi...,neg
25003,train,sorry everyone i know this is supposed to be a...,neg
25004,train,when i was little my parents took me along to ...,neg


In [301]:
train.iloc[24993,1]

'i have not read the other comments on the film but judging from the average rating i can see that they are unlikely to be very complementaryi watched it for the second time with my children they absolutely loved it true it did not have the adults rolling around the floor but the sound of the childrens enjoyment made it seem soit is a true mel brooks farce with plenty of moral content  how sad it is to be loved for our money not for whom we are and how fickle are our friends and associates there are many other films on a similar subject matter no doubt many of which will have a greater comic or emotional impact on adults its hard for me to imagine such an impact on the junior members of the family howeverhence for the children a  from me'

In [302]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words

train['review'] = train['review'].apply(lambda x: tokenize(x))
train.head()

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,type,review,label
25000,train,"[story, of, a, man, who, has, unnatural, feeli...",neg
25001,train,"[airport, starts, as, a, brand, new, luxury, p...",neg
25002,train,"[this, film, lacked, something, i, couldnt, pu...",neg
25003,train,"[sorry, everyone, i, know, this, is, supposed,...",neg
25004,train,"[when, i, was, little, my, parents, took, me, ...",neg


In [303]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)
def remove_stopwords(text):
    filtered_words = [word for word in text if word not in stop_words]
    return filtered_words



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [304]:
train['review'] = train['review'].apply(lambda x: remove_stopwords(x))
train.head()

Unnamed: 0,type,review,label
25000,train,"[story, man, unnatural, feelings, pig, starts,...",neg
25001,train,"[airport, starts, brand, new, luxury, plane, l...",neg
25002,train,"[film, lacked, something, couldnt, put, finger...",neg
25003,train,"[sorry, everyone, know, supposed, art, film, w...",neg
25004,train,"[little, parents, took, along, theater, see, i...",neg


In [305]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    stemmed = [stemmer.stem(word) for word in text]
    return stemmed

train['review'] = train['review'].apply(lambda x: stem_words(x))
train.head()

Unnamed: 0,type,review,label
25000,train,"[stori, man, unnatur, feel, pig, start, open, ...",neg
25001,train,"[airport, start, brand, new, luxuri, plane, lo...",neg
25002,train,"[film, lack, someth, couldnt, put, finger, fir...",neg
25003,train,"[sorri, everyon, know, suppos, art, film, wow,...",neg
25004,train,"[littl, parent, took, along, theater, see, int...",neg


In [306]:
# def remove_numbers(text):
#     return [word for word in text if not word.isdigit()]
# train['review'] = train['review'].apply(lambda x: remove_numbers(x))
# train.head()

In [236]:
a = train.iloc[0,1]
b = train.iloc[1,1]
c = list(set(a+b))
len(c)

357

In [237]:
len(a)

63

In [238]:
len(b)

434

In [307]:
all_reviews = train['review'].tolist()
vocab = [item for sublist in all_reviews for item in sublist ]
len(vocab)
    

3000532

In [309]:
vocab = list(set(vocab))
len(vocab)

109953

In [310]:
train_pos = train[train['label']== 'pos']
train_neg = train[train['label']== 'neg']

In [311]:
bag_vec = np.zeros(shape=(1,len(vocab)))

bag_vec.sum()

0.0

In [312]:
a = train.iloc[5,1]
for word in a:
    for i,token in enumerate(vocab):
        if token == word:
            bag_vec[0,i] += 1
        
        
    

In [317]:
list(train['review'])

[['stori',
  'man',
  'unnatur',
  'feel',
  'pig',
  'start',
  'open',
  'scene',
  'terrif',
  'exampl',
  'absurd',
  'comedi',
  'formal',
  'orchestra',
  'audienc',
  'turn',
  'insan',
  'violent',
  'mob',
  'crazi',
  'chant',
  'singer',
  'unfortun',
  'stay',
  'absurd',
  'whole',
  'time',
  'gener',
  'narr',
  'eventu',
  'make',
  'put',
  'even',
  'era',
  'turn',
  'cryptic',
  'dialogu',
  'would',
  'make',
  'shakespear',
  'seem',
  'easi',
  'third',
  'grader',
  'technic',
  'level',
  'better',
  'might',
  'think',
  'good',
  'cinematographi',
  'futur',
  'great',
  'vilmo',
  'zsigmond',
  'futur',
  'star',
  'salli',
  'kirkland',
  'freder',
  'forrest',
  'seen',
  'briefli'],
 ['airport',
  'start',
  'brand',
  'new',
  'luxuri',
  'plane',
  'load',
  'valuabl',
  'paint',
  'belong',
  'rich',
  'businessman',
  'philip',
  'steven',
  'jame',
  'stewart',
  'fli',
  'bunch',
  'vip',
  'estat',
  'prepar',
  'open',
  'public',
  'museum',
  'a

In [318]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(text):
    return text


all_reviews = list(train['review'])

cv = CountVectorizer(tokenizer=dummy,preprocessor=dummy, lowercase=False)
cv.fit(all_reviews)
len(cv.get_feature_names())

109953

In [328]:
skvocab = cv.get_feature_names()

In [259]:
len(train.iloc[5,1])

54