In [2]:
import pandas as pd
import numpy as np
import nltk

In [3]:
review_data = pd.read_csv('IMDB Dataset.csv',nrows=10000)

In [4]:
review_data.shape

(10000, 2)

In [5]:
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 78.2+ KB


In [6]:
review_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
import re

In [8]:
def data_cleaning(text):
    clean_text = re.sub('[^A-Za-z]+'," ",text)
    return clean_text

In [9]:
review_data["review"]=review_data["review"].apply(lambda text: data_cleaning(text))

In [10]:
review_data["review"]=review_data["review"].apply(lambda text: text.lower())

In [11]:
review_data['token_text'] = review_data['review'].apply(lambda text : text.split())

In [12]:
review_data

Unnamed: 0,review,sentiment,token_text
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production br br the filmin...,positive,"[a, wonderful, little, production, br, br, the..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there s a family where a little boy ...,negative,"[basically, there, s, a, family, where, a, lit..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, s, love, in, the, time, of, m..."
...,...,...,...
9995,fun entertaining movie about wwii german spy j...,positive,"[fun, entertaining, movie, about, wwii, german..."
9996,give me a break how can anyone say that this i...,negative,"[give, me, a, break, how, can, anyone, say, th..."
9997,this movie is a bad movie but after watching a...,negative,"[this, movie, is, a, bad, movie, but, after, w..."
9998,this is a movie that was probably made to ente...,negative,"[this, is, a, movie, that, was, probably, made..."


In [13]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
def remove_stopwords(token_text):
    words=[]
    for word in token_text:
        if word not in stop_words:
            words.append(word)
            
    return words

In [17]:
review_data['token_text'] = review_data['token_text'].apply(lambda token_text : remove_stopwords(token_text))

In [18]:
review_data

Unnamed: 0,review,sentiment,token_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ..."
...,...,...,...
9995,fun entertaining movie about wwii german spy j...,positive,"[fun, entertaining, movie, wwii, german, spy, ..."
9996,give me a break how can anyone say that this i...,negative,"[give, break, anyone, say, good, hockey, movie..."
9997,this movie is a bad movie but after watching a...,negative,"[movie, bad, movie, watching, endless, series,..."
9998,this is a movie that was probably made to ente...,negative,"[movie, probably, made, entertain, middle, sch..."


In [19]:
from nltk.stem import PorterStemmer    
ps = PorterStemmer()

In [20]:
def porter_stemming(token_text):
    
    stem_words=[]
    for word in token_text:
        stem_word = ps.stem(word)
        stem_words.append(stem_word)
        
    return stem_words

In [21]:
review_data["stem_words"]=review_data["token_text"].apply(lambda token_text : porter_stemming(token_text))

In [22]:
review_data

Unnamed: 0,review,sentiment,token_text,stem_words
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st..."
...,...,...,...,...
9995,fun entertaining movie about wwii german spy j...,positive,"[fun, entertaining, movie, wwii, german, spy, ...","[fun, entertain, movi, wwii, german, spi, juli..."
9996,give me a break how can anyone say that this i...,negative,"[give, break, anyone, say, good, hockey, movie...","[give, break, anyon, say, good, hockey, movi, ..."
9997,this movie is a bad movie but after watching a...,negative,"[movie, bad, movie, watching, endless, series,...","[movi, bad, movi, watch, endless, seri, bad, h..."
9998,this is a movie that was probably made to ente...,negative,"[movie, probably, made, entertain, middle, sch...","[movi, probabl, made, entertain, middl, school..."


In [23]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wl=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashish\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [25]:
def wordnet_lemma(token_text):
    lemma_words=[]
    for word in token_text:
        lemma_word = wl.lemmatize(word,pos='v')
        lemma_words.append(lemma_word)
        
    return lemma_words

In [26]:
review_data["lemma_words"]=review_data["token_text"].apply(lambda token_text : wordnet_lemma(token_text))

In [27]:
review_data

Unnamed: 0,review,sentiment,token_text,stem_words,lemma_words
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook...","[one, reviewers, mention, watch, oz, episode, ..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq...","[wonderful, little, production, br, br, film, ..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...","[think, wonderful, way, spend, time, hot, summ..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi...","[basically, family, little, boy, jake, think, ..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st...","[petter, mattei, love, time, money, visually, ..."
...,...,...,...,...,...
9995,fun entertaining movie about wwii german spy j...,positive,"[fun, entertaining, movie, wwii, german, spy, ...","[fun, entertain, movi, wwii, german, spi, juli...","[fun, entertain, movie, wwii, german, spy, jul..."
9996,give me a break how can anyone say that this i...,negative,"[give, break, anyone, say, good, hockey, movie...","[give, break, anyon, say, good, hockey, movi, ...","[give, break, anyone, say, good, hockey, movie..."
9997,this movie is a bad movie but after watching a...,negative,"[movie, bad, movie, watching, endless, series,...","[movi, bad, movi, watch, endless, seri, bad, h...","[movie, bad, movie, watch, endless, series, ba..."
9998,this is a movie that was probably made to ente...,negative,"[movie, probably, made, entertain, middle, sch...","[movi, probabl, made, entertain, middl, school...","[movie, probably, make, entertain, middle, sch..."


In [28]:
review_data['cleaned_text'] = review_data['stem_words'].apply(lambda text: ' '.join(text))

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
review_vectorizer = CountVectorizer()

review_features   = review_vectorizer.fit_transform(review_data['cleaned_text'])
review_features.get_shape()

(10000, 34935)

In [30]:
review_features

<10000x34935 sparse matrix of type '<class 'numpy.int64'>'
	with 935453 stored elements in Compressed Sparse Row format>

In [32]:
X = review_features    #### Features
y = review_data['sentiment']  #### Target

In [33]:
#Using the train_test_split to create train and test sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

In [34]:
print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)

Training set : (7500, 34935)
Testing set : (2500, 34935)


In [35]:
#Importing the Decision tree classifier from the sklearn library.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')

In [36]:
#Training the decision tree classifier. 
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [37]:
#Predicting labels on the test set.
y_pred =  clf.predict(X_test)

In [38]:
#Importing the accuracy metric from sklearn.metrics library

from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.7164


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_review_vectorizer = TfidfVectorizer()
tfidf_review_features = tfidf_review_vectorizer.fit_transform(review_data['cleaned_text'])

In [40]:
tfidf_review_features.shape

(10000, 34935)

In [43]:
X = review_features    #### Features
y = review_data['sentiment']  #### Target

#Using the train_test_split to create train and test sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)

#Importing the Decision tree classifier from the sklearn library.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')

#Training the decision tree classifier. 
clf.fit(X_train, y_train)

#Predicting labels on the test set.
y_pred =  clf.predict(X_test)

#Importing the accuracy metric from sklearn.metrics library

from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Training set : (7500, 34935)
Testing set : (2500, 34935)
Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.7108


In [45]:
print(review_data['review'][9996])

give me a break how can anyone say that this is a good hockey movie i know that movies tend to do a pretty p poor job of portraying hockey to the general public and yes this was made back when the u s hadn t embraced our sport to the extent is has today but really i have played hockey all my life and have watched even more and this my friends is sheer lunacy the scenes on the ice were stupefyingly bizzare the particular instance to which i am referring is the sword fight er i mean the stick fight at the end of the film during which everyone is just standing around and watching not with fascination that this is actually happening but in wonder as to who will win the duel between youngblood and his nemesis rakkie yes the story off the ice is a little better i do stress little br br i don t know maybe there is no point in going on i mean let s face it the film is right hockey is just one big battle on ice oh yeah with a little piece of vulcanized rubber bouncing around occasionally into w

In [46]:
clf.predict(X_test[-4])

array(['negative'], dtype=object)