In [1]:
import pandas as pd
import numpy as np

In [2]:
temp_df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df = temp_df.iloc[:10000]

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [6]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [8]:
df.duplicated().sum()

17

In [9]:
df = df.drop_duplicates()

In [10]:
df.duplicated().sum()

0

In [11]:
### Basic Preprocessing
### Remove Tags, Lowercase, Stopwords

import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'),'',raw_text)
    return cleaned_text

In [12]:
df['review'] = df['review'].apply(remove_tags)

In [13]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
df['review'] = df['review'].apply(lambda x:x.lower())

In [15]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [16]:
from nltk.corpus import stopwords
sw_list = stopwords.words('english')
print(sw_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [17]:
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [18]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [19]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [20]:
X

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
9995,"fun, entertaining movie wwii german spy (julie..."
9996,"give break. anyone say ""good hockey movie""? kn..."
9997,movie bad movie. watching endless series bad h...
9998,"movie probably made entertain middle school, e..."


In [21]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y =le.fit_transform(y)

In [23]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [25]:
X_train.shape,X_test.shape

((7986, 1), (1997, 1))

In [26]:
### Applying BOW
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()

In [None]:
X_train_bow = bow.fit_transform(X_train['review']).toarray() ## since countvectorizer expects 1D input so we need to specify the column.
X_test_bow = bow.transform(X_test['review']).toarray() ## bow gives us a sparse matrix which needs to be converted into dense array

In [None]:
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()
## gaussian algorithm does not accept the sparse matrix so we convert it to array explicitly.
gb.fit(X_train_bow,y_train)

In [31]:
y_pred = gb.predict(X_test_bow)

In [32]:
from sklearn.metrics import accuracy_score,confusion_matrix
acc_score = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print(acc_score)
print(cm)

0.6324486730095142
[[717 235]
 [499 546]]


In [33]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred1 = rf.predict(X_test_bow)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))

0.8542814221331998
[[815 137]
 [154 891]]


In [34]:
## Using different n-grams
bow = CountVectorizer(ngram_range=(1,2))
X_train_bow = bow.fit_transform(X_train['review']).toarray() 
X_test_bow = bow.transform(X_test['review']).toarray()
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred1 = rf.predict(X_test_bow)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
## This is causing error because the feature size is increasing as with (1,2) it accumulates both uni and bigram in the vocabulary which becomes huge.

MemoryError: Unable to allocate 44.5 GiB for an array with shape (7986, 748659) and data type int64

In [None]:
## To help with the above issue we will take only top 5000 features.
bow = CountVectorizer(ngram_range=(1,2),max_features=5000)
X_train_bow = bow.fit_transform(X_train['review']) 
X_test_bow = bow.transform(X_test['review'])
rf = RandomForestClassifier()
rf.fit(X_train_bow,y_train)
y_pred1 = rf.predict(X_test_bow)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
## Not much improvement so we can use different hyperparameter tuning on rf model and look with the text cleaning and such to improve accuracy.
### Random forest accepts sparse matrix in its fit method so we don't need to convert it to arrays, but it also works with arrays.

0.8387581372058087
[[802 150]
 [172 873]]


### Using TFIDF

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()

In [37]:
x_train_tf = tf.fit_transform(X_train['review'])
X_test_tf = tf.transform(X_test['review'])

In [38]:
rf.fit(x_train_tf,y_train)
y_pred1 = rf.predict(X_test_tf)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))

0.8407611417125689
[[809 143]
 [175 870]]


In [39]:
### TFIDF is generally used for Information Retrieval but we can try on classification as well.