In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import gensim
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
data.duplicated().sum()

418

In [7]:
data.drop_duplicates(inplace=True)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49582 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [9]:
df = data.sample(25000)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 15255 to 49866
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 585.9+ KB


In [11]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [12]:
def preprocess(review):
    
    #lowercase
    review = review.lower()
    
    #removal of HTML tags
    review = BeautifulSoup(review).text
    
    #removal of punctuations
    for i in string.punctuation:
        review = review.replace(i,'')   
    
    #removal of non-alphabetic characters
    review = review.replace('[^a-zA-Z]', '')
    review = review.replace('\s+',' ')
    
    #tokenization
    token_list = word_tokenize(review)
    
    #removal of stop words
    ans = []
    for word in token_list:
        if word not in stop_words:
            ans.append(word)
    
    #stemming
    temp_list = []
    for word in ans:
        temp_list.append(stemmer.stem(word))
    final_ans = ' '.join(temp_list)
    
    return final_ans

In [13]:
df['preprocessed_review'] = df['review'].apply(preprocess)

In [14]:
df.head()

Unnamed: 0,review,sentiment,preprocessed_review
15255,The Pallbearer is a disappointment and at time...,negative,pallbear disappoint time extrem bore love stor...
3747,"The film is a joy to watch, not just for the p...",positive,film joy watch plot grip also superb perform a...
43978,"It was a good story, but not very well told. I...",negative,good stori well told like theme main stori lin...
510,Chinese Ghost Story III is a totally superfluo...,negative,chines ghost stori iii total superflu sequel t...
44628,Hoot is a nice young person's film about a gro...,positive,hoot nice young person film group middl school...


In [15]:
model = Word2Vec(min_count=2, window=10)

In [16]:
story = []

for doc in df['preprocessed_review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [17]:
model.build_vocab(story)

In [18]:
model.train(story, epochs=model.epochs, total_examples=model.corpus_count)

(13727450, 14841565)

In [19]:
model.wv.most_similar("good")

[('decent', 0.7323631644248962),
 ('great', 0.6552632451057434),
 ('bad', 0.6402526497840881),
 ('nice', 0.6349842548370361),
 ('fine', 0.5863806009292603),
 ('averag', 0.5778854489326477),
 ('cool', 0.5754561424255371),
 ('okay', 0.5690698623657227),
 ('ok', 0.5480450987815857),
 ('alright', 0.542766273021698)]

In [20]:
model.wv.most_similar("bad")

[('terribl', 0.7194075584411621),
 ('aw', 0.7070541381835938),
 ('horribl', 0.7054831385612488),
 ('suck', 0.6765496730804443),
 ('crappi', 0.6556411385536194),
 ('good', 0.6402526497840881),
 ('lousi', 0.624571681022644),
 ('cheesi', 0.6093074679374695),
 ('poor', 0.6075887680053711),
 ('lame', 0.6057199239730835)]

In [21]:
len(model.wv.index_to_key)

40130

In [22]:
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x22c8e669d20>

In [23]:
def document_vector(doc):
    doc = [word for word in doc.split(" ") if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis = 0)

In [24]:
X = []

for doc in df['preprocessed_review'].values:
    X.append(document_vector(doc))

In [25]:
X = np.array(X)

In [26]:
X.shape

(25000, 100)

In [27]:
df['sentiment'] = LabelEncoder().fit_transform(df['sentiment'])

In [28]:
y = df['sentiment'].values

In [29]:
y.shape

(25000,)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [31]:
xgb = XGBClassifier()

In [32]:
xgb.fit(X_train,y_train)

In [33]:
y_pred = xgb.predict(X_test)

In [34]:
accuracy_score(y_test, y_pred)

0.841

In [35]:
confusion_matrix(y_test, y_pred)

array([[2129,  412],
       [ 383, 2076]], dtype=int64)

In [36]:
def sentiment_decoder(sent):
    sent = preprocess(sent)
    X = []
    X.append(document_vector(sent))
    X = np.array(X)
    if xgb.predict(X) == 0:
        return "Negative"
    else:
        return "Positive"

In [37]:
sentiment_decoder("The best movie in history and the best ending in any entertainment business")

'Positive'

In [38]:
sentiment_decoder("I have no idea, why this movie has received such a good rating. It´s an average prison movie without any highlights. Typical Hollywood stuff. There are hundreds of better movies around. The movie drags, the actors are average. It´s just this typical 'heartbreaking' stuff and far away from other movies up on this list.")

'Negative'