In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# Data Acquistion

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data.duplicated().sum()

418

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
df = data[:20000]

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 20074
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     20000 non-null  object
 1   sentiment  20000 non-null  object
dtypes: object(2)
memory usage: 468.8+ KB


# Text Preprocessing
### - lowercase
### - removal of HTML Tags
### - removal of punctuations
### - removal of non-alphabetic characters
### - tokenization
### - removal of stop words
### - stemming

In [11]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [12]:
def preprocess(review):
    
    #lowercase
    review = review.lower()
    
    #removal of HTML tags
    review = BeautifulSoup(review).text
    
    #removal of punctuations
    for i in string.punctuation:
        review = review.replace(i,'')   
    
    #removal of non-alphabetic characters
    review = review.replace('[^a-zA-Z]', '')
    review = review.replace('\s+',' ')
    
    #tokenization
    token_list = word_tokenize(review)
    
    #removal of stop words
    ans = []
    for word in token_list:
        if word not in stop_words:
            ans.append(word)
    
    #stemming
    temp_list = []
    for word in ans:
        temp_list.append(stemmer.stem(word))
    final_ans = ' '.join(temp_list)
    
    return final_ans

In [13]:
df['preprocessed_review'] = df['review'].apply(preprocess)

In [14]:
df.head()

Unnamed: 0,review,sentiment,preprocessed_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch 1 oz episod youll hoo...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


In [15]:
cv = CountVectorizer(max_features=5000)

In [16]:
X = cv.fit_transform(df['preprocessed_review']).toarray()

In [17]:
new_df = df.copy()

In [18]:
test_df = pd.concat([new_df, pd.DataFrame(X, index=new_df.index)], axis=1)

In [19]:
test_df.shape

(20000, 5003)

In [20]:
test_df['sentiment'] = LabelEncoder().fit_transform(test_df['sentiment'])

In [21]:
y = test_df['sentiment'].values

In [22]:
y.shape

(20000,)

In [23]:
X = test_df.drop(['review', 'preprocessed_review', 'sentiment'], axis=1)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [25]:
classifier = RandomForestClassifier(n_estimators=100, random_state=2)

In [26]:
classifier.fit(X_train,y_train)

In [27]:
y_pred = classifier.predict(X_test)

In [28]:
accuracy_score(y_test, y_pred)

0.84275

In [29]:
confusion_matrix(y_test, y_pred)

array([[1742,  301],
       [ 328, 1629]], dtype=int64)

In [30]:
def sentiment_predictor(review):
    review = preprocess(review)
    review = cv.transform([review]).toarray()
    if classifier.predict(review) == 0:
        return "Negative"
    else:
        return "Positive"

In [31]:
sentiment_predictor("This is one of the worst directoral venture of Srijit Mukhopadhyay.. He tried to capitalise only on the Karishma of the Mahanayak, but failed miserably... The highlight of the movie is extremely weak storyline and even weaker technology usage... To make such an absurd storyline believable, the kind of technological support and mixing effect required, is totally missing in the film..")

'Negative'