In [4]:
import pandas as pd
import numpy as np
import nltk, keras, string, re, html, math

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics import accuracy_score, classification_report

In [5]:
#Loads the IMDB dataset. We load it using pandas as dataframe
data = pd.read_csv('/home/jadebski/PycharmProjects/IUI_ED/IMDB Dataset.csv')
print("Data shape - ", data.shape, "\n")                                  #prints the number of rows and columns

for col in data.columns:
    print("The number of null values - ", col, data[col].isnull().sum())
#prints the number of null values in each column

data["review"]= data["review"].str.lower()
data["sentiment"]= data["sentiment"].str.lower()             #converts every value in the column to lowercase
data.head()

Data shape -  (50000, 2) 

The number of null values -  review 0
The number of null values -  sentiment 0


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [8]:
def cleaning(data):
    clean = re.sub('<.*?>', ' ', str(data))
#removes HTML tags
    clean = re.sub('\'.*?\s',' ', clean)
#removes all hanging letters afer apostrophes (s in it's)
    clean = re.sub(r'http\S+',' ', clean)
#removes URLs
    clean = re.sub('\W+',' ', clean)
#replacing the non alphanumeric characters
    return html.unescape(clean)
data['cleaned'] = data['review'].apply(cleaning)


def tokenizing(data):
    review = data['cleaned']
#tokenizing is done
    tokens = nltk.word_tokenize(review)
    return tokens
data['tokens'] = data.apply(tokenizing, axis=1)


stop_words = set(stopwords.words('english'))
def remove_stops(data):
    my_list = data['tokens']
    meaningful_words = [w for w in my_list if not w in stop_words]           #stopwords are removed from the tokenized data
    return (meaningful_words)
data['tokens'] = data.apply(remove_stops, axis=1)


lemmatizer = WordNetLemmatizer()
def lemmatizing(data):
    my_list = data['tokens']
    lemmatized_list = [lemmatizer.lemmatize(word) for word in my_list]
#lemmatizing is performed. It's more efficient than stemming.
    return (lemmatized_list)
data['tokens'] = data.apply(lemmatizing, axis=1)

def rejoin_words(data):
    my_list = data['tokens']
    joined_words = ( " ".join(my_list))
#rejoins all stemmed words
    return joined_words
data['cleaned'] = data.apply(rejoin_words, axis=1)

data.head()

Unnamed: 0,review,sentiment,cleaned,tokens
0,one of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode h...,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,a wonderful little production. <br /><br />the...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,"[basically, family, little, boy, jake, think, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,petter mattei love time money visually stunnin...,"[petter, mattei, love, time, money, visually, ..."


Unnamed: 0,review,sentiment,cleaned,tokens
0,one of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode h...,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,a wonderful little production. <br /><br />the...,positive,wonderful little production filming technique ...,"[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,"[basically, family, little, boy, jake, think, ..."
4,"petter mattei's ""love in the time of money"" is...",positive,petter mattei love time money visually stunnin...,"[petter, mattei, love, time, money, visually, ..."


In [9]:
# Prints statistics of Data like avg length of sentence , proportion of data w.r.t class labels
def sents(data):
    clean = re.sub('<.*?>', ' ', str(data))
#removes HTML tags
    clean = re.sub('\'.*?\s',' ', clean)
#removes all hanging letters afer apostrophes (s in it's)
    clean = re.sub(r'http\S+',' ', clean)
#removes URLs
    clean = re.sub('[^a-zA-Z0-9\.]+',' ', clean)
#removes all non-alphanumeric characters except periods.
    tokens = nltk.sent_tokenize(clean)
#sentence tokenizing is done
    return tokens
sents = data['review'].apply(sents)

length_s = 0
for i in range(data.shape[0]):
    length_s+= len(sents[i])
print("The number of sentences is - ", length_s)
#prints the number of sentences

length_t = 0
for i in range(data.shape[0]):
    length_t+= len(data['tokens'][i])
print("\nThe number of tokens is - ", length_t)
#prints the number of tokens

average_tokens = round(length_t/length_s)
print("\nThe average number of tokens per sentence is - ", average_tokens)
#prints the average number of tokens per sentence

positive = negative = 0
for i in range(data.shape[0]):
    if (data['sentiment'][i]=='positive'):
        positive += 1
#finds the proprtion of positive and negative sentiments
    else:
        negative += 1

print("\nThe number of positive examples are - ", positive)
print("\nThe number of negative examples are - ", negative)
print("\nThe proportion of positive to negative sentiments are -", positive/negative)

The number of sentences is -  542611

The number of tokens is -  5961690

The average number of tokens per sentence is -  11

The number of positive examples are -  25000

The number of negative examples are -  25000

The proportion of positive to negative sentiments are - 1.0


In [10]:
# gets reviews column from df
reviews = data['cleaned'].values

# gets labels column from df
labels = data['sentiment'].values
# Uses label encoder to encode labels. Convert to 0/1
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
data['encoded']= encoded_labels
print(data['encoded'].head())

# prints(enc.classes_)
encoder_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("\nThe encoded classes are - ", encoder_mapping)

labels = data['encoded']

0    1
1    1
2    1
3    0
4    1
Name: encoded, dtype: int64

The encoded classes are -  {'negative': 0, 'positive': 1}


In [11]:
# Splits the data into train and test (80% - 20%).
# Uses stratify in train_test_split so that both train and test have similar ratio of positive and negative samples.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, labels, test_size=0.2, random_state=42, stratify=labels)

In [12]:
# Uses Count vectorizer to get frequency of the words
vectorizer = CountVectorizer(max_features = 3000)

sents_encoded = vectorizer.fit_transform(train_sentences)         #encodes all training sentences
counts = sents_encoded.sum(axis=0).A1
vocab = list(vectorizer.get_feature_names())



In [13]:
class MultinomialNaiveBayes:

    def __init__(self, classes, tokenizer):
      #self.tokenizer = tokenizer
      self.classes = classes

    def group_by_class(self, X, y):
      data = dict()
      for c in self.classes:
#grouping by positive and negative sentiments
        data[c] = X[np.where(y == c)]
      return data

    def fit(self, X, y):
        self.n_class_items = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = vocab
#using the pre-made vocabulary of 3000 most frequent training words

        n = len(X)

        grouped_data = self.group_by_class(X, y)

        for c, data in grouped_data.items():
          self.n_class_items[c] = len(data)
          self.log_class_priors[c]=math.log(self.n_class_items[c]/n)
#taking log for easier calculation
          self.word_counts[c] = defaultdict(lambda: 0)

          for text in data:
            counts = Counter(nltk.word_tokenize(text))
            for word, count in counts.items():
                self.word_counts[c][word] += count

        return self
    def laplace_smoothing(self, word, text_class):          #smoothing
      num = self.word_counts[text_class][word] + 1
      denom = self.n_class_items[text_class] + len(self.vocab)
      return math.log(num / denom)

    def predict(self, X):
        result = []
        for text in X:

          class_scores = {c: self.log_class_priors[c] for c in self.classes}

          words = set(nltk.word_tokenize(text))
          for word in words:
              if word not in self.vocab: continue

              for c in self.classes:

                log_w_given_c = self.laplace_smoothing(word, c)
                class_scores[c] += log_w_given_c

          result.append(max(class_scores, key=class_scores.get))

        return result

In [14]:
MNB = MultinomialNaiveBayes(
    classes=np.unique(labels),
    tokenizer=Tokenizer()
).fit(train_sentences, train_labels)

# Tests the model on test set and reports the Accuracy
predicted_labels = MNB.predict(test_sentences)
print("The accuracy of the MNB classifier is ", accuracy_score(test_labels, predicted_labels))
print("\nThe classification report with metrics - \n", classification_report(test_labels, predicted_labels))

The accuracy of the MNB classifier is  0.8533

The classification report with metrics - 
               precision    recall  f1-score   support

           0       0.86      0.85      0.85      5000
           1       0.85      0.86      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

