### Importing necessary libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import nltk
# Fetch every NLTK resource requested by this notebook, stop words first,
# with one download call per resource.
for dependency in (
    "stopwords",
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
import re
import string
from sklearn.base import TransformerMixin

In [None]:
def print_confusion_matrix(
    confusion_matrix,
    class_names,
    figsize = (11,7),
    fontsize=9,
    ylabel='True label',
    xlabel='Predicted label',
):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        confusion_matrix: 2-D array-like of integer counts. It is wrapped in a
            DataFrame whose rows and columns are both labelled with
            ``class_names``.
        class_names: Labels used for the row index and the column index.
        figsize: ``(width, height)`` passed to ``plt.figure``.
        fontsize: Font size applied to the tick labels on both axes.
        ylabel: Label of the y axis.
        xlabel: Label of the x axis.

    Raises:
        ValueError: If seaborn cannot render the cells with the integer
            format ``"d"``. The original error is attached as ``__cause__``.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Do not leave the empty figure created above open when drawing fails.
        plt.close(fig)
        raise ValueError("Confusion matrix values must be integers") from err

    # Keep tick labels horizontal on both axes.
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)

In [None]:
def evaluate_model(model, X, y, X_test, y_test, target_names=None):
    """Print an accuracy summary, classification report and confusion matrix.

    Args:
        model: An already fitted classifier; ``model.predict`` is called on
            ``X_test`` and ``model.classes_`` is read when ``target_names``
            is not given.
        X: Training features, used for 5-fold cross-validated accuracy.
        y: Training labels matching ``X``.
        X_test: Held-out features.
        y_test: Held-out labels matching ``X_test``.
        target_names: Display names for the classes in the report and in the
            confusion-matrix plot. Defaults to ``model.classes_``.
    """
    # Cross-validated accuracy on the training data.
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

    # Score the fitted model itself on the held-out data. Cross-validating on
    # the test set would refit the estimator on test folds and would not
    # measure the model that was passed in. Predict once and reuse the result.
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy train: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    print("Accuracy test: %0.2f" % test_accuracy)

    print("Test classification report: ")
    if target_names is None:
        target_names = model.classes_
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("Test confusion matrix: ")
    print_confusion_matrix(confusion_matrix(y_test, y_pred), class_names=target_names)

### Importing the dataset of IMDB reviews with their corresponding sentiment labels.

In [None]:
# Read the IMDB reviews CSV (path is relative to the working directory) into a DataFrame.
imdb_data = pd.read_csv("IMDB Dataset.csv")

In [None]:
# Preview the first 10 rows of the dataset.
imdb_data.head(10)

In [None]:
# Count the rows per sentiment label to check that the classes are balanced.
imdb_data.groupby(['sentiment']).agg('count')

In [None]:
# Bar chart of the number of reviews per sentiment label.
ax=imdb_data.sentiment.value_counts().plot(kind='bar')

### Preprocessing the data using spaCy.

In [None]:
# Load the spaCy English pipeline "en_core_web_lg" (tokenizer, tagger, parser,
# NER and word vectors) and bind it to the module-level name `nlp`.
# Earlier variant, kept for reference:
#nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

nlp = spacy.load("en_core_web_lg")

In [None]:
# Show the raw text of the review stored under index label 0.
imdb_data.review[0]

In [None]:
# Union of the NLTK, scikit-learn and spaCy English stop-word lists, plus the
# extra fragments "n't", "'s", "'m" and "ca".
STOPLIST = (
    set(stopwords.words('english'))
    | {"n't", "'s", "'m", "ca"}
    | set(ENGLISH_STOP_WORDS)
    | set(STOP_WORDS)
)

In [None]:
class TextPreprocessor(TransformerMixin):
    """Clean one text column of a DataFrame with spaCy.

    Each text is run through the module-level ``nlp`` pipeline. Punctuation,
    stop words (``STOPLIST`` plus spaCy's own flag), non-alphabetic tokens and
    out-of-vocabulary tokens are dropped. The result is rebuilt as a
    space-separated string of lemmas.

    Args:
        text_attribute: Name of the column that holds the raw text.
    """

    def __init__(self, text_attribute):
        self.text_attribute = text_attribute

    def transform(self, X, *_):
        """Return a copy of ``X`` whose text column has been preprocessed.

        ``X`` itself is not modified.
        """
        X_copy = X.copy()
        X_copy[self.text_attribute] = X_copy[self.text_attribute].apply(self._preprocess_text)
        return X_copy

    def _preprocess_part(self, part):
        """Preprocess every text in ``part`` (anything exposing ``.apply``)."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run the full cleaning pipeline on one text and return a string."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        removed_non_alpha = self._remove_non_alpha(removed_stop_words)
        return self._lemmatize(removed_non_alpha)

    def _normalize(self, text):
        """Best-effort normalisation; falls back to the unchanged text.

        NOTE(review): ``normalise`` is not imported in this file, and
        ``variety`` / ``user_abbrevs`` are never set in ``__init__``. Unless
        they are provided elsewhere, the call below fails and the input is
        returned as is.
        """
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except Exception:
            # Deliberate best effort. ``Exception`` rather than a bare
            # ``except`` so that KeyboardInterrupt/SystemExit still stop a
            # long preprocessing run.
            return text

    def _remove_punct(self, doc):
        """Drop tokens whose text occurs inside ``string.punctuation``."""
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        """Drop tokens whose text is in the module-level ``STOPLIST``."""
        return [t for t in doc if t.text not in STOPLIST]

    def _remove_non_alpha(self, doc):
        """Keep only alphabetic, in-vocabulary tokens that are not stop words.

        The flags are read from the vocabulary entry for the token's text.
        """
        kept = []
        for t in doc:
            lexeme = nlp.vocab[str(t)]
            if not lexeme.is_stop and lexeme.is_alpha and not lexeme.is_oov:
                kept.append(t)
        return kept

    def _lemmatize(self, text):
        """Join the lemmas of all tokens longer than one character.

        The length filter used to be computed into a list that was then
        discarded; it is now applied to the returned lemmas.
        """
        return ' '.join(t.lemma_ for t in text if len(t.text) > 1)

    def fit(self, *_):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

In [None]:
# Takes almost an hour to run
text_preprocessor = TextPreprocessor(text_attribute='review')
imdb_preprocessed = text_preprocessor.transform(imdb_data)

In [None]:
# Convert all text under "review" to lowercase.
imdb_preprocessed.review = imdb_preprocessed.review.str.lower()

# Take a glance at the preprocessed text of the review at index label 3.
imdb_preprocessed.review[3]

### Splitting the dataset into 70:30 train/test parts.

In [None]:
from sklearn.model_selection import train_test_split

# 70:30 split. A fixed random_state keeps the split, and every metric reported
# below, reproducible across runs. The value 0 matches the seed already used
# for the tree-based models in this notebook.
train, test = train_test_split(imdb_preprocessed, test_size=0.3, random_state=0)

### Obtaining TF-IDF vectors on the data.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features.
tfidf = TfidfVectorizer(analyzer = "word")

# Fit the vectorizer on the training reviews only, then reuse that fitted
# vectorizer to transform the test reviews.
X_tfidf_train = tfidf.fit_transform(train['review'])
X_tfidf_test = tfidf.transform(test['review'])

In [None]:
# Target labels for the train and test parts.
y_train = train['sentiment']
y_test = test['sentiment']

### Obtaining predictions using Naive Bayes model.

In [None]:
# `time` is imported here because this cell uses it; the notebook's only other
# `import time` sits in the last cell, so a top-to-bottom run failed here.
import time

from sklearn.naive_bayes import MultinomialNB

start_time = time.time()

# Fit Multinomial Naive Bayes on the TF-IDF training features.
nb_clf = MultinomialNB().fit(X_tfidf_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

nb_y_pred = nb_clf.predict(X_tfidf_test)

# start_time is not reset, so this is the cumulative fit + predict time.
print("--- %s seconds ---" % (time.time() - start_time))



In [None]:
# Confusion matrix for the Naive Bayes predictions, with classes ordered as
# listed in `labels`.
nb_cm = confusion_matrix(y_test, nb_y_pred, labels = ["negative", "positive"])
print(nb_cm)
# Test accuracy as a percentage rounded to two decimals, then the per-class report.
print("\nAccuracy : ", round(accuracy_score(y_test, nb_y_pred)*100,2),"%\n")
print(classification_report(y_test, nb_y_pred))

In [None]:
# Print the accuracy summary, classification report and confusion-matrix plot for Naive Bayes.
evaluate_model(nb_clf, X_tfidf_train, y_train, X_tfidf_test, y_test)

### Obtaining predictions using Random Forest model (bagging).

In [None]:
# `time` is imported here because this cell uses it; the notebook's only other
# `import time` sits in the last cell.
import time

from sklearn.ensemble import RandomForestClassifier

start_time = time.time()

# As you increase the max_depth, accuracy increases. Try playing around with the hyperparameter values
rf_clf = RandomForestClassifier(max_depth=10, random_state=0).fit(X_tfidf_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

rf_y_pred = rf_clf.predict(X_tfidf_test)

# start_time is not reset, so this is the cumulative fit + predict time.
print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
# Confusion matrix for the Random Forest predictions, with classes ordered as
# listed in `labels`.
rf_cm = confusion_matrix(y_test, rf_y_pred, labels = ["negative", "positive"])
print(rf_cm)
# Test accuracy as a percentage rounded to two decimals, then the per-class report.
print("\nAccuracy : ", round(accuracy_score(y_test, rf_y_pred)*100,2),"%\n")
print(classification_report(y_test, rf_y_pred))

In [None]:
# Print the accuracy summary, classification report and confusion-matrix plot for Random Forest.
evaluate_model(rf_clf, X_tfidf_train, y_train, X_tfidf_test, y_test)

### Obtaining predictions using Gradient Boosting model (boosting).

In [None]:
# `time` is imported here because this cell uses it; the notebook's only other
# `import time` sits in the last cell.
import time

from sklearn.ensemble import GradientBoostingClassifier

start_time = time.time()

# 100 boosting stages of depth-1 trees with learning rate 1.0.
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                    max_depth=1, random_state=0).fit(X_tfidf_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

gb_y_pred = gb_clf.predict(X_tfidf_test)

# start_time is not reset, so this is the cumulative fit + predict time.
print("--- %s seconds ---" % (time.time() - start_time))


In [None]:
# Confusion matrix for the Gradient Boosting predictions, with classes ordered
# as listed in `labels`.
gb_cm = confusion_matrix(y_test, gb_y_pred, labels = ["negative", "positive"])
print(gb_cm)
# Test accuracy as a percentage rounded to two decimals, then the per-class report.
print("\nAccuracy : ", round(accuracy_score(y_test, gb_y_pred)*100,2),"%\n")
print(classification_report(y_test, gb_y_pred))

In [None]:
# Print the accuracy summary, classification report and confusion-matrix plot for Gradient Boosting.
evaluate_model(gb_clf, X_tfidf_train, y_train, X_tfidf_test, y_test)

In [None]:
import time