# IMDB Sentiment Analysis

In [166]:
import os
from os import listdir
import random
import string
import nltk
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import set_config
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
set_config(print_changed_only=False)


## Classes

In [167]:
class Review:
    def __init__(self, text, polarity):
        self.text = text.translate(str.maketrans(' ', ' ',
                                   string.punctuation)).replace("br", "")
        self.polarity = polarity


class Features:
    def __init__(self, reviews):
        self.reviews = reviews.text
        self.new_text = self.Stopwords()

    def Stopwords(self):
        sw_nltk = stopwords.words('english')
        words = [word for word in self.reviews.split()
                 if word.lower() not in sw_nltk]
        new_text = " ".join(words)
        return new_text


class Target:
    def __init__(self, reviews):
        self.reviews = reviews.polarity


class Vectorization:
    def __init__(self, X):
        self.X = X

        # X_train, X_test CountVectorizer() transformations
        self.X_train_vectors_count = self.count_vectorization()[0]
        self.X_test_vectors_count = self.count_vectorization()[1]
        self.Vectorizer_count = self.count_vectorization()[2]
        # X_train, X_test TfidfVectorizer() transformation
        self.X_train_vectors_tfidf = self.tfidf_vectorization()[0]
        self.X_test_vectors_tfidf = self.tfidf_vectorization()[1]
        self.Vectorizer_tfidf = self.tfidf_vectorization()[2]

    def count_vectorization(self):
        vectorizer = CountVectorizer()
        X_train_vectors = vectorizer.fit_transform(self.X[0])
        X_test_vectors = vectorizer.transform(self.X[1])
        return (X_train_vectors, X_test_vectors, vectorizer)

    def tfidf_vectorization(self):
        vectorizer = TfidfVectorizer(lowercase=True, min_df=0.0001)
        X_train_vectors = vectorizer.fit_transform(self.X[0])
        X_test_vectors = vectorizer.transform(self.X[1])
        return (X_train_vectors, X_test_vectors, vectorizer)


class Logistic_Regression:

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.model = self.Model()

    def Model(self):
        clf_log = LogisticRegression(max_iter=1000)
        clf_log.fit(self.X_train, self.y_train)
        return (clf_log)


class Metrics:

    def __init__(self, model, X_test, y_test):
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.mean_score = self.mean_score()
        self.f1_score = self.F1_score()

    def mean_score(self):
        return self.model.score(self.X_test, self.y_test)

    def F1_score(self):
        return f1_score(self.y_test, self.model.predict(self.X_test),
                        average=None, labels=['positive', 'negative'])


### Store negative reviews

In [168]:
reviews = []
for links in os.listdir('./aclImdb_v1/aclImdb/test/neg'):
    with open('./aclImdb_v1/aclImdb/test/neg/{}'.format(links),
              encoding="utf8") as f:
        for line in f:
            reviews.append(Review(line, 'negative'))


### Store positive reviews

In [169]:
for links in os.listdir('./aclImdb_v1/aclImdb/test/pos'):
    with open('./aclImdb_v1/aclImdb/test/pos/{}'.format(links),
              encoding="utf8") as f:
        for line in f:
            reviews.append(Review(line, 'positive'))


### Shuffle the reviews so that it isn't directly positive reviews followed by negative

In [170]:
random.shuffle(reviews)


#### Seperate into X and y variables

In [171]:
X = []

for line in reviews:
    X.append(Features(line).new_text)


In [172]:
y = []
for line in reviews:
    y.append(Target(line).reviews)


#### Train-test split

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)


### Vectorize X_train and X_test

In [174]:
vectorize = Vectorization((X_train, X_test))


#### Bag of words vectorization

In [175]:
X_train_vectors_bow = vectorize.X_train_vectors_count
X_test_vectors_bow = vectorize.X_test_vectors_count


#### Tfidf Vectorization

In [176]:
X_train_vectors_tfidf = vectorize.X_train_vectors_tfidf
X_test_vectors_tfidf = vectorize.X_test_vectors_tfidf


### Classification

#### Logistic Regression

In [177]:
model_bow = Logistic_Regression(X_train_vectors_bow, y_train).model
model_tfidf = Logistic_Regression(X_train_vectors_tfidf, y_train).model


In [179]:
X_train_vectors_tfidf

<16750x41712 sparse matrix of type '<class 'numpy.float64'>'
	with 1604814 stored elements in Compressed Sparse Row format>

### Evaluation

#### Mean Accuracy

In [180]:
# Mean Accuracy
print (Metrics(model_bow, X_test_vectors_bow, y_test).mean_score)
print (Metrics(model_tfidf, X_test_vectors_tfidf, y_test).mean_score)


0.8797575757575757
0.8900606060606061


#### F1 Scores

In [181]:
print (Metrics(model_bow, X_test_vectors_bow, y_test).f1_score)
print (Metrics(model_tfidf, X_test_vectors_tfidf, y_test).f1_score)


[0.8804243  0.87908337]
[0.89164974 0.88842416]


##### I want to improve our scores. There are a couple of things that we can do.
* <s>Making each string lowercase</s>
* <s>Removing punctuation</s>
* <s>Removing common words such as "and", "to", "or", "the", "is", and more</s>
* <s>Adding weights to words</s>

### Save our model, and vectorizer

In [186]:
import pickle
model_name = 'imdb.pkl'
pickle.dump(model_tfidf, open('./webapp/model/{}'.format(model_name), 'wb'))
loaded_model = pickle.load(open('./webapp/model/{}'.format(model_name), 'rb'))

Vector = vectorize.Vectorizer_tfidf
vector_name = 'tfidf1.pkl'
pickle.dump(Vector, open('./webapp/model/{}'.format(vector_name), "wb"))
tf1 = pickle.load(open('./webapp/model/{}'.format(vector_name), 'rb'))

### Testing out pickled model and vectorizer

#### Converting string into vectorizer form

In [187]:
string = ['bad bad bad bad bad bad', 'good good good good', 'bad good bad', 'This movie is awful. How can anyone spend their time with this garbage?!']
tf1_vectored = tf1.transform(string)

#### Predicting the result

In [188]:
loaded_model.predict(tf1_vectored)

array(['negative', 'positive', 'negative', 'negative'], dtype='<U8')

## References

Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. 2011. Learning Word Vectors for Sentiment Analysis. (June 2011). Retrieved November 2021 from http://www.aclweb.org/anthology/P11-1015 