# IMDB Sentiment Analysis

In [89]:
import os
import numpy as np
from os import listdir
import pandas as pd
import requests as re
import json
import random
import string
import nltk
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import set_config
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
set_config(print_changed_only=False)


## Classes

In [150]:
class Review:
    def __init__(self, text, polarity):
        self.text = text.translate(str.maketrans(' ', ' ',
                                   string.punctuation)).replace("br", "")
        self.polarity = polarity


class Features:
    def __init__(self, reviews):
        self.reviews = reviews.text
        self.new_text = self.Stopwords()

    def Stopwords(self):
        sw_nltk = stopwords.words('english')
        words = [word for word in self.reviews.split()
                 if word.lower() not in sw_nltk]
        new_text = " ".join(words)
        return new_text


class Target:
    def __init__(self, reviews):
        self.reviews = reviews.polarity


class Vectorization:
    def __init__(self, X):
        self.X = X

        # X_train, X_test CountVectorizer() transformations
        self.X_train_vectors_count = self.count_vectorization()[0]
        self.X_test_vectors_count = self.count_vectorization()[1]
        # X_train, X_test TfidfVectorizer() transformation
        self.X_train_vectors_tfidf = self.tfidf_vectorization()[0]
        self.X_test_vectors_tfidf = self.tfidf_vectorization()[1]

    def count_vectorization(self):
        vectorizer = CountVectorizer()
        X_train_vectors = vectorizer.fit_transform(self.X[0])
        X_test_vectors = vectorizer.transform(self.X[1])
        return (X_train_vectors, X_test_vectors)

    def tfidf_vectorization(self):
        vectorizer = TfidfVectorizer(lowercase=True, min_df=0.0001)
        X_train_vectors = vectorizer.fit_transform(self.X[0])
        X_test_vectors = vectorizer.transform(self.X[1])
        return (X_train_vectors, X_test_vectors)


class Logistic_Regression:

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.model = self.Model()

    def Model(self):
        clf_log = LogisticRegression(max_iter=1000)
        clf_log.fit(self.X_train, self.y_train)
        return (clf_log)


class Metrics:

    def __init__(self, model, X_test, y_test):
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.mean_score = self.mean_score()
        self.f1_score = self.F1_score()

    def mean_score(self):
        return self.model.score(self.X_test, self.y_test)

    def F1_score(self):
        return f1_score(self.y_test, self.model.predict(self.X_test),
                        average=None, labels=['positive', 'negative'])


### Store negative reviews

In [151]:
reviews = []
for links in os.listdir('./aclImdb_v1/aclImdb/test/neg'):
    with open('./aclImdb_v1/aclImdb/test/neg/{}'.format(links),
              encoding="utf8") as f:
        for line in f:
            reviews.append(Review(line, 'negative'))


### Store positive reviews

In [152]:
for links in os.listdir('./aclImdb_v1/aclImdb/test/pos'):
    with open('./aclImdb_v1/aclImdb/test/pos/{}'.format(links),
              encoding="utf8") as f:
        for line in f:
            reviews.append(Review(line, 'positive'))


### Shuffle the reviews so that it isn't directly positive reviews followed by negative

In [153]:
random.shuffle(reviews)


#### Seperate into X and y variables

In [154]:
X = []

for line in reviews:
    X.append(Features(line).new_text)


In [155]:
y = []
for line in reviews:
    y.append(Target(line).reviews)


#### Train-test split

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)


### Vectorize X_train and X_test

In [157]:
vectorize = Vectorization((X_train, X_test))


#### Bag of words vectorization

In [158]:
X_train_vectors_bow = vectorize.X_train_vectors_count
X_test_vectors_bow = vectorize.X_test_vectors_count


#### Tfidf Vectorization

In [159]:
X_train_vectors_tfidf = vectorize.X_train_vectors_tfidf
X_test_vectors_tfidf = vectorize.X_test_vectors_tfidf


### Classification

#### Logistic Regression

In [160]:
model_bow = Logistic_Regression(X_train_vectors_bow, y_train).model
model_tfidf = Logistic_Regression(X_train_vectors_tfidf, y_train).model


### Evaluation

#### Mean Accuracy

In [161]:
# Mean Accuracy
print (Metrics(model_bow, X_test_vectors_bow, y_test).mean_score)
print (Metrics(model_tfidf, X_test_vectors_tfidf, y_test).mean_score)


0.8865454545454545
0.8936969696969697


#### F1 Scores

In [162]:
print (Metrics(model_bow, X_test_vectors_bow, y_test).f1_score)
print (Metrics(model_tfidf, X_test_vectors_tfidf, y_test).f1_score)


[0.88741881 0.88565844]
[0.89555794 0.89176848]


##### I want to improve our scores. There are a couple of things that we can do.
* <s>Making each string lowercase</s>
* <s>Removing punctuation</s>
* <s>Removing common words such as "and", "to", "or", "the", "is", and more</s>
* <s>Adding weights to words</s>

### Analyzing word frequencies

In [122]:
X1 = []

for row in X:
    for split in row.split():
        X1.append(split)
        

In [123]:
Counter(X1).most_common()[0:100]


[('like', 18649),
 ('good', 13650),
 ('would', 11750),
 ('time', 11576),
 ('see', 10963),
 ('even', 10622),
 ('story', 10539),
 ('bad', 8406),
 ('great', 8297),
 ('well', 8095),
 ('make', 7507),
 ('first', 7462),
 ('made', 7409),
 ('way', 7340),
 ('could', 7325),
 ('dont', 7237),
 ('seen', 6597),
 ('character', 6443),
 ('watch', 6401),
 ('many', 6299),
 ('know', 6211),
 ('plot', 6065),
 ('acting', 6052),
 ('never', 6012),
 ('show', 5878),
 ('love', 5840),
 ('best', 5650),
 ('ever', 5646),
 ('little', 5608),
 ('life', 5551),
 ('better', 5501),
 ('end', 5144),
 ('something', 4756),
 ('still', 4670),
 ('Im', 4432),
 ('didnt', 4380),
 ('thing', 4365),
 ('real', 4350),
 ('back', 4337),
 ('man', 4305),
 ('watching', 4281),
 ('doesnt', 4276),
 ('funny', 4238),
 ('years', 4132),
 ('makes', 4056),
 ('find', 4050),
 ('work', 4000),
 ('lot', 3989),
 ('actually', 3970),
 ('going', 3960),
 ('look', 3838),
 ('though', 3683),
 ('cant', 3653),
 ('another', 3643),
 ('part', 3636),
 ('old', 3577),
 ('no

## References

Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. 2011. Learning Word Vectors for Sentiment Analysis. (June 2011). Retrieved November 2021 from http://www.aclweb.org/anthology/P11-1015 