# Topic Modeling (with LSA) & Sentiment Analysis (with NB and LR)

## 1. Libraries

In [1]:
import numpy as np
import pandas as pd
import regex as re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import treetaggerwrapper
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from tqdm import tqdm_notebook

  re.IGNORECASE | re.VERBOSE)
  re.VERBOSE | re.IGNORECASE)
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)


## 2. Data Loading & Pre-Processing

In [2]:
# Loading the Data
csv = pd.read_csv('IMDB.csv') # 50,000 labeled {positive, negative} movie reviews

# Collection of reviews <- 'Review' column from the csv
collection = [review for review in csv.iloc[:,0]]

# 'y' into SKLearn (without OneHotEncoding) <- 'Sentiment' column from the csv
y = csv.iloc[:, 1]

## 3. Functions

### 3.1 Generating the Corpus & the Vocabulary

In [3]:
def corpus_voc(collection):
    
    """
    Description: Normalizes each document in a collection to generate a corpus & the vocabulary.
    
    - Input: 
        (1) A list of raw documents [[doc1],[doc2],...,[docN]] where each doc is a raw string.
        
    - Output:
        (1) A list of processed documents [[doc1], [doc2],..., [docN]] where on each doc
        the following operations have been applied:
            - Removal of non-alphabetic characters
            - Case folding of all words
            - Tokenization for each '\w+'
            - Removal of stopwords
            - Lemmatization
        (2) A list [[word1],[word2],...,[wordN]] of all the unique words in the corpus.
    """
    
    # --------------------OBJECTS--------------------
    
    # Stopwords (NLTK & SK-Learn)
    stopwords_nltk = set(stopwords.words('english'))
    stopwords_sklearn = set(stop_words.ENGLISH_STOP_WORDS)
    # Lemmatizer (TreeTagger)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    
    # --------------------LOOP--------------------
    
    corpus = []
    for doc in tqdm_notebook(collection):
        # Alphabetization
        _doc = re.sub('[^A-Za-z]', ' ', doc)
        # Case Folding
        _doc = _doc.lower()
        # Tokenization
        _doc = word_tokenize(_doc)
        # StopWords
        _doc = [word for word in _doc if not word in stopwords_nltk.union(stopwords_sklearn)]
        # Lemmatization
        _doc = [re.split(r'\t', word)[2] for word in tagger.tag_text(_doc)]
        # Joining
        _doc = ' '.join(_doc)
        # Appending
        if _doc != '':
            corpus.append(_doc)
    
    # --------------------VOCABULARY--------------------
    
    vocabulary = sorted(list(set(' '.join(corpus).split(" "))))
    
    # --------------------PRINT--------------------
    
    print("No. of Documents in the Corpus:", len(corpus))
    print("No. of Unique Words in the Corpus:", len(vocabulary))
    
    return corpus, vocabulary

### 3.2 Generating a Bag-Of-Words (BOW) Matrix

In [4]:
def bow(corpus, vocabulary, min_df):
    
    """
    Description: Generates a Bag-Of-Words Matrix.
    
    - Input: A corpus of pre-processed documents [doc1, doc2,..., docN].
    
    - Output: A (m x n) bag-of-words matrix, where m is the number of documents 
    in the corpus and n = |V|.
    """
    
    # Model
    bow_vectorizer = CountVectorizer(min_df=min_df,
                                     vocabulary=vocabulary)
    # Fit & Transform
    bow_matrix = bow_vectorizer.transform(collection)

    print("A", bow_matrix.shape, "BOW Matrix has been generated.\n")
    
    return bow_matrix

### 3.3 Transforming BOW Matrix into TF-IDF Matrix

In [5]:
def tfidf(bow_matrix):
    
    """
    Description: Transforms a BOW Matrix into a TF-IDF Matrix.
    
    - Input: A BOW Matrix.
    
    - Output: A (m x n) TF-IDF matrix, where m is the number of documents
    in the corpus and n = |V|.
    """

    # Model
    tfidf_transformer = TfidfTransformer()
    # Fit & Transform
    tfidf_matrix = tfidf_transformer.fit_transform(bow_matrix).toarray()
    
    print("A", bow_matrix.shape, "BOW Matrix has been transformed into a", tfidf_matrix.shape, "TF-IDF Matrix.\n")
    
    return tfidf_matrix

### 3.4 Transforming BOW Matrix into PPMI Matrix

In [6]:
def observed_over_expected(matrix):
    
    # colsum(X,j)
    col_totals = matrix.sum(axis=0)
    # sum(X)
    total = col_totals.sum()
    # rowsum(X,i)
    row_totals = matrix.sum(axis=1)
    # expected
    expected = np.outer(row_totals, col_totals) / total
    # observed / expected
    oe = matrix / expected
    
    return oe


def ppmi(matrix, positive=True):
    matrix = observed_over_expected(matrix)
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        matrix = np.log(matrix)
    matrix[np.isinf(matrix)] = 0.0  # log(0) = 0
    if positive:
        matrix[matrix < 0] = 0.0
    matrix[np.isnan(matrix)] = 0.0
        
    print("A", bow_matrix.shape, "BOW Matrix has been transformed into a", matrix.shape, "PPMI Matrix.\n")

    return matrix

### 3.5 Transforming TF-IDF/PPMI Matrix into (Truncated) SVD Matrix

In [7]:
def svd(matrix, n_components, n_iter):
    
    """
    Description: Transforms a TF-IDF/PPMI Matrix into a (TruncateD) SVD Matrix.
    
    - Input: A TF-IDF Matrix, a specified number of maximum components
    (latent topics), & a specified number of iterations of the algorithm.
    
    - Output: A (m x n) SVD matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum components.
    """
    
    # Normalization
    normalizer = Normalizer(copy=False)    
    matrix = normalizer.fit_transform(matrix)
    # Model
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter)
    # Fit 
    svd.fit(matrix)
    # Topics
    topics = svd.components_
    # Index
    index = ["document{}".format(i) for i in range(len(matrix))]
    # Columns
    columns = ["topic{}".format(i) for i in range(n_components)]
    # DataFrame
    svd_matrix = pd.DataFrame(svd.transform(matrix),
                              index=index,
                              columns=columns)
    
    print("A", tfidf_matrix.shape, "TF-IDF/PPMI Matrix has been transformed into a", svd_matrix.shape, "SVD Matrix.\n")
    
    return svd_matrix, topics

### 3.6 Extracting Latent Topics from (Truncated) SVD Matrix 

In [30]:
def latent_topics(terms, topics):
    for i, topic in tqdm_notebook(enumerate(topics[0:20])):
        terms_topic = zip(terms, topic)
        sorted_terms = sorted(terms_topic, key = lambda x : x[1], reverse=True)[:10]
        print("Topic " + str(i) + ": ")
        for t in sorted_terms:
            print(t[0])

### 3.7 Computing Most (Cosine-Wise) Similar Terms

In [9]:
def cos_sim(matrix, term):
    """
    Computes the cosine similarity of one TF-IDF-vectorized term with respect to
    the vocabulary and shows the top 10 most similar ones.
    
    - Input: Term.
    - Output: Real-valued number [0,1].
        
    """
    
    _matrix = pd.DataFrame(matrix, columns=vocabulary)
    cos_sim = {}
    for i in tqdm_notebook(vocabulary):
        cos_sim[i] = cosine_similarity([_matrix[term].values], [_matrix[i].values]).round(3)
    _cos_sim = pd.Series(cos_sim).sort_values(ascending=False)
        
    print(f"Top 20 terms most (cosine-wise) similar to '{term}':")

    return pd.DataFrame(_cos_sim, columns=['Cosine']).head(21)

### 3.8 (Multinomial) Naïve Bayes Classifier

In [10]:
def multinomial_NB_classifier(matrix):
    
    """
    (Multinomial) Naïve Bayes Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # BOW, TF-IDF or PPMI
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    naiveBayes = MultinomialNB()
    # Fit
    naiveBayes.fit(X_train, y_train)
    # Predictions
    y_pred = naiveBayes.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("(Multinomial) Naïve Bayes has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

### 3.10 Logistic Regression Classifier

In [11]:
def logistic_regression_classifier(matrix):
    
    """
    Logistic Regression Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # SVD
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    logisticRegression = LogisticRegression(solver='lbfgs',
                                            random_state = 0)
    # Fit
    logisticRegression.fit(X_train, y_train)
    # Predictions
    y_pred = logisticRegression.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # Accuracy
    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("Logistic Regression has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

## 4. Execution & Evaluation

Using the functions above...

First, I generate:
    - Corpus <- IMBD.csv
    - BOW Matrix <- Corpus
    - TF-IDF Matrix <- Bow Matrix
    - PPMI Matrix <- Bow Matrix
    - (Truncated) SVD Matrix (TF-IDF) <- TF-IDF Matrix
    - (Truncated) SVD Matrix (PPMI) <- PPMI Matrix

Then I extract the 1000 Latent Topics from the SVD Matrix, and compute
the top 20 terms cosine-wise most similar terms for 'bad' and 'good'. 

Finally, I apply the following algorithms to the data:
    - (Multinomial) Naïve Bayes <- BOW Matrix
    - (Multinomial) Naïve Bayes <- TF-IDF Matrix
    - (Multinomial) Naïve Bayes <- PPMI Matrix
    - Logistic Regression <- (Truncated) SVD Matrix (TF-IDF)
    - Logistic Regression <- (Truncated) SVD Matrix (PPMI)

And for each model I calculate the % of accuracy. 

In [12]:
# Corpus <- IMBD.csv
corpus, vocabulary = corpus_voc(collection)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))


No. of Documents in the Corpus: 50000
No. of Unique Words in the Corpus: 86624


In [13]:
# BOW Matrix <- Corpus
bow_matrix = bow(corpus,
                 min_df=2,
                 vocabulary=vocabulary)

A (50000, 86624) BOW Matrix has been generated.



In [14]:
# TF-IDF Matrix <- BOW Matrix
tfidf_matrix = tfidf(bow_matrix)

A (50000, 86624) BOW Matrix has been transformed into a (50000, 86624) TF-IDF Matrix.



In [15]:
# PPMI Matrix <- BOW Matrix
ppmi_matrix = ppmi(bow_matrix)

  return np.true_divide(self.todense(), other)
  return np.true_divide(self.todense(), other)


A (50000, 86624) BOW Matrix has been transformed into a (50000, 86624) PPMI Matrix.



In [17]:
# Term: 'bad'
cos_sim(ppmi_matrix, 'bad')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'bad':


Unnamed: 0,Cosine
bad,[[1.0]]
movie,[[0.33]]
acting,[[0.329]]
so,[[0.307]]
good,[[0.296]]
just,[[0.291]]
even,[[0.283]]
not,[[0.281]]
like,[[0.275]]
but,[[0.274]]


In [18]:
# Term: 'good'
cos_sim(ppmi_matrix, 'good')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'good':


Unnamed: 0,Cosine
good,[[1.0]]
movie,[[0.372]]
but,[[0.369]]
not,[[0.326]]
like,[[0.305]]
if,[[0.304]]
really,[[0.304]]
have,[[0.297]]
bad,[[0.296]]
so,[[0.296]]


In [19]:
# Term: 'enjoy'
cos_sim(ppmi_matrix, 'enjoy')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'enjoy':


Unnamed: 0,Cosine
enjoy,[[1.0]]
you,[[0.206]]
if,[[0.177]]
will,[[0.175]]
movie,[[0.161]]
and,[[0.16]]
can,[[0.153]]
but,[[0.149]]
watch,[[0.147]]
just,[[0.146]]


In [34]:
# Term: 'like'
cos_sim(ppmi_matrix, 'like')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'like':


Unnamed: 0,Cosine
like,[[1.0]]
movie,[[0.361]]
you,[[0.353]]
if,[[0.353]]
just,[[0.346]]
but,[[0.342]]
so,[[0.313]]
really,[[0.31]]
don,[[0.308]]
not,[[0.308]]


In [20]:
# Term: 'love'
cos_sim(ppmi_matrix, 'love')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'love':


Unnamed: 0,Cosine
love,[[1.0]]
and,[[0.272]]
story,[[0.235]]
you,[[0.233]]
so,[[0.226]]
in,[[0.224]]
movie,[[0.221]]
but,[[0.219]]
all,[[0.218]]
great,[[0.212]]


In [21]:
# Term: 'movie'
cos_sim(ppmi_matrix, 'movie')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'movie':


Unnamed: 0,Cosine
movie,[[1.0]]
you,[[0.389]]
not,[[0.382]]
if,[[0.379]]
have,[[0.374]]
good,[[0.372]]
but,[[0.368]]
like,[[0.361]]
so,[[0.351]]
just,[[0.35]]


In [22]:
# Term: 'acting'
cos_sim(ppmi_matrix, 'acting')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'acting':


Unnamed: 0,Cosine
acting,[[1.0]]
bad,[[0.329]]
movie,[[0.309]]
good,[[0.287]]
plot,[[0.275]]
not,[[0.258]]
but,[[0.258]]
have,[[0.255]]
so,[[0.251]]
even,[[0.25]]


In [23]:
# Term: 'fun'
cos_sim(ppmi_matrix, 'fun')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'fun':


Unnamed: 0,Cosine
fun,[[1.0]]
watch,[[0.183]]
you,[[0.175]]
and,[[0.173]]
great,[[0.171]]
but,[[0.17]]
just,[[0.167]]
good,[[0.166]]
if,[[0.162]]
movie,[[0.158]]


In [24]:
# Term: 'boring'
cos_sim(ppmi_matrix, 'boring')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'boring':


Unnamed: 0,Cosine
boring,[[1.0]]
movie,[[0.169]]
just,[[0.162]]
but,[[0.157]]
not,[[0.156]]
so,[[0.153]]
like,[[0.152]]
no,[[0.151]]
even,[[0.151]]
bad,[[0.146]]


In [25]:
# Term: 'terrible'
cos_sim(ppmi_matrix, 'terrible')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'terrible':


Unnamed: 0,Cosine
terrible,[[1.0]]
acting,[[0.203]]
bad,[[0.191]]
worst,[[0.173]]
movie,[[0.167]]
even,[[0.157]]
awful,[[0.154]]
just,[[0.15]]
no,[[0.146]]
have,[[0.143]]


In [26]:
# Term: 'laugh'
cos_sim(ppmi_matrix, 'laugh')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'laugh':


Unnamed: 0,Cosine
laugh,[[1.0]]
funny,[[0.215]]
you,[[0.191]]
loud,[[0.191]]
comedy,[[0.179]]
if,[[0.163]]
movie,[[0.158]]
watch,[[0.15]]
make,[[0.15]]
out,[[0.147]]


In [32]:
# Term: 'horror'
cos_sim(ppmi_matrix, 'horror')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'horror':


Unnamed: 0,Cosine
horror,[[1.0]]
gore,[[0.272]]
scary,[[0.222]]
creepy,[[0.196]]
blood,[[0.19]]
genre,[[0.178]]
out,[[0.168]]
movie,[[0.168]]
budget,[[0.166]]
film,[[0.163]]


In [33]:
# Term: 'funny'
cos_sim(ppmi_matrix, 'funny')

HBox(children=(IntProgress(value=0, max=86624), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'funny':


Unnamed: 0,Cosine
funny,[[1.0]]
comedy,[[0.28]]
movie,[[0.227]]
but,[[0.223]]
just,[[0.221]]
be,[[0.216]]
so,[[0.215]]
not,[[0.215]]
laugh,[[0.215]]
like,[[0.211]]


In [27]:
# SVD Matrix <- TF-IDF Matrix
svd_matrix_tfidf, topics_tfidf = svd(tfidf_matrix, 
                                     n_components=1000,
                                     n_iter=10)

A (50000, 86624) TF-IDF/PPMI Matrix has been transformed into a (50000, 1000) SVD Matrix.



In [35]:
# SVD Matrix <- PPMI Matrix
svd_matrix_ppmi, topics_ppmi = svd(ppmi_matrix, 
                                   n_components=1000,
                                   n_iter=10)

A (50000, 86624) TF-IDF/PPMI Matrix has been transformed into a (50000, 1000) SVD Matrix.



In [31]:
# Latent Topics <- SVD Matrix (TF-IDF)
latent_topics(vocabulary, topics_tfidf)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Topic 0: 
br
and
to
in
movie
film
you
but
not
on
Topic 1: 
br
spoiler
fiend
uk
match
govinda
humour
team
nuclear
http
Topic 2: 
movie
you
bad
if
don
watch
br
just
good
worst
Topic 3: 
you
film
if
don
bad
will
can
re
to
have
Topic 4: 
show
you
series
episode
to
and
season
tv
if
on
Topic 5: 
you
and
great
film
love
if
will
story
best
movie
Topic 6: 
show
bad
good
and
really
just
funny
acting
but
like
Topic 7: 
and
bad
you
horror
no
plot
up
guy
re
off
Topic 8: 
in
one
you
series
bad
scene
on
best
first
role
Topic 9: 
good
but
really
story
great
horror
to
well
pretty
not
Topic 10: 
horror
series
on
to
movie
great
out
dvd
tv
budget
Topic 11: 
have
book
series
story
not
seen
read
would
better
version
Topic 12: 
bad
to
show
acting
good
you
great
story
in
cast
Topic 13: 
show
not
horror
movie
book
no
plot
and
story
you
Topic 14: 
funny
comedy
not
be
one
seen
have
laugh
but
worst
Topic 15: 
horror
one
have
seen
show
love
great
story
worst
life
Topic 16: 
book
have
horror
bad
would
good
read
lov

In [36]:
# Latent Topics <- SVD Matrix (PPMI)
latent_topics(vocabulary, topics_ppmi)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Topic 0: 
movie
good
watch
acting
if
see
don
really
seen
bad
Topic 1: 
movie
bad
worst
watch
don
acting
waste
you
seen
if
Topic 2: 
great
love
best
recommend
wonderful
see
excellent
saw
will
watch
Topic 3: 
acting
plot
script
story
good
cast
great
excellent
direction
film
Topic 4: 
worst
waste
life
seen
read
people
understand
money
believe
world
Topic 5: 
worst
seen
funny
comedy
tv
show
dvd
best
cast
laugh
Topic 6: 
dvd
saw
video
horror
budget
series
low
tv
watched
ago
Topic 7: 
acting
worst
seen
girl
saw
love
young
mother
father
woman
Topic 8: 
you
horror
seen
laugh
if
will
want
budget
low
worst
Topic 9: 
seen
worst
funny
recommend
saw
watched
comedy
found
times
laugh
Topic 10: 
worth
waste
watching
money
video
dvd
saw
rent
time
recommend
Topic 11: 
watch
watching
recommend
series
show
episode
tv
plot
waste
highly
Topic 12: 
seen
waste
worth
best
money
if
you
worst
don
want
Topic 13: 
music
waste
boring
time
action
love
beautiful
story
animation
song
Topic 14: 
interesting
waste
money

In [37]:
# (Multinomial) Naïve Bayes <- BOW Matrix
multinomial_NB_classifier(bow_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4430   605
1   902  4063

Accuracy: 84.93 %


In [38]:
# (Multinomial) Naïve Bayes <- TF-IDF Matrix
multinomial_NB_classifier(tfidf_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4460   575
1   843  4122

Accuracy: 85.82 %


In [39]:
# (Multinomial) Naïve Bayes <- PPMI Matrix
multinomial_NB_classifier(ppmi_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4415   620
1   807  4158

Accuracy: 85.72999999999999 %


In [40]:
# Logistic Regression <- (Truncated) SVD Matrix (TF-IDF)
logistic_regression_classifier(svd_matrix_tfidf)

Logistic Regression has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4363   672
1   559  4406

Accuracy: 87.69 %


In [41]:
# Logistic Regression <- (Truncated) SVD Matrix (PPMI)
logistic_regression_classifier(svd_matrix_ppmi)

Logistic Regression has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4394   641
1   529  4436

Accuracy: 88.3 %
