# Topic Modeling (with LSA) & Sentiment Analysis (with NB and LR)

## 1. Libraries

In [2]:
import numpy as np
import pandas as pd
import regex as re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import treetaggerwrapper
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import scipy.spatial.distance
from tqdm import tqdm_notebook

## 2. Data Loading & Pre-Processing

In [3]:
# Loading the Data
csv = pd.read_csv('IMDB.csv') # 50,000 labeled {positive, negative} movie reviews

# Collection of reviews <- 'Review' column from the csv
collection = [review for review in csv.iloc[:,0]]

# 'y' into SKLearn (without OneHotEncoding) <- 'Sentiment' column from the csv
y = csv.iloc[:, 1]

## 3. Functions

### 3.1 Generating the Corpus & the Vocabulary

In [4]:
def corpus(collection):
    
    """
    Description: Normalizes each document in a collection to generate a corpus.
    
    - Input: 
        (1) A list of raw documents [[doc1],[doc2],...,[docN]] where each doc is a raw string.
        
    - Output:
        A list of processed documents [[doc1], [doc2],..., [docN]] where on each doc
        the following operations have been applied:
            - Removal of non-alphabetic characters
            - Case folding of all words
            - Tokenization for each '\w+'
            - Removal of stopwords
            - Lemmatization
    """
    
    # --------------------OBJECTS--------------------
    
    # Stopwords (NLTK & SK-Learn)
    stopWords = list(set(stopwords.words('english')).union(set(stop_words.ENGLISH_STOP_WORDS)))
    # Lemmatizer (TreeTagger)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    
    # --------------------LOOP--------------------
    
    corpus = []
    for doc in tqdm_notebook(collection):
        # Alphabetization
        _doc = re.sub('[^A-Za-z]', ' ', doc)
        # Case Folding
        _doc = _doc.lower()
        # Tokenization
        _doc = word_tokenize(_doc)
        # StopWords
        _doc = [word for word in _doc if not word in stopWords]
        # Lemmatization
        _doc = [re.split(r'\t', word)[2] for word in tagger.tag_text(_doc)]
        # Joining
        _doc = ' '.join(_doc)
        # Appending
        if _doc != '':
            corpus.append(_doc)
    
    # --------------------PRINT--------------------
    
    print("No. of Documents in the Corpus:", len(corpus))
    
    return corpus

### 3.2 Generating a Bag-Of-Words (BOW) Matrix

In [5]:
def bow(corpus, max_features=50000):
    
    """
    Description: Generates a Bag-Of-Words Matrix.
    
    - Input: A corpus of pre-processed documents [doc1, doc2,..., docN].
    
    - Output:
    (1) A (m x n) bag-of-words matrix, where m is the number of documents 
    in the corpus and n = |V|.
    (2) Vocabulary.
    """
    
    # Model
    vectorizer = CountVectorizer(max_features=max_features)                   
    # Fit
    vectorizer.fit(corpus)
    # Vocabulary
    vocabulary = vectorizer.get_feature_names()
    # Stopwords
    stopwords = vectorizer.stop_words_
    # Transform
    matrix = vectorizer.transform(corpus)
    
    print("A", matrix.shape, "BoW Matrix has been generated.\n")
    
    return matrix, vocabulary

### 3.3 Transforming BOW Matrix into TF-IDF Matrix

In [6]:
def tfidf(matrix):
    
    """
    Description: Transforms a BOW Matrix into a TF-IDF Matrix.
    
    - Input: A BoW Matrix.
    
    - Output: A (m x n) TF-IDF matrix, where m is the number of documents
    in the corpus and n = |V|.
    """

    # Model
    transformer = TfidfTransformer()
    # Fit
    transformer.fit(matrix)
    # Transform
    _matrix = transformer.transform(matrix)
    
    print("A", _matrix.shape, "TF-IDF Matrix has been generated.\n")
    
    return _matrix

### 3.4 Transforming BOW Matrix into PPMI Matrix

In [7]:
def observed_over_expected(matrix):
    col_totals = matrix.sum(axis=0)
    total = col_totals.sum()
    row_totals = matrix.sum(axis=1)
    expected = np.outer(row_totals, col_totals) / total
    oe = matrix / expected
    
    return oe


def pmi(matrix, positive=True):
    matrix = observed_over_expected(matrix)
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        matrix = np.log(matrix)
    matrix[np.isinf(matrix)] = 0.0  # log(0) = 0
    if positive:
        matrix[matrix < 0] = 0.0
        print("A", matrix.shape, "PPMI Matrix has been generated.\n")
        
    return matrix

### 3.5 Transforming TF-IDF/PPMI Matrix into (Truncated) SVD Matrix

In [8]:
def svd(matrix, n_components, n_iter, latentTopics=True):
    
    """
    Description: Transforms a TF-IDF/PPMI Matrix into a (TruncateD) SVD Matrix.
    
    - Input: A TF-IDF Matrix, a specified number of maximum components
    (latent topics), & a specified number of iterations of the algorithm.
    
    - Output: A (m x n) SVD matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum components.
    """
    
    # Normalization
    normalizer = Normalizer(copy=False)    
    matrix = normalizer.fit_transform(matrix)
    # Model
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter)
    # Fit 
    svd.fit(matrix)
    # Topics
    topics = svd.components_
    # Transform
    matrix = svd.transform(matrix)
    
    print("A", matrix.shape, "SVD Matrix has been generated.\n")
    
    if latentTopics:
        return matrix, topics
    else:
        return matrix

### 3.6 Latent Topics

In [194]:
def latentTopics(voc, topics, n_topics=20):
    dict_with_topics = {}
    for i, topic in tqdm_notebook(enumerate(topics[0:n_topics])):
        terms_topic = zip(voc, topic)
        sorted_terms = sorted(terms_topic, key = lambda x : x[1], reverse=True)[:10]
        
        dict_with_topics["Topic" + str(i)] = [t[0] for t in sorted_terms]
    
    return pd.DataFrame(dict_with_topics)

### 3.7 Near-Neighbors (Cosine Similarity)

In [136]:
def toDataFrame(matrix, vocabulary):
    return pd.DataFrame(matrix, index=vocabulary)

def cosine(u, v):
    return scipy.spatial.distance.cosine(u, v)

def neighbors(word, df, distfunc=cosine, top_n=11):
    """
    Tool for finding the nearest neighbors of 'word' in 'df' according
    to 'distfunc'. The comparisons are between row vectors.

    Parameters
    ----------
    word : str
        The anchor word. Assumed to be in 'rownames'.
    df : pd.DataFrame
        The vector-space model.
    distfunc : function mapping vector pairs to floats (default: 'cosine').

    Raises
    ------
    ValueError
        If word is not in `df.index`.

    Returns
    -------
    pd.Series
        Ordered by closeness to 'word' (Top-10 neighbors).
    """
    
    if word not in df.index:
        raise ValueError('"{}" is not in this VSM'.format(word))
    w = df.loc[word]
    dists = df.apply(lambda x: distfunc(w, x), axis=1)
    
    return dists.sort_values()[1:top_n]

### 3.8 (Multinomial) Naïve Bayes Classifier

In [11]:
def naiveBayes(matrix):
    
    """
    (Multinomial) Naïve Bayes Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # BOW, TF-IDF or PPMI
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    naiveBayes = MultinomialNB()
    # Fit
    naiveBayes.fit(X_train, y_train)
    # Predictions
    y_pred = naiveBayes.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("(Multinomial) Naïve Bayes has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

### 3.10 Logistic Regression Classifier

In [12]:
def logisticRegression(matrix):
    
    """
    Logistic Regression Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # SVD
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    logisticRegression = LogisticRegression(solver='lbfgs',
                                            random_state = 0)
    # Fit
    logisticRegression.fit(X_train, y_train)
    # Predictions
    y_pred = logisticRegression.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # Accuracy
    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("Logistic Regression has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

## 4. Execution & Evaluation

Using the functions above...

(1) First, I generate:
    - Corpus <- IMBD.csv
    - BOW Matrix <- Corpus
    - TF-IDF Matrix <- Bow Matrix
    - PPMI Matrix <- Bow Matrix
    - SVD Matrix TM <- TF-IDF Matrix
    - SVD Matrix TM <- PPMI Matrix
    - SVD Matrix SA <- TF-IDF Matrix
    - SVD Matrix SA <- PPMI Matrix
    - Term-Document Matrix <- SVD Matrix TM (PPMI)

(2) Then I compute the top 20 terms cosine-wise most similar terms for some selected terms, and

(3) I extract the Top-20 Latent Topics from each SVD Matrix (TF-IDF & PPMI).

(4) Finally, I apply the following algorithms to the data:
    - (Multinomial) Naïve Bayes <- BOW Matrix
    - (Multinomial) Naïve Bayes <- TF-IDF Matrix
    - (Multinomial) Naïve Bayes <- PPMI Matrix
    - Logistic Regression <- SVD Matrix (TF-IDF)
    - Logistic Regression <- SVD Matrix (PPMI)

And for each model I calculate the % of accuracy. 

### 4.1 Generating Matrices

In [13]:
# Corpus <- IMBD.csv
corpus = corpus(collection)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))


No. of Documents in the Corpus: 50000


In [14]:
# BOW Matrix <- Corpus
bow_matrix, vocabulary = bow(corpus)

A (50000, 50000) BoW Matrix has been generated.



In [15]:
# TF-IDF Matrix <- BOW Matrix
tfidf_matrix = tfidf(bow_matrix)

A (50000, 50000) TF-IDF Matrix has been generated.



In [16]:
# PPMI Matrix <- BOW Matrix
ppmi_matrix = pmi(bow_matrix)

A (50000, 50000) PPMI Matrix has been generated.



In [18]:
# SVD Matrix <- TF-IDF Matrix
svd_tfidf, topics_tfidf = svd(tfidf_matrix, 
                              n_components=1000,
                              n_iter=10)

A (50000, 1000) SVD Matrix has been generated.



In [33]:
# SVD Matrix <- PPMI Matrix
svd_ppmi, topics_ppmi = svd(ppmi_matrix, 
                            n_components=1000,
                            n_iter=10)

A (50000, 1000) SVD Matrix has been generated.



In [20]:
# SVD Matrix <- TF-IDF Matrix
svd_tfidf_SIM = svd(tfidf_matrix.T,
                    n_components=300,
                    n_iter=10,
                    latentTopics=False)

A (50000, 300) SVD Matrix has been generated.



In [21]:
# SVD Matrix <- PPMI Matrix
svd_ppmi_SIM = svd(ppmi_matrix.T, 
                   n_components=300,
                   n_iter=10,
                   latentTopics=False)

A (50000, 300) SVD Matrix has been generated.



### 4.2 Terms Similarity

In [197]:
# Matrices into DataFrames for function 'neighbors'
df_tfidf = toDataFrame(svd_tfidf_SIM, vocabulary)
df_ppmi = toDataFrame(svd_ppmi_SIM, vocabulary)
# List of terms to be computed
terms = ['bad', 'good', 'fun', 'boring', 'horror', 'man', 'woman','enjoy', 'recommend', 'amazing', 'terrible', 'dog', 'animal', 'cat']

In [142]:
# Top-10 most similar (cosine-wise) terms: TF-IDF
pd.DataFrame({term: list(neighbors(term, df_tfidf).index) for term in terms})

Unnamed: 0,bad,good,fun,boring,horror,man,woman,enjoy,recommend,amazing,terrible,dog,animal,cat
0,horrible,really,entertaining,bore,scare,take,male,recommend,definitely,great,bad,old,dog,dalmation
1,terrible,well,fan,acting,scary,help,man,definitely,enjoy,fantastic,horrible,kid,polar,kitten
2,awful,worth,lot,bad,halloween,face,girl,see,see,recommend,waste,animal,water,douchess
3,waste,see,enjoy,waste,blood,turn,young,great,great,definitely,awful,run,eat,feline
4,worst,lot,like,terrible,creepy,course,husband,good,saw,enjoy,worst,head,bear,aristocats
5,bother,time,nice,minute,gore,catch,female,time,good,incredible,worse,get,elephant,scat
6,lame,like,stuff,act,slasher,lose,meet,saw,really,perfect,bother,little,cow,gabor
7,unless,movie,little,actually,scared,involve,couple,really,highly,especially,boring,puppy,adult,ev
8,worse,expect,pretty,horrible,poltergeist,begin,face,watch,surprised,highly,bore,decide,cheetah,rybody
9,minute,sure,big,bother,gory,break,behavior,think,especially,excellent,crap,bite,kid,malley


In [143]:
# Top-10 most similar (cosine-wise) terms: PPMI
pd.DataFrame({term: list(neighbors(term, df_ppmi).index) for term in terms})

Unnamed: 0,bad,good,fun,boring,horror,man,woman,enjoy,recommend,amazing,terrible,dog,animal,cat
0,horrible,really,pretty,waste,scary,take,man,time,enjoy,great,awful,old,dog,dog
1,waste,see,lot,bad,scare,help,young,see,see,story,horrible,stop,eat,feline
2,rent,time,little,terrible,gore,turn,girl,great,story,recommend,bad,decide,stop,pet
3,terrible,watch,bit,bore,creepy,leave,fall,good,great,especially,waste,run,creature,vil
4,awful,like,fan,minute,blood,lose,leave,recommend,definitely,enjoy,boring,call,use,dalmation
5,like,movie,get,acting,halloween,place,turn,really,time,fantastic,ok,break,human,kitten
6,ok,think,look,plot,flick,begin,help,lot,good,love,worst,save,obviously,greenlighted
7,boring,act,big,actually,jeepers,day,couple,br,film,definitely,minute,turn,away,classic
8,worst,actor,entertaining,horrible,gory,right,take,make,really,year,bore,talk,bear,old
9,stupid,make,enjoy,awful,hellraiser,give,away,come,lot,film,stupid,little,head,walt


### 4.3 Latent Topics

In [195]:
# Top-20 Latent Topics <- SVD Matrix (TF-IDF)
latentTopics(vocabulary, topics_tfidf)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19
0,br,br,movie,bad,bad,funny,good,book,bad,character,love,book,horror,watch,good,kid,story,good,watch,game
1,movie,spoiler,br,film,episode,watch,great,bad,funny,really,really,read,story,time,family,character,kid,story,like,great
2,film,episode,bad,horror,series,episode,character,episode,comedy,like,bad,funny,love,waste,think,film,child,scene,look,watch
3,good,fiend,watch,waste,guy,comedy,series,series,play,think,good,version,character,scene,life,child,game,funny,love,book
4,like,uk,funny,act,like,laugh,actor,story,life,story,great,novel,watch,plot,role,series,make,action,actor,action
5,bad,season,stupid,terrible,get,great,story,watch,actor,feel,story,kid,funny,game,actor,action,family,war,make,war
6,watch,match,rent,awful,thing,series,bad,read,man,people,kid,comedy,family,minute,performance,movie,effect,love,budget,fun
7,time,bbc,horrible,budget,look,think,episode,life,love,end,girl,like,bad,play,child,like,disney,fight,act,story
8,make,london,waste,plot,season,love,cast,character,laugh,funny,like,horror,plot,kid,film,episode,animation,tell,people,bad
9,character,nuclear,laugh,horrible,kill,really,performance,time,role,main,think,original,end,character,waste,bad,special,effect,low,fight


In [196]:
# Top-20 Latent Topics <- SVD Matrix (PPMI)
latentTopics(vocabulary, topics_ppmi)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19
0,movie,movie,great,funny,horror,comedy,effect,rent,action,episode,gooding,effect,eyre,gooding,gooding,rent,carrey,carrey,portman,carrey
1,good,bad,love,kid,good,funny,sci,video,fight,series,laugh,special,book,cuba,cuba,eyre,kid,jim,natalie,jim
2,watch,watch,good,laugh,great,laugh,fi,waste,martial,season,cuba,bad,dalton,jr,jr,effect,child,freeman,gooding,grinch
3,like,see,movie,old,effect,joke,dvd,money,seagal,sci,sci,terrible,zelah,harris,see,special,disney,almighty,sarandon,almighty
4,see,good,enjoy,joke,gore,script,special,buy,movie,fi,fi,sci,rochester,radio,waste,paltrow,adult,aniston,jackie,aniston
5,bad,waste,recommend,girl,scary,cast,vhs,year,gooding,tv,comedy,fi,jane,brashear,carrey,jane,jim,rent,episode,morgan
6,time,like,wonderful,year,special,bad,video,dvd,guy,book,funny,gooding,read,deniro,movie,budget,sequel,grinch,kung,freeman
7,think,think,story,fun,fan,actor,animation,see,damme,cancel,horror,child,austen,coach,time,rental,grinch,morgan,hong,dancing
8,really,time,see,hilarious,suspense,waste,copy,copy,good,end,excellent,horrible,timothy,editor,disney,austen,original,bruce,chan,fu
9,film,rent,think,comedy,slasher,role,year,vhs,jackie,read,thriller,act,scarlett,diver,harris,zelah,myers,na,fu,kung


### 4.4 Classification Algorithms

In [27]:
# (Multinomial) Naïve Bayes <- BOW Matrix
naiveBayes(bow_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4398   637
1   841  4124

Accuracy: 85.22 %


In [28]:
# (Multinomial) Naïve Bayes <- TF-IDF Matrix
naiveBayes(tfidf_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4383   652
1   752  4213

Accuracy: 85.96000000000001 %


In [29]:
# (Multinomial) Naïve Bayes <- PPMI Matrix
naiveBayes(ppmi_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4363   672
1  1075  3890

Accuracy: 82.53 %


In [51]:
# Logistic Regression <- (Truncated) SVD Matrix (TF-IDF)
logisticRegression(svd_tfidf)

Logistic Regression has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4340   695
1   544  4421

Accuracy: 87.61 %


In [52]:
# Logistic Regression <- (Truncated) SVD Matrix (PPMI)
logisticRegression(svd_ppmi)

Logistic Regression has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4364   671
1   502  4463

Accuracy: 88.27000000000001 %
