# Topic Modeling with SK-Learn

## Libraries

In [1120]:
import numpy as np
import pandas as pd
import regex as re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
import treetaggerwrapper
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils.validation import column_or_1d
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

### Loading the data

In [1100]:
csv = pd.read_csv('IMDB.csv') # 50,000 labeled {positive, negative} movie reviews

In [1102]:
csv.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Data Pre-Processing

In [None]:
# Collection of reviews <- Reviews from the csv
collection = [review for review in csv.iloc[:,0]]

# 'y' into SKLearn (without OneHotEncoding) <- Sentiments from the csv
y = csv.iloc[:, 1]

In [1121]:
# This is what the raw documents in the collection look like:
collection[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

### Generating the Corpus

In [1091]:
# OBJECTS & SETS

# Tokenizer (Regex)
tokenizer = RegexpTokenizer(r'\w+')
# Lemmatizer (TreeTagger)
tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
# Stemmer (PorterStemmer)
stemmer = PorterStemmer()
# Stopwords (NLTK)
stopWords = [stemmer.stem(sw) for sw in stopwords.words('english')]
# Punctuation (Custom)
punct = list(r"!`\"»«',(-....:;<>?)")

In [1092]:
def generate_corpus(collection):
    
    """
    Description: Normalizes each document in a collection to generate a corpus.
    
    - Input: 
        A list of raw documents [[doc1], [doc2],..., [docN]] where each doc is a raw string.
        
    - Output:
        A list of processed documents [[doc1], [doc2],..., [docN]] where on each doc
        the following operations have been applied:
            - Removal of non-alphabetic characters
            - Case folding of all words
            - Tokenization for each '\w+'
            - Removal of stopwords and punctuation
            - Lemmatization & stemming
    """
    
    corpus = []
    for doc in collection:
        # Alphabetization
        _doc = re.sub('[^A-Za-z]', ' ', doc)
        # Case Folding
        _doc = _doc.lower()
        # Tokenization
        _doc = tokenizer.tokenize(_doc)
        # Stop Words
        _doc = [word for word in _doc if word not in set(stopWords)]
        # Punctuation
        _doc = [word for word in _doc if word not in set(punct)]
        # Lemmatization
        _doc = [re.split(r'\t', word)[2] for word in tagger.tag_text(_doc)]
        # Stemming
        _doc = [stemmer.stem(word) for word in _doc]
        # Joining
        _doc = ' '.join(_doc)
        # Appending
        if _doc != []:
            corpus.append(_doc)
    print("No. of Documents in the Corpus:", len(corpus))
    
    return corpus

## Term-Document Matrix

### Bag-Of-Words (BOW) Matrix

In [1106]:
def generate_bow_matrix(corpus, max_features):
    
    """
    Description: Generates a Bag-Of-Words Matrix.
    
    - Input: A corpus of processed documents [doc1, doc2,..., docN] & a specified number
    of maximum features (unique words).
    
    - Output: A (m x n) bag-of-words matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum features.
    """
    
    # Model
    bow_vectorizer = CountVectorizer(min_df=1,
                                     max_features=max_features)
    # Fit
    bow_vectorizer.fit(corpus)
    # Index
    index = ["document{}".format(i) for i in range(len(corpus))]
    # Columns
    columns = bow_vectorizer.get_feature_names() # Vocabulary
    # DataFrame
    bow_matrix = pd.DataFrame(bow_vectorizer.transform(collection).toarray(),
                       index=index,
                       columns=columns)
    
    print("A", bow_matrix.shape, "BOW Matrix has been generated.\n")
    
    return bow_matrix

### Term Frequency-Inverse Document Frequency (TF-IDF) Matrix

In [1107]:
def generate_tfidf_matrix(corpus, max_features):
    
    """
    Description: Generates a TF-IDF Matrix.
    
    - Input: A corpus of processed documents [doc1, doc2,..., docN] & a specified number
    of maximum features (unique words).
    
    - Output: A (m x n) TF-IDF matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum features.
    """

    # Model
    tfidf_vectorizer = TfidfVectorizer(min_df=1,
                                       max_features=max_features)
    # Fit 
    tfidf_vectorizer.fit(corpus)
    # Index
    index = ["document{}".format(i) for i in range(len(corpus))]
    # Columns
    columns = tfidf_vectorizer.get_feature_names() # Vocabulary
    # DataFrame
    tfidf_matrix = pd.DataFrame(tfidf_vectorizer.transform(collection).toarray(),
                       index=index,
                       columns=columns) # columns=Vocabulary
    
    print("A", tfidf_matrix.shape, "TF-IDF Matrix has been generated.\n")
    
    return tfidf_matrix

## Encoding Matrix

### Principal Component Analysis (PCA) Matrix

In [1052]:
def generate_pca_matrix(matrix, n_components):
    
    """
    Description: Generates a PCA Matrix.
    
    - Input: A matrix (BOW or TF-IDF) & a specified number of maximum components
    (latent topics).
    
    - Output: A (m x n) PCA matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum components.
    """
    
    # Normalization
    matrix = matrix - matrix.mean()
    # Model
    pca = PCA(n_components=n_components)
    # Index
    index = ["document{}".format(i) for i in range(len(matrix))]
    # Columns
    columns = ["topic{}".format(i) for i in range(n_components)]
    # DataFrame
    pca_matrix = pd.DataFrame(pca.fit_transform(matrix),
                           index=index,
                           columns=columns)
    
    print("A", pca_matrix.shape, "PCA Matrix has been generated.\n")
    
    return pca_matrix

### (Truncated) Singular Value Decomposition (SVD) Matrix

In [1053]:
def generate_svd_matrix(matrix, n_components, n_iter):
    
    """
    Description: Generates a (TruncateD) SVD Matrix.
    
    - Input: A matrix (BOW or TF-IDF), a specified number of maximum components
    (latent topics) & the number of iterations of the algorithm over the data.
    
    - Output: A (m x n) SVD matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum components.
    """
    
    # Normalization
    matrix = matrix - matrix.mean()
    # Model
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter)
    # Index
    index = ["document{}".format(i) for i in range(len(matrix))]
    # Columns
    columns = ["topic{}".format(i) for i in range(n_components)]
    # DataFrame
    svd_matrix = pd.DataFrame(svd.fit_transform(matrix),
                           index=index,
                           columns=columns)
    
    print("A", svd_matrix.shape, "SVD Matrix has been generated.\n")
    
    return svd_matrix

## Similarity Matrix

### Cosine Similarity Matrix

In [1093]:
def generate_cos_similarities(matrix):
    
    """
    Description: Generates Cosine-Similarity Matrix
    
    - Input: A vectorized matrix (TF-IDF, PCA or SVD).
    
    - Output: A (m x m) matrix where m is the number of documents in the vectorized
    matrix & each element x_ij of the matrix is defined as:
    
    x_ij = cos(i,j) = dot(i,j) / norm(i)*norm(j)
    """
    
    cos_sim_matrix = pd.DataFrame(cosine_similarity(matrix, matrix))
    
    print("A", cos_sim_matrix.shape, "Cosine Similarity Matrix has been generated.\n")
    
    return cos_sim_matrix

## Text Classification: Sentiment Analysis

First, I use the functions above to generate:
    - Corpus (from IMBD)
    - BOW Matrix
    - TF-IDF Matrix
    - PCA Matrix <- TF-IDF Matrix
    - SVD Matrix <- TF-IDF Matrix
    
Then I apply the following algorithms to the data:
    - (Multinomial) Naïve Bayes (BOW & TF-IDF)
    - Logistic Regression (PCA & SVD)

Finally, for each model I calculate the accuracy %. 

In [1094]:
# Corpus
corpus = generate_corpus(collection)

No. of Documents in the Corpus: 50000


In [1122]:
# This is what the processed documents in the corpus look like:
corpus[0]

'one review have mention watch oz episod hook right thi exactli happen br br first thing strike Oz be it brutal unflinch scene violenc set right word go trust thi show faint heart timid thi show pull punch regard drug sex violenc it hardcor classic use word br br call oz nicknam give oswald maximum secur state penitentari focu mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact go show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever see strike nasti be surreal say be readi watch develop tast oz get accustom high level graphic violenc violenc injustic crook guard sell nickel inmat kill order get away well manner middl class inmat be turn prison bitch due lack street skill prison experi watch oz may becom comfort 

In [1108]:
# BOW Matrix
bow_matrix = generate_bow_matrix(corpus, 30000)

A (50000, 30000) BOW Matrix has been generated.



In [1109]:
# TF-IDF Matrix
tfidf_matrix = generate_tfidf_matrix(corpus, 30000)

A (50000, 30000) TF-IDF Matrix has been generated.



In [1117]:
# PCA Matrix <- TF-IDF Matrix
pca_matrix = generate_pca_matrix(tfidf_matrix, 1000)

A (50000, 1000) PCA Matrix has been generated.



In [1119]:
# SVD Matrix <- TF-IDF Matrix
svd_matrix = generate_svd_matrix(tfidf_matrix, 1000, 10)

A (50000, 1000) SVD Matrix has been generated.



### (Multinomial) Naïve Bayes

In [1113]:
def multinomial_NB_classifier(matrix):
    
    """
    (Multinomial) Naïve Bayes Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # BOW or TF-IDF
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    naiveBayes = MultinomialNB()
    # Fit
    naiveBayes.fit(X_train, y_train)
    # Predictions
    y_pred = naiveBayes.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("(Multinomial) Naïve Bayes has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

#### BOW Matrix

In [1114]:
multinomial_NB_classifier(bow_matrix)

(Multinomial) Naïve Bayes has been successfully to the data.

Confusion Matrix:

       0     1
0  4258   777
1  1085  3880

Accuracy: 81.38 %


#### TF-IDF Matrix

In [1115]:
multinomial_NB_classifier(tfidf_matrix)

(Multinomial) Naïve Bayes has been successfully to the data.

Confusion Matrix:

       0     1
0  4328   707
1  1237  3728

Accuracy: 80.56 %


### Logistic Regression

In [1116]:
def logistic_regression_classifier(matrix):
    
    """
    Logistic Regression Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # BOW, TF-IDF, PCA or SVD 
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    logisticRegression = LogisticRegression(solver='lbfgs', random_state = 0)
    # Fit
    logisticRegression.fit(X_train, y_train)
    # Predictions
    y_pred = logisticRegression.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # Accuracy
    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("Logistic Regression has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

#### PCA Matrix

In [1118]:
logistic_regression_classifier(pca_matrix)

Logistic Regression has been successfully to the data.

Confusion Matrix:

       0     1
0  4058   977
1   806  4159

Accuracy: 82.17 %




#### SVD Matrix

In [1042]:
logistic_regression_classifier(svd_matrix)

Confusion Matrix:

       0     1
0  4314   721
1   612  4353

Logistic Regression Accuracy: 86.67 %
