# Topic Modeling (with LSA) & Sentiment Analysis (with NB and LR)

## 1. Libraries

In [13]:
import numpy as np
import pandas as pd
import regex as re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import treetaggerwrapper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from tqdm import tqdm_notebook

## 2. Data Loading & Pre-Processing

In [14]:
# Loading the Data
csv = pd.read_csv('IMDB.csv') # 50,000 labeled {positive, negative} movie reviews

# Collection of reviews <- 'Review' column from the csv
collection = [review for review in csv.iloc[:,0]]

# 'y' into SKLearn (without OneHotEncoding) <- 'Sentiment' column from the csv
y = csv.iloc[:, 1]

## 3. Functions

### 3.1 Generating the Corpus & the Vocabulary

In [15]:
def generate_corpus_vocabulary(collection):
    
    """
    Description: Normalizes each document in a collection to generate a corpus & the vocabulary.
    
    - Input: 
        (1) A list of raw documents [[doc1],[doc2],...,[docN]] where each doc is a raw string.
        
    - Output:
        (1) A list of processed documents [[doc1], [doc2],..., [docN]] where on each doc
        the following operations have been applied:
            - Removal of non-alphabetic characters
            - Case folding of all words
            - Tokenization for each '\w+'
            - Removal of stopwords and punctuation
            - Lemmatization & stemming
        (2) A list [[word1],[word2],...,[wordN]] of all the unique words in the corpus.
    """
    
    # --------------------OBJECTS--------------------
    
    # Stopwords (NLTK)
    stopWords = stopwords.words('english')
    # Punctuation (Custom)
    punct = list(r"!`\"»«',(-....:;<>?)")
    # Lemmatizer (TreeTagger)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    # Stemmer (PorterStemmer)
    stemmer = SnowballStemmer("english")
    
    # --------------------LOOP--------------------
    
    corpus = []
    for doc in tqdm_notebook(collection):
        # Alphabetization
        _doc = re.sub('[^A-Za-z]', ' ', doc)
        # Case Folding
        _doc = _doc.lower()
        # Tokenization
        _doc = word_tokenize(_doc)
        # StopWords
        _doc = [word for word in _doc if not word in stopWords]
        # Punctuation
        _doc = [word for word in _doc if not word in punct]
        # Lemmatization
        _doc = [re.split(r'\t', word)[2] for word in tagger.tag_text(_doc)]
        # Stemming
        # _doc = [stemmer.stem(word) for word in _doc] # Better without
        # Joining
        _doc = ' '.join(_doc)
        # Appending
        if _doc != '':
            corpus.append(_doc)
    
    # --------------------VOCABULARY--------------------
    
    vocabulary = sorted(list(set(' '.join(corpus).split(" "))))
    
    # --------------------PRINT--------------------
    
    print("No. of Documents in the Corpus:", len(corpus))
    print("No. of Unique Words in the Corpus:", len(vocabulary))
    
    return corpus, vocabulary

### 3.2 Generating a Bag-Of-Words (BOW) Matrix

In [25]:
def generate_bow_matrix(corpus, min_df, vocabulary):
    
    """
    Description: Generates a Bag-Of-Words Matrix.
    
    - Input: A corpus of processed documents [doc1, doc2,..., docN].
    
    - Output: A (m x n) bag-of-words matrix, where m is the number of documents 
    in the corpus and n is the number of specified maximum features.
    """
    
    # Model
    bow_vectorizer = CountVectorizer(min_df=min_df,
                                     vocabulary=vocabulary)
    # Fit & Transform
    bow_matrix = bow_vectorizer.transform(collection)

    print("A", bow_matrix.shape, "BOW Matrix has been generated.\n")
    
    return bow_matrix

### 3.3 Transforming BOW Matrix into TF-IDF Matrix

In [17]:
def transform_into_tfidf(bow_matrix):
    
    """
    Description: Transforms a BOW Matrix into a TF-IDF Matrix.
    
    - Input: A BOW Matrix & a specified number N of maximum
    features(i.e., top N most frequent unique words).
    
    - Output: A (m x n) TF-IDF matrix, where m is the number of documents
    in the corpus and n is the number of specified maximum features.
    """

    # Model
    tfidf_transformer = TfidfTransformer()
    # Fit & Transform
    tfidf_matrix = tfidf_transformer.fit_transform(bow_matrix).toarray()
    
    print("A", bow_matrix.shape, "BOW Matrix has been transformed into a", tfidf_matrix.shape, "TF-IDF Matrix.\n")
    
    return tfidf_matrix

### 3.4 Transforming TF-IDF Matrix into (Truncated) SVD Matrix

In [18]:
def transform_into_svd(tfidf_matrix, n_components, n_iter):
    
    """
    Description: Transforms a TF-IDF Matrix into a (TruncateD) SVD Matrix.
    
    - Input: A TF-IDF Matrix, a specified number of maximum components
    (latent topics), & a specified number of iterations of the algorithm.
    
    - Output: A (m x n) SVD matrix, where m is the number of documents in the 
    corpus and n is the number of specified maximum components.
    """
    
    # Normalization
    normalizer = Normalizer(copy=False)    
    tfidf_matrix = normalizer.fit_transform(tfidf_matrix)
    # Model
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter)
    # Fit 
    svd.fit(tfidf_matrix)
    # Topics
    topics = svd.components_
    # Index
    index = ["document{}".format(i) for i in range(len(tfidf_matrix))]
    # Columns
    columns = ["topic{}".format(i) for i in range(n_components)]
    # DataFrame
    svd_matrix = pd.DataFrame(svd.transform(tfidf_matrix),
                              index=index,
                              columns=columns)
    
    print("A", tfidf_matrix.shape, "TF-IDF Matrix has been transformed into a", svd_matrix.shape, "SVD Matrix.\n")
    
    return svd_matrix, topics

### 3.5 Extracting Latent Topics from (Truncated) SVD Matrix 

In [71]:
def latent_topics(terms, topics):
    for i, topic in tqdm_notebook(enumerate(topics)):
        terms_topic = zip(terms, topic)
        sorted_terms = sorted(terms_topic, key = lambda x : x[1], reverse=True)[:10]
        print("Topic " + str(i) + ": ")
        for t in sorted_terms:
            print(t[0])

### 3.6 Computing Most (Cosine-Wise) Similar Terms

In [20]:
def compute_cos_sim(term):
    """
    Computes the cosine similarity of one TF-IDF-vectorized term with respect to
    the vocabulary and shows the top 10 most similar ones.
    
    - Input: Term.
    - Output: Real-valued number [0,1].
        
    """
    
    TF_IDF = pd.DataFrame(tfidf_matrix, columns=vocabulary)
    cos_sim = {}
    for i in tqdm_notebook(vocabulary):
        cos_sim[i] = cosine_similarity([TF_IDF[term].values], [TF_IDF[i].values]).round(3)
    _cos_sim = pd.Series(cos_sim).sort_values(ascending=False)
        
    print(f"Top 20 terms most (cosine-wise) similar to '{term}':")

    return pd.DataFrame(_cos_sim, columns=['Cosine']).head(21)

### 3.7 (Multinomial) Naïve Bayes Classifier

In [49]:
def multinomial_NB_classifier(matrix):
    
    """
    (Multinomial) Naïve Bayes Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # BOW or TF-IDF
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    naiveBayes = MultinomialNB()
    # Fit
    naiveBayes.fit(X_train, y_train)
    # Predictions
    y_pred = naiveBayes.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("(Multinomial) Naïve Bayes has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

### 3.8 Logistic Regression Classifier

In [37]:
def logistic_regression_classifier(matrix):
    
    """
    Logistic Regression Classifier.
    """
    
    # Splitting Data into Train & Test Set
    X_train, X_test, y_train, y_test = train_test_split(matrix, # SVD 
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)
    # Model
    logisticRegression = LogisticRegression(solver='lbfgs',
                                            random_state = 0)
    # Fit
    logisticRegression.fit(X_train, y_train)
    # Predictions
    y_pred = logisticRegression.predict(X_test)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # Accuracy
    accuracy = ((cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])) * 100
    
    print("Logistic Regression has been successfully applied to the data.\n")
    print("Confusion Matrix:\n\n", pd.DataFrame(cm))
    print("\nAccuracy:", accuracy, "%")

## 4. Execution & Evaluation

Using the functions above...

First, I generate:
    - Corpus <- IMBD.csv
    - BOW Matrix <- Corpus
    - TF-IDF Matrix <- Bow Matrix
    - (Truncated) SVD Matrix <- TF-IDF Matrix

Then I extract the 1000 Latent Topics from the SVD Matrix, and compute
the top 20 terms cosine-wise most similar terms for 'bad' and 'good'. 

Finally, I apply the following algorithms to the data:
    - (Multinomial) Naïve Bayes <- BOW Matrix
    - (Multinomial) Naïve Bayes <- TF-IDF Matrix
    - Logistic Regression <- (Truncated) SVD Matrix

And for each model I calculate the % of accuracy. 

In [23]:
# Corpus <- IMBD.csv
corpus, vocabulary = generate_corpus_vocabulary(collection)

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))


No. of Documents in the Corpus: 50000
No. of Unique Words in the Corpus: 86892


In [27]:
# BOW Matrix <- Corpus
bow_matrix = generate_bow_matrix(corpus,
                                 min_df=2,
                                 vocabulary=vocabulary)

A (50000, 86892) BOW Matrix has been generated.



In [28]:
# TF-IDF Matrix <- BOW Matrix
tfidf_matrix = transform_into_tfidf(bow_matrix)

A (50000, 86892) BOW Matrix has been transformed into a (50000, 86892) TF-IDF Matrix.



In [40]:
# SVD Matrix <- TF-IDF Matrix
svd_matrix, topics = transform_into_svd(tfidf_matrix, 
                                         n_components=1000,
                                         n_iter=10)

A (50000, 86892) TF-IDF Matrix has been transformed into a (50000, 1000) SVD Matrix.



In [72]:
# Latent Topics <- SVD Matrix
latent_topics(vocabulary, topics)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Topic 0: 
movie
to
and
but
have
you
in
not
be
if
Topic 1: 
worst
bad
waste
horrible
movie
ever
terrible
money
don
acting
Topic 2: 
love
great
recommend
favorite
watch
see
will
wonderful
movie
you
Topic 3: 
waste
worst
ever
money
terrible
life
read
book
horrible
seen
Topic 4: 
acting
story
horror
good
overall
recommend
plot
action
great
suspense
Topic 5: 
comedy
worst
funny
ever
cast
actor
laugh
bad
hilarious
performance
Topic 6: 
sci
fi
dvd
martial
arts
series
copy
vhs
video
action
Topic 7: 
funny
martial
jackie
arts
action
chan
comedy
humor
hong
fight
Topic 8: 
martial
arts
ever
jackie
chan
action
worst
hong
fight
seen
Topic 9: 
scarlett
rhett
eyre
dalton
book
jane
zelah
timothy
rochester
gwtw
Topic 10: 
show
fi
sci
episode
season
ideally
canceled
series
network
wrap
Topic 11: 
scarlett
rhett
ever
laugh
funny
worst
halloween
eyre
dalton
seen
Topic 12: 
scarlett
rhett
rent
buy
store
video
gwtw
rental
dvd
wind
Topic 13: 
gooding
cuba
sci
fi
jr
game
budget
fun
bad
graphics
Topic 14: 
por

Topic 109: 
paulie
laurel
hardy
parrot
dry
stan
paint
malden
cinemax
recommend
Topic 110: 
ricci
bean
christina
atkinson
surprised
rowan
pleasantly
expecting
curly
stooge
Topic 111: 
twilight
zone
capshaw
vin
diesel
degeneres
riddick
surprised
jamie
pleasantly
Topic 112: 
cinemax
kung
hartnett
fu
klein
estevez
paint
miserably
dry
sit
Topic 113: 
palma
dickinson
angie
hartnett
de
klein
twilight
nancy
leelee
bynes
Topic 114: 
bean
chapa
niro
damian
spinal
atkinson
excited
tap
meg
halloween
Topic 115: 
seagal
val
wasting
hartnett
klein
kilmer
steven
regret
paulie
leelee
Topic 116: 
vin
diesel
riddick
pitch
stirling
duchovny
minnie
hawes
nan
david
Topic 117: 
homicide
dumb
cage
bayliss
whoopi
vin
diesel
riddick
nicolas
goldberg
Topic 118: 
brokedown
danes
burt
thailand
ricci
ustinov
newhart
malden
christina
kate
Topic 119: 
bean
atkinson
hartnett
rowan
klein
leelee
josh
christmas
busey
sobieski
Topic 120: 
pie
asleep
stirling
witherspoon
estevez
hawes
nan
reese
rachael
quaid
Topic 121: 
fi

Topic 209: 
remotely
beckham
unpredictable
bend
heather
keira
parminder
darkness
graham
busby
Topic 210: 
figured
madsen
warn
bother
lackawanna
blockbuster
minnie
sheffer
jonathan
plain
Topic 211: 
figured
miserably
stahl
fool
laugh
tonight
suchet
martial
ashley
neeson
Topic 212: 
memento
nolan
minnie
lackawanna
remotely
satisfied
list
expecting
grow
lower
Topic 213: 
cillian
mcadams
columbo
val
weak
storyline
price
craven
gordon
thriller
Topic 214: 
trust
crouse
mantegna
lorelai
unpredictable
con
ringwald
gilmore
bunch
full
Topic 215: 
crouse
mantegna
con
chucky
lindsay
susie
chance
opinion
sean
canceled
Topic 216: 
halfway
unless
niro
wardrobe
understand
tired
casting
capshaw
night
throughly
Topic 217: 
entertaining
expected
scary
matrix
animatrix
relax
neeson
drunk
rob
tenacious
Topic 218: 
diaz
cameron
shelf
toole
columbo
hong
kong
warn
twilight
fourth
Topic 219: 
pulp
chainsaw
pie
elijah
martial
beaten
tongue
cheek
ending
expected
Topic 220: 
dentist
tired
quinn
pointless
believab

Topic 308: 
hardly
lackawanna
idiotic
dentist
alot
epatha
corbin
dom
nanny
deluise
Topic 309: 
brite
justice
rainbow
heartwarming
russo
fawcett
farrah
simba
exciting
petition
Topic 310: 
whats
finish
dolph
benson
frustrated
touched
thriller
cartoon
romance
surprised
Topic 311: 
streisand
barbra
judy
garland
wow
hawn
coaster
unless
regret
entertaining
Topic 312: 
enjoyment
browsing
susie
scary
cleaning
neeson
ok
romantic
preview
cameron
Topic 313: 
unrealistic
heather
worms
browsing
pathetic
barrymore
graham
possibly
made
unpredictable
Topic 314: 
awesome
greatest
bless
brite
animatrix
judge
hewlett
deciding
trinity
elvis
Topic 315: 
ernest
directing
ghoulies
bed
prom
barrymore
toole
sleep
remotely
warming
Topic 316: 
slow
entertain
love
entertaining
better
trust
sorry
tear
avid
comment
Topic 317: 
cheek
tear
misfortune
tongue
jerker
satisfied
touched
touching
dont
tape
Topic 318: 
brite
opinion
rainbow
scenery
unique
finish
misleading
crouse
graham
heather
Topic 319: 
antwone
stay
feld

Topic 404: 
canceled
relate
annoyed
disappointed
asking
directing
worthwhile
found
expected
romance
Topic 405: 
satisfy
agreed
annoyed
wish
forwarding
jovi
ranger
peculiar
awesome
jolie
Topic 406: 
weak
ariel
melody
heartwarming
leave
morgana
watching
plan
forget
mermaid
Topic 407: 
trust
admit
summer
mess
becuase
dukes
feldman
wanting
insult
mayall
Topic 408: 
doubt
lower
admit
touching
simpsons
dolph
glad
gein
cube
hyped
Topic 409: 
mardi
gras
remotely
mst
thinking
madsen
start
feldman
concept
shame
Topic 410: 
susie
nostalgic
sematary
value
martial
brite
bonham
beforehand
arts
camcorder
Topic 411: 
suffice
heather
existent
favor
thankfully
burstyn
trust
mislead
twice
flynch
Topic 412: 
relate
mormon
hyped
trust
cabin
dull
drama
attention
stupid
scary
Topic 413: 
brite
scarecrow
rainbow
agreed
hopefully
susie
hope
mine
slow
pleasantly
Topic 414: 
mcqueen
negative
ernest
twice
alot
celine
inspirational
jesse
relate
sexually
Topic 415: 
annoyed
happen
everybody
thriller
headache
idioti

Topic 504: 
ghoulies
worms
vu
deja
simpsons
cleaning
grow
differently
manna
remotely
Topic 505: 
concept
asking
izzard
cgi
endure
belushi
spend
automatically
kill
sullavan
Topic 506: 
impressive
hyped
weak
advise
cleaning
low
alas
puerile
penelope
ranger
Topic 507: 
puppet
honestly
corelli
mandolin
reading
cheesy
avid
intended
admit
somethings
Topic 508: 
cbs
buying
browsing
ernest
sincerely
porno
nostalgic
preview
mardi
okay
Topic 509: 
development
grab
clad
touching
away
thankfully
clever
manna
convoluted
walking
Topic 510: 
keitel
eagle
mtv
harvey
dull
suck
regret
continued
spike
ernest
Topic 511: 
browsing
barney
disgraceful
mardi
buying
childish
gras
creative
outcome
suspenseful
Topic 512: 
barbie
bonham
blethyn
helena
collection
guinea
pig
opinion
want
enjoyment
Topic 513: 
slower
alba
winded
toy
angel
figured
vu
deja
suspenseful
group
Topic 514: 
wanting
trailer
mean
incredibly
vu
deja
cause
gory
bogosian
without
Topic 515: 
melissa
want
hard
indiana
sagemiller
cbs
upset
cringe


Topic 602: 
barbie
describe
annoyed
considering
burstyn
awful
poitier
planning
idiotic
talented
Topic 603: 
horrid
trouble
tsui
mst
hark
disappointment
trailer
greatest
merk
imdb
Topic 604: 
robot
jox
biko
werewolf
ketchup
close
minus
premise
unintentional
scary
Topic 605: 
theater
pick
casting
recognize
judgement
drag
trier
gretchen
substance
von
Topic 606: 
burstyn
expectation
touching
mathis
mst
mess
weekend
beginning
ferland
samantha
Topic 607: 
mature
gretchen
bettie
ghoulies
page
existent
chance
whenever
hope
qualify
Topic 608: 
jumpy
awesome
couldnt
someones
ruin
know
mencia
bought
trust
cusack
Topic 609: 
thereof
belushi
dolph
inspirational
worth
checking
barbie
ghoulies
tops
lundgren
Topic 610: 
annoyed
gossett
special
night
ridiculous
refreshing
jeremy
swank
outline
streisand
Topic 611: 
tired
relate
walking
view
scamp
norma
tramp
beginning
spoiler
unintentional
Topic 612: 
hardly
poitier
believable
crappy
bore
scared
put
till
stranger
bess
Topic 613: 
ashamed
suspenseful
ima

Topic 696: 
rothrock
improvement
tried
anyways
plain
buffalo
suggested
refund
boggy
predict
Topic 697: 
sadness
carlyle
amazed
rothrock
jox
nt
sorta
natasha
heaton
henstridge
Topic 698: 
qualify
caliber
campy
gadget
wanting
positive
arctic
peterson
darkness
keitel
Topic 699: 
lasting
fun
lately
pathetic
btk
check
online
rental
gadget
excuse
Topic 700: 
caprica
besides
entertaining
loving
delirious
silence
badly
gratification
mature
expect
Topic 701: 
twist
lasting
spending
difficult
message
ninja
partially
impression
weak
believable
Topic 702: 
everyone
impressive
sympathy
hyped
commenter
profanity
feel
kid
found
compliment
Topic 703: 
alot
screwed
bogged
visiteurs
prefer
wait
notch
stahl
eventually
sense
Topic 704: 
afraid
merit
planning
guarantee
weak
money
someones
sleeping
greatly
agree
Topic 705: 
available
concept
regret
suffer
fourth
previous
til
people
plain
ok
Topic 706: 
theres
sesame
rothrock
goodfellas
dramatic
cynthia
porgy
bess
round
couldnt
Topic 707: 
secondly
izzard
de

Topic 790: 
armageddon
braindead
collection
honestly
happening
rarely
fx
cliche
anybody
whoever
Topic 791: 
rothrock
cynthia
barbie
armageddon
falling
unfortunately
birthday
ok
promise
appreciate
Topic 792: 
makeup
airing
technically
basically
patience
hell
grandma
passable
simply
authentic
Topic 793: 
awake
suggested
min
forum
paying
flop
mafia
dahmer
prepared
yzma
Topic 794: 
rothrock
dribble
learn
require
outline
cynthia
min
entertainment
peterson
keitel
Topic 795: 
certainly
mcteer
qualify
lost
decided
terrible
theatre
absolute
plain
frighten
Topic 796: 
waiting
ketchup
repeated
amazed
upsetting
hate
want
treat
rate
insist
Topic 797: 
sale
childish
cool
gadget
grab
inspector
accept
website
mask
scale
Topic 798: 
flop
leguizamo
touching
simpsons
drag
nia
tate
larenz
drunk
anton
Topic 799: 
likes
either
lots
sucking
relive
freebird
avid
wardrobe
porno
try
Topic 800: 
outdated
nonstop
jeremy
tired
fathom
predict
slap
nearly
quiet
advance
Topic 801: 
suggested
favour
definitely
avid
ko

Topic 888: 
studied
somethings
direction
someday
ghoulies
potential
noticeable
title
improve
unsettling
Topic 889: 
favour
grab
approximately
message
browsing
hopefully
crappy
porno
mislead
cheezy
Topic 890: 
airing
simpsons
unique
amused
theater
silverman
automatically
fishing
beautiful
upsetting
Topic 891: 
panel
compare
interfere
porno
stretch
braindead
csi
dismay
replay
stays
Topic 892: 
cheezy
bed
satisfy
appreciate
pick
springer
rothrock
amateur
inspirational
cynthia
Topic 893: 
fx
purchase
harmony
lampoon
mafia
flooding
improvement
manna
heck
quit
Topic 894: 
outline
determine
ugh
promised
percent
platoon
told
awe
excited
dismay
Topic 895: 
qualify
misfortune
attacking
cable
result
afternoon
expecting
strother
dooley
somewhere
Topic 896: 
frightening
focused
contain
unexplained
suck
start
cant
cliche
follow
scenery
Topic 897: 
cleaning
library
whatnot
besides
daughter
undoubtedly
twist
ugh
dandy
stereo
Topic 898: 
whimsical
asterix
accomplish
astin
roberts
volunteer
compelling
c

Topic 980: 
nonstop
iii
somebody
screen
tediously
makeup
beginner
bound
sooner
publicity
Topic 981: 
least
tight
spirit
consideration
super
idiotic
learned
afterward
puppet
apparent
Topic 982: 
scare
urge
correctly
braindead
wil
wider
never
anatomie
transylvania
grace
Topic 983: 
natasha
henstridge
accomplish
dahmer
merit
rookie
dumb
dedication
matrix
valley
Topic 984: 
romance
lifetime
screw
cheezy
freaky
carpenter
consequences
terminal
mutiny
mummy
Topic 985: 
remind
winded
easily
makeup
killjoy
hidden
maximum
checked
unexplained
bother
Topic 986: 
poster
effort
fallen
untergang
relaxing
child
grace
der
sketch
impossible
Topic 987: 
candidate
mcteer
approximately
twister
considering
janet
unoriginal
ridiculous
winded
negative
Topic 988: 
disgraceful
zoom
bought
heigl
promising
partial
humorous
suggested
intend
grass
Topic 989: 
keep
rothrock
awake
important
stolen
embarrassed
hahaha
manna
probably
beach
Topic 990: 
picking
still
mess
greatness
brilliant
unexplained
infected
raptor
co

In [31]:
# Term: 'bad'
compute_cos_sim('bad')

HBox(children=(IntProgress(value=0, max=86892), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'bad':


Unnamed: 0,Cosine
bad,[[1.0]]
movie,[[0.345]]
to,[[0.336]]
acting,[[0.324]]
but,[[0.319]]
and,[[0.317]]
so,[[0.315]]
not,[[0.298]]
in,[[0.296]]
just,[[0.289]]


In [32]:
# Term: 'good'
compute_cos_sim('good')

HBox(children=(IntProgress(value=0, max=86892), HTML(value='')))


Top 20 terms most (cosine-wise) similar to 'good':


Unnamed: 0,Cosine
good,[[1.0]]
and,[[0.461]]
but,[[0.459]]
to,[[0.437]]
in,[[0.41]]
movie,[[0.41]]
not,[[0.385]]
have,[[0.351]]
br,[[0.35]]
be,[[0.349]]


In [50]:
# (Multinomial) Naïve Bayes <- BOW Matrix
multinomial_NB_classifier(bow_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4434   601
1   899  4066

Accuracy: 85.0 %


In [44]:
# (Multinomial) Naïve Bayes <- TF-IDF Matrix
multinomial_NB_classifier(tfidf_matrix)

(Multinomial) Naïve Bayes has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4293   742
1   965  4000

Accuracy: 82.93 %


In [46]:
# Logistic Regression <- (Truncated) SVD Matrix
logistic_regression_classifier(svd_matrix)

Logistic Regression has been successfully applied to the data.

Confusion Matrix:

       0     1
0  4302   733
1   518  4447

Accuracy: 87.49 %
