# Inferring Topics from IMDB Reviews

In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt

## Exploring the Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)

In [2]:
ROOT = '../neuralnets/aclImdb/train/pos/'

In [3]:
reviews = []
for file in os.listdir(ROOT):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):
        with open(path, 'r') as fin:
            reviews.append(fin.read())

In [4]:
len(reviews)

12500

In [5]:
for i in range(3):
    print(reviews[i])
    print('=' * 150)

Not wishing to give *anything* away here, I would just say this technically excellent, flawlessly acted and uplifting little flic will reward the viewer with an excellent hour and a half's entertainment: It will amuse, surprise, possibly embarrass occasionally and almost certainly tug at the heartstrings from time to time, as it approaches the inevitable, but not obvious, ending without becoming clichéd or predictable in any way. Most definitely recommended.<br /><br />A previous User's Comment gives 8 out of 10 for the film and 10 out of 10 for both Branagh and Bonham-Carter's outstanding performances - I agree entirely....
Wrestlemania 14 is not often looked as one of the great Wrestlemania's but I would personally put it, in my top 5, if not the top 3. It has so many great things, and it truly signified the birth of The Attitude Era, which was WWE's best era, in my opinion. HBK has the heart of a lion, and him putting over Austin like he did, on his way out, was pure class on his pa

## Feature Extraction

In [8]:
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(reviews)

pd.DataFrame(X.toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,000,000s,003830,006,007,0079,0080,0083,0093638,...,élan,émigré,émigrés,était,état,étc,êxtase,ís,østbye,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [14]:
N_TOPICS = 15
nmf = NMF(n_components=N_TOPICS)
W = nmf.fit_transform(X)  # Document-topic matrix
H = nmf.components_       # Topic-term matrix



In [15]:
# Top 10 words per topic

words = np.array(vect.get_feature_names())
topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index=[f'Topic {i + 1}' for i in range(N_TOPICS)],
                           columns=[f'Word {i + 1}' for i in range(10)]).astype(str)
for i in range(N_TOPICS):
    ix = H[i].argsort()[::-1][:10]
    topic_words.iloc[i] = words[ix]

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,end,simply,yes,spoiler,quite,just
Topic 2,movie,movies,watch,recommend,10,seen,saw,best,actors,definitely
Topic 3,film,films,director,characters,seen,cinema,festival,work,scenes,art
Topic 4,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 5,man,role,character,performance,best,plays,john,played,does,actor
Topic 6,good,pretty,story,bad,acting,really,job,liked,nice,little
Topic 7,war,world,documentary,people,american,history,soldiers,men,women,hitler
Topic 8,funny,comedy,laugh,hilarious,eddie,fun,jokes,humor,funniest,murphy
Topic 9,like,think,really,just,don,people,know,say,didn,lot
Topic 10,time,years,saw,seen,dvd,old,remember,ve,music,disney


In [16]:
# Create a topic mapping

topic_mapping = {
    'Topic 4': 'TV',
    'Topic 7': 'War',
    'Topic 8': 'Comedy',
    'Topic 12': 'Book Adaptation',
    'Topic 13': 'Horror',
    'Topic 15': 'Martial Arts / Action'
}

In [17]:
# Recall the document-topic matrix, W

W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])
W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)
W[pd.notnull(W['max_topic'])].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
2,0.028314,0.0,0.022122,0.00148,0.023043,0.002044,0.030939,0.0,0.006389,0.0,0.000774,0.007251,0.0,0.003574,0.0,War
16,0.000251,0.0,0.001575,0.0,0.029132,0.002257,0.0,0.033108,0.016283,0.0,0.012337,0.0,0.003595,0.011944,0.010159,Comedy
18,0.029574,0.0,0.01901,0.001797,0.016906,0.008574,0.000129,0.03801,0.005558,0.00625,0.036652,0.0,0.0,0.0,0.0,Comedy
26,0.015179,0.000349,0.0,0.0,0.015907,0.012349,0.0,0.034328,0.015722,0.008809,0.004318,0.0,0.0,0.001958,0.000922,Comedy
27,0.031523,0.008099,0.000171,0.003151,0.009975,0.001411,0.035158,0.042588,0.0,0.0,0.001425,0.002624,0.0,0.003865,0.002781,Comedy
29,0.0,0.000614,0.0,0.0,0.0,0.014862,0.0,0.014987,0.010941,0.0,0.0,0.001534,0.066263,0.0,0.036239,Horror
30,0.023404,0.012107,0.016814,0.0,0.008135,0.00962,0.001377,0.040382,0.000809,0.004582,0.004803,0.001186,0.014194,0.0,0.0,Comedy
31,0.012324,0.003554,0.028753,0.0,0.017125,0.003483,0.006804,0.0,0.003702,0.0,0.006449,0.000833,0.034161,0.005682,0.0,Horror
34,0.0,0.016503,0.0,0.0,0.013825,0.0,0.0,0.038567,0.004479,0.021462,0.0,0.0,0.0,0.010132,0.0,Comedy
58,0.000228,0.046686,0.0,0.0,0.0001,0.0,0.004866,0.0,0.001639,0.013741,0.037063,0.069237,0.0,0.012097,0.0,Book Adaptation


In [21]:
reviews[58]

'In my humble opinion, this movie did not receive the recognition it deserved. Robert Redford lives near me here in Provo, Utah, at Sundance. I enjoy most of his work, and this was my favorite. I\'m sorry that more people didn\'t appreciate it. My grandmother was an avid reader and read the book years before it came out on the big screen. She gave it to me to read after we had seen the movie together. The movie and book hit an emotional spot within my heart, and I was weepy for several days after seeing the movie. Sometimes love isn\'t enough to keep our loved ones from hurting themselves. We see this in our own family relationships, yet our love and our families and their stories endure throughout generations of time. The cinematography was perfect and breathtaking -- I was awed by its beauty and how well it brought to life the words of the author of the book, Norman Maclean, "But when I am alone in the half light of the canyon, all existence seems to fade to a being with my soul, and