In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from time import time

In [2]:
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")

In [3]:
data['review'] = data['review'].apply(lambda doc: doc.replace("<br />", " ") )  

In [5]:
vectorizer = CountVectorizer(max_features=10000, max_df=.15,stop_words='english')
X = vectorizer.fit_transform(data.review)

In [6]:
feature_names = vectorizer.get_feature_names()

In [7]:
feature_names = np.array(feature_names)
feature_names

array(['00', '000', '01', ..., 'zoom', 'zorro', 'zu'], dtype='<U17')

In [36]:
tsvd = TruncatedSVD(n_components=10,n_iter=100)
document_topics_svd = tsvd.fit_transform(X)

In [37]:
document_topics_svd.shape

(25000, 10)

In [53]:
document_topics_svd[:10,:]

array([[ 3.53283514e+00, -1.24712934e+00, -3.16698289e-01,
         6.84033015e-01,  4.20053041e-01,  4.07582626e-01,
        -8.30695637e-01, -2.33134649e-01,  4.02611699e-01,
        -3.83181622e-01],
       [ 1.42191362e+00,  3.78031980e-01,  2.12787479e-01,
         9.44534504e-02, -6.07471266e-01,  4.15587145e-01,
        -5.33383796e-01,  1.48920036e+00,  9.28817919e-02,
        -2.23502870e-01],
       [ 2.51699009e+00,  1.55914416e-01, -2.44395002e-01,
        -6.73739067e-01, -8.24049510e-01,  4.59933623e-01,
         3.70925065e-01, -7.82501321e-01, -1.14114760e-01,
         1.15790811e-02],
       [ 1.96791825e+00, -7.26915914e-02,  3.78761132e-02,
         6.11255353e-01, -9.78804943e-01,  6.34031250e-01,
        -7.97098762e-02,  4.41820164e-01, -1.61737925e-01,
        -4.28879466e-01],
       [ 2.87334263e+00, -2.24277955e-01, -2.00429985e-01,
         9.81999917e-02, -1.02166105e-02,  1.97915743e-01,
        -1.72778872e-02, -2.18458990e-01,  3.54997516e-01,
        -4.

In [55]:
np.argsort(document_topics_svd[:10,:])[::-1]

array([[7, 9, 8, 6, 2, 3, 1, 5, 4, 0],
       [5, 2, 7, 9, 1, 8, 3, 4, 6, 0],
       [9, 1, 6, 8, 3, 7, 2, 4, 5, 0],
       [1, 3, 5, 6, 7, 2, 9, 4, 8, 0],
       [1, 7, 9, 4, 3, 6, 2, 5, 8, 0],
       [9, 1, 7, 2, 6, 4, 3, 5, 8, 0],
       [4, 9, 8, 6, 1, 2, 7, 3, 5, 0],
       [4, 7, 3, 2, 8, 9, 1, 6, 5, 0],
       [4, 6, 9, 8, 3, 2, 1, 5, 0, 7],
       [1, 6, 9, 2, 7, 8, 5, 4, 3, 0]], dtype=int64)

In [38]:
sorted_tsvd_components = np.argsort(tsvd.components_, axis=1)[:, ::-1]
sorted_tsvd_components

array([[2748, 7206, 6086, ...,  647, 9673, 8013],
       [9973, 7945, 9685, ..., 9026, 2608, 4402],
       [7945, 6323, 3133, ..., 3413, 9876, 9973],
       ...,
       [1121, 4846, 7198, ..., 1855, 9685,  236],
       [ 236, 4085, 6323, ..., 9903, 9685, 4402],
       [7945, 7206,  242, ..., 6323, 6086, 6261]], dtype=int64)

In [39]:
sorted_tsvd_components.shape

(10, 10000)

In [47]:
# adapted from https://github.com/amueller/introduction_to_ml_with_python/blob/master/mglearn/tools.py

def print_topics(topics, feature_names, sorting, topics_per_chunk=6,
                 n_words=20):
    for i in range(0, len(topics), topics_per_chunk):
        these_topics = topics[i: i + topics_per_chunk]
        len_this_chunk = len(these_topics)
        print(("topic {:<8}" * len_this_chunk).format(*these_topics))
        print(("-------- {0:<5}" * len_this_chunk).format(""))
        # print top n_words frequent words
        for i in range(n_words):
            try:
                print(("{:<14}" * len_this_chunk).format(
                    *feature_names[sorting[these_topics, i]]))
            except:
                pass
        print("\n")
        


In [40]:
print_topics(topics=range(10), feature_names=feature_names,sorting=sorted_tsvd_components,
             topics_per_chunk=5, n_words=10)


topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
doesn         young         series        funny         family        
real          series        original      comedy        series        
new           war           episode       didn          match         
old           family        tv            match         old           
director      world         episodes      got           house         
work          new           new           10            father        
thing         years         season        big           girl          
years         role          action        guy           guy           
actually      father        version       rock          episode       
makes         performance   batman        thing         mother        


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
matc

In [34]:
def print_original_reviews(topics,num, num_reviews):
    topic = np.argsort(topics[:,num])[::-1]
    # print the num_reviews documents where the topic is most important
    for i in topic[:num_reviews]:
        # show first two sentences
        print(".".join(data.iloc[i]['review'].split(".")[:2]) + ".\n")

In [41]:
print_original_reviews(document_topics_svd,7,10)

I felt duty bound to watch the 1983 Timothy Dalton / Zelah Clarke adaptation of \Jane Eyre,\" because I'd just written an article about the 2006 BBC \"Jane Eyre\" for TheScreamOnline.  So, I approached watching this the way I'd approach doing homework.

The 1983 BBC production of \Jane Eyre\" starring Zelah Clarke and Timothy Dalton (LOVE HIM) has always been and will always be my favorite Jane Eyre. If you watch any other version of Jane Eyre without reading the book, it will be like watching some regular movie which you will forget the next day.

There are many adaptations of Charlotte Brontë's classic novel \Jane Eyre\", and taking into consideration the numerous reviews written about them there is also a lively discussion on which of them is the best. The short film adaptations all suffer from the fact that it is simply not possible to cram the whole plot of the novel into a movie of about a 100 min.

This TV production of 1970 starring Susannah York and George C. Scott is another 

In [42]:
start = time()
lda10 = LatentDirichletAllocation(n_components=10, learning_method="batch",
max_iter=25, random_state=0)
document_topics_lda_10 = lda10.fit_transform(X)
print('LDA10 took {} minutes'.format((time() - start)/60.))

LDA10 took 21.570063312848408 minutes


In [21]:
lda10.components_.shape

(10, 10000)

In [22]:
sorted_lda10_components = np.argsort(lda10.components_, axis=1)[:, ::-1]
sorted_lda10_components

array([[7603, 6565, 1468, ..., 3780, 6087, 5082],
       [ 236, 4842, 7603, ..., 9986, 2842, 3584],
       [2608, 9026,  242, ...,  769,  960,  873],
       ...,
       [3377, 9973, 3891, ..., 5648, 5848,  882],
       [1121, 9548, 7198, ..., 1425, 8908, 4299],
       [4402, 3954, 2977, ..., 3584, 4299,  769]], dtype=int64)

In [23]:
print_topics(topics=range(10), feature_names=feature_names,sorting=sorted_lda10_components,
             topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
role          action        didn          world         game          
performance   james         thing         director      action        
cast          role          actors        work          guy           
john          john          worst         real          car           
murder        western       funny         human         gets          
wife          stewart       actually      audience      guys          
plays         fight         want          makes         kids          
director      cast          10            cinema        world         
robert        plays         minutes       feel          thing         
actor         jack          script        point         going         


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
musi

In [43]:
print_original_reviews(document_topics_lda_10,5,10)

This is one of the finest music concerts anyone will ever see and hear. I grew up when All My Lovin' was brand new and to hear it again today by the original artist today is a measure of Sir P Mc's power to spellbind any crowd of any age.

This is one of my two or three favorite Stooges shorts, and undoubtedly Christine McIntyre's best performance with the trio. She is good in a number of other shorts, but here she is absolutely brilliant.

The Great Caruso displays the unique talents of Mario Lanza. He shows great acting capacity and is in top form as a lyrical singer, paired with Dorothy Kirsten, soprano of the Metropolitan Opera.

Blank Check is a movie that I saw on TV one day and like most movies they air on TV Blank Check wasn't that good. First of all no one I have ever met has seen Blank Check and that includes people that grew up in the 90s.

If people didn't know who Barbra Streisand was before this,..

I will never forget when I saw this title in the video store way back whe

In [27]:
start = time()
lda100 = LatentDirichletAllocation(n_components=100, learning_method="batch",
max_iter=25, random_state=0)
document_topics_lda_100 = lda100.fit_transform(X)
print('LDA100 took {} minutes'.format((time() - start)/60.))

LDA100 took 27.538193794091544 minutes


In [28]:
topics = np.array([7,10,16, 25, 28, 36, 22, 40, 51, 53, 54, 63, 89, 97])
sorted_lda_components = np.argsort(lda100.components_, axis=1)[:, ::-1]
feature_names = np.array(vectorizer.get_feature_names())
print_topics(topics=topics, feature_names=feature_names,
sorting=sorted_lda_components, topics_per_chunk=4, n_words=25)

topic 7       topic 10      topic 16      topic 25      
--------      --------      --------      --------      
king          dr            che           disbelief     
jack          lugosi        revolution    invisible     
freddy        karloff       von           swedish       
hotel         ray           power         hollow        
stephen       dracula       fido          verhoeven     
nightmare     mad           rangers       university    
lion          scientist     soderbergh    paul          
alice         bela          revolutionary problem       
shining       jet           europa        suspend       
paulie        rukh          danish        kevin         
street        sellers       cuba          suspension    
kubrick       li            del           valentine     
nicholson     old           trier         bacon         
stanley       universal     salman        big           
danny         mask          timmy         implausible   
wendy         boris         bas

In [46]:
print_original_reviews(document_topics_lda_100,28,15)

It's been a while since I've watched this movie, and the series, but now I'm refreshing my memory! This was a very funny movie based on the classic series! Johnny Knoxville and Seann William Scott were hilarious together. Bo and Luke Duke help Uncle Jesse run Moonshine in the General Lee.

George Cukor directs a brooding and cynical classic. The distinctive Ronald Coleman is at his best in this piece of Noir about an actor who loses himself in his roles.

BEGIN SPOILER: Fitfully funny and memorable for Mr. Chong's literal roach-smoking scene: Chong coolly mashes a stray kitchen cockroach into his pipe's bowl, lights up, coughs and hacks violently for a seeming eternity,then with perfect aplomb and not skipping a beat, re-loads the bowl properly, re-lights, re-tokes.

Every motion picture Bette Davis stars in is worth experiencing. Before Davis co-stars with Leslie Howard in \Of Human Bondage,\" she'd been in over a score of movies.

When an actor has to play the role of an actor, ficti