In [1]:
import os
import string

import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from pandas import DataFrame
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



In [8]:
# Directory that holds the review export. The default is the location the
# notebook was originally written against; set REVIEWS_DATA_DIR to run it
# on another machine without editing this cell.
DATA_DIR = os.environ.get("REVIEWS_DATA_DIR", "E:\\Consultancy\\Crayon")
os.chdir(DATA_DIR)

# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces from_csv's defaults (first column becomes the
# index, and pandas attempts to parse that index as dates).
df = pd.read_csv("100K_Restaurants_reviews.tsv", sep="\t", index_col=0, parse_dates=True)

# Upper bound on vocabulary size; passed as max_features to both vectorizers.
no_features = 1000
# Number of topics to extract (the NMF cell further down reassigns this).
no_topics = 10

In [28]:
# NMF is a linear-algebra factorisation, so it is fed TF-IDF weighted features.
# Terms appearing in more than 95% of reviews or in fewer than 2 reviews are
# dropped, English stop words are removed, and the vocabulary is capped at
# no_features terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df['ReviewTxt'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0). Use whichever the installed
# version provides, and normalise to a plain list of str so downstream
# indexing and printing behave the same either way.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = [str(name) for name in tfidf_vectorizer.get_feature_names_out()]
else:
    tfidf_feature_names = [str(name) for name in tfidf_vectorizer.get_feature_names()]


In [29]:
# The topic count is switched to 20 here. Any cell executed after this one that
# reads `no_topics` sees 20, not the 10 assigned in the configuration cell.
no_topics = 20

# Build the NMF estimator with a fixed seed and NNDSVD initialisation, then fit
# it on the TF-IDF matrix.
# NOTE(review): newer scikit-learn releases replace the `alpha` keyword with
# `alpha_W` / `alpha_H` -- confirm against the installed version.
nmf = NMF(
    n_components=no_topics,
    init='nndsvd',
    l1_ratio=.5,
    alpha=.1,
    random_state=1,
)
nmf.fit(tfidf)


In [31]:
n_top_words = 5

# Each row of nmf.components_ holds one topic's weight for every vocabulary term.
# np.argsort returns the indices that would sort an array in ASCENDING order,
#   e.g. np.argsort([3, 7, 1, 0, 3, 6]) -> [3, 2, 0, 4, 5, 1]
# so the ranking is reversed to put the heaviest terms first, and the leading
# n_top_words indices are looked up in the TF-IDF vocabulary.
topic_words = {
    topic_id: [
        tfidf_feature_names[term_idx]
        for term_idx in np.argsort(weights)[::-1][:n_top_words]
    ]
    for topic_id, weights in enumerate(nmf.components_)
}
print(topic_words)

{0: ['place', 'visit', 'friends', 'hangout', 'hang'], 1: ['just', 'time', 'like', 'ordered', 'menu'], 2: ['good', 'overall', 'pretty', 'music', 'quite'], 3: ['food', 'quality', 'price', 'ambiance', 'average'], 4: ['great', 'music', 'food', 'view', 'experience'], 5: ['chicken', 'burger', 'biriyani', 'wings', 'ordered'], 6: ['veg', 'non', 'starters', 'buffet', 'main'], 7: ['nice', 'friends', 'hangout', 'overall', 'hang'], 8: ['awesome', 'just', 'loved', 'simply', 'try'], 9: ['beer', 'music', 'beers', 'toit', 'brewed'], 10: ['best', 'bangalore', 'town', 'places', 've'], 11: ['dosa', 'masala', 'coffee', 'dosas', 'benne'], 12: ['ambience', 'food', 'loved', 'music', 'perfect'], 13: ['indian', 'restaurant', 'north', 'south', 'chinese'], 14: ['service', 'excellent', 'slow', 'ambiance', 'bad'], 15: ['biryani', 'mutton', 'taste', 'boneless', 'andhra'], 16: ['amazing', 'just', 'try', 'simply', 'loved'], 17: ['money', 'value', 'worth', '10', 'waste'], 18: ['love', 'just', 'absolutely', 'burgers', 

In [6]:
# LDA is a probabilistic graphical model, so it is fitted on raw term counts
# rather than TF-IDF weights. The filtering settings mirror the TF-IDF cell:
# drop terms in more than 95% of reviews or in fewer than 2, remove English
# stop words, and cap the vocabulary at no_features terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(df['ReviewTxt'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0). Use whichever the installed
# version provides, and normalise to a plain list of str.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = [str(name) for name in tf_vectorizer.get_feature_names_out()]
else:
    tf_feature_names = [str(name) for name in tf_vectorizer.get_feature_names()]

In [9]:
# Fit online LDA on the raw term-count matrix with a fixed seed.
# The topic-count keyword is `n_components`: the older `n_topics` spelling was
# deprecated in scikit-learn 0.19 and removed in 0.21, and `n_components` also
# matches the keyword the NMF cell already uses.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)




In [11]:
# Collect the highest-weighted vocabulary terms for every LDA topic.
# Note: this rebinds `topic_words`, replacing the mapping built for NMF.
n_top_words = 5
topic_words = {}

for topic_id, term_weights in enumerate(lda.components_):
    # np.argsort yields indices in ASCENDING order of weight
    #   e.g. np.argsort([3, 7, 1, 0, 3, 6]) -> [3, 2, 0, 4, 5, 1]
    # so reverse it to rank the heaviest terms first.
    ranked_terms = np.argsort(term_weights)[::-1]
    top_terms = ranked_terms[:n_top_words]

    # Translate the winning column indices into the count-vectorizer vocabulary.
    topic_words[topic_id] = [tf_feature_names[term_idx] for term_idx in top_terms]

In [29]:
# FIXME(review): this cell was saved as the incomplete expression `lda.`, which
# is a SyntaxError under Restart & Run All. The recorded output (-1) suggests the
# attribute inspected was `evaluate_every` -- this is a guess; confirm with the
# author, or delete the cell if it was only a tab-completion leftover.
lda.evaluate_every

-1

In [None]:
# A scikit-learn LatentDirichletAllocation estimator is not subscriptable, so
# the gensim-style `lda[document]` lookup raises TypeError. Per-document topic
# weights come from transform(), applied to the same term-count matrix the
# model was fitted on (raw text must go through tf_vectorizer first).
doc_topic_dist = lda.transform(tf)

# doc_topic_dist has one row per review (same order as df['ReviewTxt']) and one
# column per topic. Only a short preview is printed; the full matrix stays in
# doc_topic_dist for further analysis instead of flooding the cell output with
# one line per review.
n_preview = 10
for doc_topics in doc_topic_dist[:n_preview]:
    print(doc_topics)