In [2]:
import json
import pandas as pd
import numpy as np

def _read_json_lines(path):
    """Read a JSON-lines file into a DataFrame, one row per non-blank line.

    Parameters
    ----------
    path : str
        Path of a text file holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, in file order.

    Raises
    ------
    ValueError
        If a non-blank line is not valid JSON (raised by ``json.loads``).
    """
    with open(path) as f:
        # Skip whitespace-only lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        return pd.DataFrame(json.loads(line) for line in f if line.strip())


# load hipster data
df_hipster_train = _read_json_lines('./hipster_review_sample_500_train')
df_hipster_test = _read_json_lines('./hipster_review_sample_500_test')

# NOTE: pd.concat keeps each input frame's own row labels, so the combined
# frames below carry repeated index values; use positional access or
# reset_index() before any label-based lookup.
df_hipster_data = pd.concat([df_hipster_train, df_hipster_test])

# load touristy data
df_touristy_train = _read_json_lines('./touristy_review_sample_500_train')
df_touristy_test = _read_json_lines('./touristy_review_sample_500_test')

df_touristy_data = pd.concat([df_touristy_train, df_touristy_test])

# hipster rows first, then touristy rows
all_data = pd.concat([df_hipster_data, df_touristy_data])

In [4]:
from __future__ import print_function

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vocabulary size cap: passed to CountVectorizer as max_features.
n_features = 4000
# Number of topics: passed to LatentDirichletAllocation as n_components.
n_topics = 40
# How many of the highest-weighted words to print for each topic.
n_top_words = 15

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a "Topic #k:" header is printed,
    followed by one space-separated line of the ``n_top_words`` entries of
    ``feature_names`` with the largest weights, heaviest first. A single
    blank line is printed after the last topic.
    """
    topic_no = 0
    for weights in model.components_:
        # Indices sorted by ascending weight, flipped so the heaviest is first.
        ranked = weights.argsort()[::-1]
        top_terms = []
        for col in ranked[:n_top_words]:
            top_terms.append(feature_names[col])
        print("\nTopic #%d:" % topic_no)
        print(" ".join(top_terms))
        topic_no += 1
    print()


print("Loading dataset...")
# Only the hipster reviews' 'text' column is modelled in this cell.
dataset = df_hipster_data['text'].tolist()

print("Extracting tf features for LDA...")
# Raw term counts: drop English stop words, terms appearing in more than 95%
# of documents, and terms appearing in fewer than 2 documents; keep at most
# n_features terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(dataset)

print("Fitting LDA model ...")
# Fixed seed so the fitted topics are the same on every Restart & Run All;
# without it the topics change from run to run.
lda_random_state = 0
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=lda_random_state)

lda.fit(tf)

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new name, fall back for old installs.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
Extracting tf features for LDA...
Fitting LDA model ...

Topics in LDA model:

Topic #0:
99 specials cube box response cardboard provolone pointed advertised charges properly display showed chips explained

Topic #1:
omelet feta québec necessarily wine pistachio clothes stores cup bread homemade eggs open sounds plus

Topic #2:
dirty mcdonald biscuit fast rice charlotte clothing stomach biscuits parties food list berry good miss

Topic #3:
almonds beaten transformed doing indian safe happen vont baguette avocado buying compare hearts don caffeine

Topic #4:
fatburger fry gay turkey sin saying operation remind discover soul silly jukebox french diner city

Topic #5:
like bar just nice wall large busy damn tall door time noisy garage live probably

Topic #6:
pate french ham prosciutto greens tomato veal montreal villette museum la meats favorite fruits salami

Topic #7:
just great place bar pool watch nice furniture like songs girls contemporary owners garden sit

Topi