In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import lda
from sklearn.externals import joblib
import time

In [4]:
# The CSV files are read without a header row, so the column names are supplied here.
header = ['class_id', 'review']
# header=None keeps the first line of the file as data; names= labels the columns.
train_df = pd.read_csv(
    '../data/yelp_review_full/train.csv',
    names=header,
    header=None,
)

In [5]:
# Preview the first rows to confirm both columns were parsed.
train_df.head()

Unnamed: 0,class_id,review
0,5,dr. goldberg offers everything i look for in a...
1,2,"Unfortunately, the frustration of being Dr. Go..."
2,4,Been going to Dr. Goldberg for over 10 years. ...
3,4,Got a letter in the mail last week that said D...
4,1,I don't know what Dr. Goldberg was like before...


In [6]:
# Lower-case every review; the column is overwritten in place on train_df.
train_df['review'] = train_df['review'].str.lower()

In [7]:
# Array of the (lower-cased) review strings, used as input to the vectorizer.
train_x = train_df['review'].values

In [8]:
# Bag-of-words vectorizer using scikit-learn's built-in English stop-word list.
bow_model = CountVectorizer(stop_words='english')

In [9]:
# Fit the vectorizer on the training reviews and vectorize them in the same call.
bow = bow_model.fit_transform(train_x)

In [10]:
# Peek at the first ten entries of the fitted vocabulary.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); it is only valid on the older release this notebook targets.
bow_model.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '0000000',
 '0000000003',
 '000000001',
 '00000001',
 '0000002']

In [11]:
# Persist the fitted vectorizer, then read it straight back so the following
# cells work with the object exactly as it is stored on disk.
bow_model_path = '../models/yelpf_bow_model.pkl'
joblib.dump(bow_model, bow_model_path)
bow_model = joblib.load(bow_model_path)

In [12]:
# Re-vectorize the training reviews with the vectorizer that was just reloaded from disk.
bow = bow_model.transform(train_x)

In [13]:
# Topic counts to train; the training cell fits one model per list entry.
# Alternative settings are kept commented out for reference.
# n_topics = [20]
n_topics = [100]
# n_topics = [200]
# Passed as n_iter to every LDA model in the training cell.
n_iter = 2000

In [None]:
# Train one LDA model per entry in n_topics, save each model, and log its duration.
#
# The log file name is derived from the current configuration instead of being
# hard-coded. For n_topics=[100] and n_iter=2000 it is still
# './yelpf_topic100iter2000_done_time.txt', but changing the config cell no
# longer writes timings under a name that describes a different run.
timing_log_path = './yelpf_topic{}iter{}_done_time.txt'.format(
    '_'.join(str(n) for n in n_topics), n_iter)
with open(timing_log_path, 'w') as f:
    for n in n_topics:
        start = time.time()
        # random_state=0 keeps the fit reproducible between runs.
        lda_model = lda.lda.LDA(n_topics=n, n_iter=n_iter, random_state=0)
        lda_model.fit(bow)
        joblib.dump(lda_model, '../models/yelpf_lda_model_{}_{}iter.pkl'.format(n, n_iter))
        # Taken after the dump, so the reported time covers both fitting and saving.
        end = time.time()
        # flush=True pushes each line out before the next fit starts, so a kernel
        # crash part-way through the list does not lose the finished timings.
        print("topic_N =", str(n), "train time", end - start, file=f, flush=True)

INFO:lda:n_documents: 650000
INFO:lda:vocab_size: 242573
INFO:lda:n_words: 39991116
INFO:lda:n_topics: 100
INFO:lda:n_iter: 2000
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -565319352
INFO:lda:<10> log likelihood: -435563787
INFO:lda:<20> log likelihood: -397002005
INFO:lda:<30> log likelihood: -382963938
INFO:lda:<40> log likelihood: -375897368
INFO:lda:<50> log likelihood: -371836039
INFO:lda:<60> log likelihood: -369324908
INFO:lda:<70> log likelihood: -367666309
INFO:lda:<80> log likelihood: -366509668
INFO:lda:<90> log likelihood: -365667582
INFO:lda:<100> log likelihood: -365043145
INFO:lda:<110> log likelihood: -364556454
INFO:lda:<120> log likelihood: -364141692
INFO:lda:<130> log likelihood: -363777666
INFO:lda:<140> log likelihood: -363505722
INFO:lda:<150> log likelihood: -363265556
INFO:lda:<160> log likelihood: -363068795
INFO:lda:<170> log likelihood: -362906311
INFO:lda:<180> log likelihood: -362752599
INFO:lda:<190> log likelihoo

In [15]:
# Reload the saved model; this is the path the training cell writes for n=100, n_iter=2000.
lda_model_100 = joblib.load('../models/yelpf_lda_model_100_2000iter.pkl')

In [16]:
# Vocabulary terms of the fitted vectorizer; indexed by position when printing topics.
feature_names = bow_model.get_feature_names()

In [17]:
# For every row of components_, print the 20 vocabulary terms with the largest values.
for topic in lda_model_100.components_:
    # Ascending argsort, reversed: vocabulary indices from largest value to smallest.
    sorted_index = topic.argsort()[::-1]
    top_words = np.array([feature_names[idx] for idx in sorted_index[:20]])
    print(top_words)

['steak' 'good' 'cooked' 'ordered' 'medium' 'rare' 'meat' 'rib' 'filet'
 'potatoes' 'dinner' 'meal' 'steaks' 'prime' 'restaurant' 'service' 'cut'
 'sides' 'steakhouse' 'mashed']
['really' 'like' 'place' 'good' 'just' 'nice' 'pretty' 'little' 'think'
 'kind' 'nthe' 'didn' 'cool' 'thing' 'lot' 'ni' 'feel' 'liked' 'bit'
 'wasn']
['menu' 'restaurant' 'dish' 'meal' 'dessert' 'course' 'dishes' 'dinner'
 'delicious' 'chef' 'duck' 'dining' 'main' 'gras' 'served' 'tapas'
 'experience' 'ordered' 'foie' 'scallops']
['bar' 'game' 'place' 'watch' 'games' 'sports' 'tv' 'great' 'good' 'fun'
 'play' 'tvs' 'beer' 'food' 'football' 'big' 'drinks' 'like' 'night'
 'watching']
['food' 'service' 'better' 'quality' 'good' 'restaurant' 'average' 'place'
 'prices' 'price' 'high' 'ok' 'mediocre' 'overpriced' 'slow' 'best' 'par'
 'bad' 'restaurants' 'decent']
['just' 'don' 'know' 'say' 'like' 'says' 'oh' 'want' 'tell' 'right'
 'comes' 'ask' 'said' 'll' 'let' 'ok' 'yes' 'guy' 'going' 'nso']
['night' 'open' 'late'

In [None]:
# Vectorize the training reviews, then pass them through the fitted LDA model.
bow = bow_model.transform(train_x)
# NOTE(review): presumably one row of topic proportions per review — confirm against the lda docs.
theta_docs_100 = lda_model_100.transform(bow)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [None]:
# theta_docs_100 is reused for both the train and the test split in this notebook,
# and pd.concat(axis=1) aligns on the index and pads with NaN instead of failing.
# Check the row counts first so stale test-split topics can never be written
# next to the training reviews.
if len(theta_docs_100) != len(train_df):
    raise ValueError(
        'theta_docs_100 has {} rows but train_df has {}; re-run the cell that '
        'transforms train_x before writing topic100_train.csv'.format(
            len(theta_docs_100), len(train_df)))

# One column per topic, named topic0 ... topic99.
topic_100_df = pd.DataFrame(
    theta_docs_100,
    columns=['topic' + str(i) for i in range(100)])

# Append the topic columns to the original frame and save without the index.
pd.concat([train_df, topic_100_df], axis=1).to_csv('../data/yelp_review_full/topic100_train.csv', index=False)

In [20]:
# Read the test split the same way as the training split: no header row, same column names.
test_df = pd.read_csv(
    '../data/yelp_review_full/test.csv',
    names=header,
    header=None,
)

# Lower-case the reviews, matching what was done to the training text.
test_df['review'] = test_df['review'].str.lower()

# Array of review strings to feed to the fitted vectorizer.
test_x = test_df['review'].values

In [21]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit).
# This overwrites `bow`, which previously held the training matrix.
bow = bow_model.transform(test_x)
# Overwrites `theta_docs_100` as well: from here on it refers to the test split.
theta_docs_100 = lda_model_100.transform(bow)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [22]:
# theta_docs_100 is reused for both the train and the test split in this notebook,
# and pd.concat(axis=1) aligns on the index and pads with NaN instead of failing.
# Check the row counts first so train-split topics can never be written next to
# the test reviews.
if len(theta_docs_100) != len(test_df):
    raise ValueError(
        'theta_docs_100 has {} rows but test_df has {}; re-run the cell that '
        'transforms test_x before writing topic100_test.csv'.format(
            len(theta_docs_100), len(test_df)))

# One column per topic, named topic0 ... topic99.
topic_100_df = pd.DataFrame(
    theta_docs_100,
    columns=['topic' + str(i) for i in range(100)])

# Append the topic columns to the original frame and save without the index.
pd.concat([test_df, topic_100_df], axis=1).to_csv('../data/yelp_review_full/topic100_test.csv', index=False)