In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import lda
from sklearn.externals import joblib
import time

In [2]:
# Column names for the CSV files, which are read with header=None below
# (no header row is taken from the file).
header = ['class_id', 'title', 'description']

# Training split.
train_df = pd.read_csv(
    '../data/db_pedia/train.csv',
    header=None,
    names=header,
)

In [3]:
# Sanity check: show the first rows of the training frame.
train_df.head()

Unnamed: 0,class_id,title,description
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [4]:
# Lower-case both free-text columns of the training frame in place.
for text_col in ('title', 'description'):
    train_df[text_col] = train_df[text_col].str.lower()

In [5]:
# One text string per row: the title and the description joined by a single space.
# NOTE(review): a missing title or description would presumably break this
# element-wise concatenation; confirm the CSV has no empty cells.
train_x = train_df.title.values + ' ' + train_df.description.values

In [6]:
# Bag-of-words vectorizer configured with scikit-learn's built-in English stop-word list.
bow_model = CountVectorizer(stop_words='english')

In [7]:
# Fit the vectorizer on the training text and keep the resulting count matrix.
bow = bow_model.fit_transform(train_x)

In [8]:
# Peek at the first ten entries of the fitted vocabulary.
bow_model.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '00000006',
 '000001',
 '000002',
 '00000972',
 '000012',
 '000015',
 '000018']

In [9]:
# Persist the fitted vectorizer, then rebind bow_model to the copy read back
# from disk so the cells below work with exactly what was saved.
bow_model_path = '../models/dbpedia_bow_model.pkl'
joblib.dump(bow_model, bow_model_path)
bow_model = joblib.load(bow_model_path)

In [10]:
# Rebuild the training count matrix with the vectorizer reloaded from disk
# (this overwrites the matrix produced by fit_transform above).
bow = bow_model.transform(train_x)

In [12]:
# Settings for the LDA training cell below.
# n_topics is a list: one model is fitted per entry.
# Other topic counts noted here previously: [20] and [200].
n_iter = 2000
n_topics = [100]

In [None]:
# Fit one LDA model per entry in n_topics, save each fitted model with joblib,
# and append a line with its fit duration to a plain-text timing file.
with open('./dbpedia_topic100iter2000_done_time.txt', 'w') as f:
    for n in n_topics:
        # perf_counter is monotonic, so the elapsed value cannot be skewed by
        # wall-clock adjustments during a long run.
        start = time.perf_counter()
        lda_model = lda.lda.LDA(n_topics=n, n_iter=n_iter, random_state=0)
        lda_model.fit(bow)
        # Stop the clock before serializing: the figure is labelled
        # "train time", so it should not include the time spent writing
        # the model to disk.
        elapsed = time.perf_counter() - start
        joblib.dump(lda_model, '../models/dbpedia_lda_model_{}_{}iter.pkl'.format(n, n_iter))
        print("topic_N =", str(n), "train time", elapsed, file=f)
        # Write the line out immediately so a failure while fitting a later
        # model does not lose the timings already recorded.
        f.flush()

INFO:lda:n_documents: 560000
INFO:lda:vocab_size: 671754
INFO:lda:n_words: 17243036
INFO:lda:n_topics: 100
INFO:lda:n_iter: 2000
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<10> log likelihood: -185963427
INFO:lda:<20> log likelihood: -166643540
INFO:lda:<30> log likelihood: -161749909
INFO:lda:<40> log likelihood: -159516258
INFO:lda:<50> log likelihood: -158240453
INFO:lda:<60> log likelihood: -157440623
INFO:lda:<70> log likelihood: -156915282
INFO:lda:<80> log likelihood: -156550047
INFO:lda:<90> log likelihood: -156293965
INFO:lda:<100> log likelihood: -156084553
INFO:lda:<110> log likelihood: -155927969
INFO:lda:<120> log likelihood: -155795867
INFO:lda:<130> log likelihood: -155691814
INFO:lda:<140> log likelihood: -155620747
INFO:lda:<150> log likelihood: -155542469
INFO:lda:<160> log likelihood: -155480517
INFO:lda:<170> log likelihood: -155423854
INFO:lda:<180> log likelihood: -155367053
INFO:lda:<190> log likelihood: -155311589
INFO:lda:<200> log likelih

In [14]:
# Load the saved LDA model; the path matches what the training cell writes
# for n_topics=100 and n_iter=2000.
lda_model_100 = joblib.load('../models/dbpedia_lda_model_100_2000iter.pkl')

In [15]:
# Vocabulary terms of the vectorizer, used below to turn indices into words.
# NOTE(review): newer scikit-learn releases replace get_feature_names with
# get_feature_names_out; check the installed version before upgrading.
feature_names = bow_model.get_feature_names()

In [16]:
# For every row of the model's components_ matrix, print the 20 vocabulary
# terms with the largest values, highest first.
for topic_weights in lda_model_100.components_:
    descending = np.argsort(topic_weights)[::-1]
    print(np.array([feature_names[term_idx] for term_idx in descending[:20]]))

['book' 'history' 'life' 'written' 'jewish' 'work' 'human' 'israel'
 'political' 'social' 'world' 'non' 'israeli' 'hebrew' 'synagogue'
 'published' 'people' 'culture' 'author' 'modern']
['class' 'railway' 'built' 'locomotive' 'electric' 'locomotives' 'steam'
 'railways' 'railroad' 'station' 'diesel' 'line' 'train' 'rail' 'service'
 'type' 'series' 'used' 'operated' 'unit']
['records' 'label' 'record' 'music' 'hop' 'hip' 'independent' 'group'
 'producer' 'dj' 'rapper' 'based' 'american' 'artists' 'known' 'founded'
 'artist' 'better' 'released' 'dance']
['company' 'air' 'based' 'airline' 'international' 'services' 'airport'
 'owned' 'power' 'oil' 'bus' 'operates' 'service' 'airlines' 'operated'
 'energy' 'gas' 'transit' 'main' 'charter']
['family' 'republic' 'butterfly' 'congo' 'africa' 'kenya' 'african'
 'tanzania' 'democratic' 'uganda' 'ghana' 'central' 'cameroon' 'consists'
 'guinea' 'coast' 'western' 'sierra' 'zimbabwe' 'nigeria']
['television' 'born' 'play' 'actor' 'known' 'theatre'

In [None]:
# Vectorize the training text again and run it through the loaded LDA model.
# theta_docs_100 is presumably one row of topic weights per document; the
# next cell labels its 100 columns as topic0..topic99.
bow = bow_model.transform(train_x)
theta_docs_100 = lda_model_100.transform(bow)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [18]:
# Attach the inferred topic weights to the training rows and write the
# combined table to CSV.
topic_100_df = pd.DataFrame(theta_docs_100)

# Name the columns from the array's actual width rather than a hard-coded
# count, so the labels always match whatever model produced theta_docs_100.
topic_100_df.columns = ['topic' + str(i) for i in range(topic_100_df.shape[1])]

# theta_docs_100 is reused for the test split further down; if this cell is
# run with the wrong one in memory, pd.concat(axis=1) would not fail but
# silently pad the shorter frame with NaN. Fail loudly instead.
assert len(topic_100_df) == len(train_df), (
    'theta_docs_100 has {} rows but train_df has {}; '
    're-run the training-set transform cell first'.format(len(topic_100_df), len(train_df))
)

pd.concat([train_df, topic_100_df], axis=1).to_csv('../data/db_pedia/topic100_train.csv', index=False)

In [19]:
# Test split: read with the same column names as the training data, then
# lower-case the two text columns in place.
test_df = pd.read_csv(
    '../data/db_pedia/test.csv',
    header=None,
    names=header,
)
for text_col in ('title', 'description'):
    test_df[text_col] = test_df[text_col].str.lower()

# One text string per row: title and description joined by a single space.
test_x = test_df['title'].values + ' ' + test_df['description'].values

In [20]:
# Vectorize the test text with the already-fitted vectorizer and run it
# through the loaded LDA model.
# Both bow and theta_docs_100 are rebound here, replacing the training-set values.
bow = bow_model.transform(test_x)
theta_docs_100 = lda_model_100.transform(bow)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [21]:
# Attach the inferred topic weights to the test rows and write the combined
# table to CSV.
topic_100_df = pd.DataFrame(theta_docs_100)

# Name the columns from the array's actual width rather than a hard-coded
# count, so the labels always match whatever model produced theta_docs_100.
topic_100_df.columns = ['topic' + str(i) for i in range(topic_100_df.shape[1])]

# theta_docs_100 is also used for the training split earlier; if it still
# holds those values, pd.concat(axis=1) would not fail but silently pad the
# shorter frame with NaN. Fail loudly instead.
assert len(topic_100_df) == len(test_df), (
    'theta_docs_100 has {} rows but test_df has {}; '
    're-run the test-set transform cell first'.format(len(topic_100_df), len(test_df))
)

pd.concat([test_df, topic_100_df], axis=1).to_csv('../data/db_pedia/topic100_test.csv', index=False)