In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import lda
from sklearn.externals import joblib
import time

In [None]:
# Column labels for the header-less ag_news CSV files.
header = ['class_id', 'title', 'description']
train_df = pd.read_csv(
    '../data/ag_news/train.csv',
    names=header,
    header=None,
)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
train_df.head()

In [None]:
# Lower-case both text columns of the training frame.
train_df['title'] = train_df['title'].str.lower()
train_df['description'] = train_df['description'].str.lower()

In [None]:
# One document per row: title and description joined by a single space.
train_x = train_df['title'].values + ' ' + train_df['description'].values

In [None]:
# bow_model = CountVectorizer(stop_words='english')

In [None]:
# bow = bow_model.fit_transform(train_x)

In [None]:
# bow_model.get_feature_names()[:10]

In [None]:
# Presumably the vectorizer was fit and saved once via the commented-out
# cells above and this dump line; the notebook now only reloads it.
# joblib.dump(bow_model, '../models/bow_model.pkl')
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
bow_model = joblib.load('../models/bow_model.pkl')

In [None]:
# Vectorize the training documents with the already-fitted model
# (transform only, no re-fitting). This is the input to LDA training.
bow = bow_model.transform(train_x)

In [None]:
# Topic counts to train; one LDA model is fit per entry.
n_topics = [20, 100, 200]
# Iteration count handed to every LDA model as `n_iter`; it is also embedded
# in the saved model file names.
n_iter = 2000

In [None]:
# Fit one LDA model per topic count, save each model, and log how long each
# fit took to a text file.
with open('./ag_news_iter2000_done_time.txt', 'w') as f:
    for n in n_topics:
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long run.
        start = time.perf_counter()
        lda_model = lda.lda.LDA(n_topics=n, n_iter=n_iter, random_state=0)
        lda_model.fit(bow)
        # Stop the clock before serialising so "train time" covers fit() only.
        elapsed = time.perf_counter() - start
        joblib.dump(lda_model, '../models/lda_model_{}_{}iter.pkl'.format(n, n_iter))
        print("topic_N =", str(n), "train time", elapsed, file=f)
        # Flush so each finished model's timing is on disk right away, while
        # the remaining fits are still running.
        f.flush()

In [None]:
# Reload the 100-topic model; the file name matches what the training loop
# writes for n=100, n_iter=2000.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
lda_model_100 = joblib.load('../models/lda_model_100_2000iter.pkl')

In [None]:
# Vocabulary terms of the vectorizer; indexed below with positions taken
# from each topic's weight vector.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(). This notebook imports joblib from
# sklearn.externals, so it appears to target an older release — confirm the
# environment before changing this call.
feature_names = bow_model.get_feature_names()

In [None]:
# Print the 20 highest-weighted vocabulary terms of every topic.
for topic in lda_model_100.components_:
    # argsort is ascending, so reverse it and keep the first 20 positions.
    ranked = np.argsort(topic)[::-1][:20]
    top_words = np.array([feature_names[idx] for idx in ranked])
    print(top_words)

In [None]:
# Re-vectorize the training documents and run them through the 100-topic
# model; the result is wrapped in a DataFrame (one column per topic) below.
bow = bow_model.transform(train_x)
theta_docs_100 = lda_model_100.transform(bow)

In [None]:
# `theta_docs_100` is reused for the train and the test split, so make sure
# it currently matches `train_df`. concat(axis=1) would otherwise pad the
# shorter side with NaN instead of failing.
assert len(theta_docs_100) == len(train_df), "theta_docs_100 does not match train_df"

# One column per topic. The column count comes from the matrix itself rather
# than a hard-coded 100, so the labels always match the loaded model.
topic_100_df = pd.DataFrame(
    theta_docs_100,
    columns=['topic' + str(i) for i in range(theta_docs_100.shape[1])],
)

# Append the topic columns to the original rows and save.
pd.concat([train_df, topic_100_df], axis=1).to_csv('../data/ag_news/topic100_train.csv', index=False)

In [None]:
# Load the test split with the same column labels as the training split.
test_df = pd.read_csv(
    '../data/ag_news/test.csv',
    names=header,
    header=None,
)

# Same preprocessing as for the training data: lower-case both text columns.
test_df['title'] = test_df['title'].str.lower()
test_df['description'] = test_df['description'].str.lower()

# One document per row: title and description joined by a single space.
test_x = test_df['title'].values + ' ' + test_df['description'].values

In [None]:
# Vectorize the test documents and run them through the 100-topic model.
# Note: `bow` and `theta_docs_100` are overwritten here, so from this cell on
# they hold the test split, not the training split.
bow = bow_model.transform(test_x)
theta_docs_100 = lda_model_100.transform(bow)

In [None]:
# `theta_docs_100` is reused for the train and the test split, so make sure
# it currently matches `test_df`. concat(axis=1) would otherwise pad the
# shorter side with NaN instead of failing.
assert len(theta_docs_100) == len(test_df), "theta_docs_100 does not match test_df"

# One column per topic. The column count comes from the matrix itself rather
# than a hard-coded 100, so the labels always match the loaded model.
topic_100_df = pd.DataFrame(
    theta_docs_100,
    columns=['topic' + str(i) for i in range(theta_docs_100.shape[1])],
)

# Append the topic columns to the original rows and save.
pd.concat([test_df, topic_100_df], axis=1).to_csv('../data/ag_news/topic100_test.csv', index=False)

### Topic features from the 20-topic model

In [None]:
# Reload the 20-topic model; the file name matches what the training loop
# writes for n=20, n_iter=2000.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
lda_model_20 = joblib.load('../models/lda_model_20_2000iter.pkl')

feature_names = bow_model.get_feature_names()

# Print the 20 highest-weighted vocabulary terms of every topic.
for topic in lda_model_20.components_:
    sorted_index = np.argsort(topic)[::-1]
    top_words = np.array([feature_names[i] for i in sorted_index[:20]])
    print(top_words)

# Run the same steps for the train split and then the test split: vectorize,
# run the documents through the model, label one column per topic, and save
# the original rows plus topic columns to CSV.
for split_df, split_x, out_path in (
    (train_df, train_x, '../data/ag_news/topic20_train.csv'),
    (test_df, test_x, '../data/ag_news/topic20_test.csv'),
):
    bow = bow_model.transform(split_x)
    theta_docs_20 = lda_model_20.transform(bow)

    # Column count comes from the matrix itself rather than a hard-coded 20.
    topic_20_df = pd.DataFrame(
        theta_docs_20,
        columns=['topic' + str(i) for i in range(theta_docs_20.shape[1])],
    )

    pd.concat([split_df, topic_20_df], axis=1).to_csv(out_path, index=False)

### Topic features from the 200-topic model

In [None]:
# Reload the 200-topic model; the file name matches what the training loop
# writes for n=200, n_iter=2000.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
lda_model_200 = joblib.load('../models/lda_model_200_2000iter.pkl')

feature_names = bow_model.get_feature_names()

# Print the 20 highest-weighted vocabulary terms of every topic.
for topic in lda_model_200.components_:
    sorted_index = np.argsort(topic)[::-1]
    top_words = np.array([feature_names[i] for i in sorted_index[:20]])
    print(top_words)

# Run the same steps for the train split and then the test split: vectorize,
# run the documents through the model, label one column per topic, and save
# the original rows plus topic columns to CSV.
for split_df, split_x, out_path in (
    (train_df, train_x, '../data/ag_news/topic200_train.csv'),
    (test_df, test_x, '../data/ag_news/topic200_test.csv'),
):
    bow = bow_model.transform(split_x)
    theta_docs_200 = lda_model_200.transform(bow)

    # Column count comes from the matrix itself rather than a hard-coded 200.
    topic_200_df = pd.DataFrame(
        theta_docs_200,
        columns=['topic' + str(i) for i in range(theta_docs_200.shape[1])],
    )

    pd.concat([split_df, topic_200_df], axis=1).to_csv(out_path, index=False)