In [1]:
import pandas as pd
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

In [2]:
print(len(data))
print(data.head(5))

1082168
   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [64]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [80]:
#토큰화
import nltk
text['headline_text'] = pd.DataFrame({'document':text['headline_text']})
tokenized_doc = text['headline_text'].apply(lambda x: x.split()) # 토큰화
text['headline_text'] =text['headline_text'].apply(lambda x: x.split()) # 토큰화

print(tokenized_doc[:5])
print(text.head(5))

0    [decides, community, broadcasting, licence]
1     [fire, witnesses, must, aware, defamation]
2    [calls, infrastructure, protection, summit]
3                    [staff, aust, strike, rise]
4       [strike, affect, australian, travellers]
Name: headline_text, dtype: object
                                 headline_text
0  [decides, community, broadcasting, licence]
1   [fire, witnesses, must, aware, defamation]
2  [calls, infrastructure, protection, summit]
3                  [staff, aust, strike, rise]
4     [strike, affect, australian, travellers]


In [81]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])
print(text.head(5))

                                 headline_text
0  [decides, community, broadcasting, licence]
1   [fire, witnesses, must, aware, defamation]
2  [calls, infrastructure, protection, summit]
3                  [staff, aust, strike, rise]
4     [strike, affect, australian, travellers]


In [82]:
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(5))

                                headline_text
0     [decide, community, broadcast, licence]
1    [fire, witness, must, aware, defamation]
2  [call, infrastructure, protection, summit]
3                 [staff, aust, strike, rise]
4    [strike, affect, australian, travellers]


In [83]:
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [84]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000) # 상위 1,000개의 단어를 보존 
X = vectorizer.fit_transform(text['headline_text'])
X.shape # TF-IDF 행렬의 크기 확인

(1082168, 1000)

In [86]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)
lda_top=lda_model.fit_transform(X)
print(lda_model.components_)
print(lda_model.components_.shape) 

[[1.69780007e+03 1.00009454e-01 1.00008808e-01 ... 1.00002655e-01
  1.00001171e-01 1.00003333e-01]
 [1.00010749e-01 1.00005216e-01 1.00005479e-01 ... 1.00006045e-01
  1.00002917e-01 7.56281699e+02]
 [1.00002664e-01 2.75535526e+03 1.00010855e-01 ... 1.00006171e-01
  1.00005181e-01 1.00005444e-01]
 ...
 [1.00004306e-01 1.00007591e-01 1.00018292e-01 ... 1.00004813e-01
  1.00002256e-01 1.00003850e-01]
 [1.00004955e-01 1.00008398e-01 3.20022176e+03 ... 1.00003864e-01
  1.00004122e-01 1.00004587e-01]
 [1.00002817e-01 1.00006578e-01 1.00007865e-01 ... 1.00006806e-01
  1.00002823e-01 1.00004215e-01]]
(10, 1000)


In [87]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(lda_model.components_,terms)

Topic 1: [('south', 6702.75), ('years', 5096.96), ('jail', 4610.17), ('life', 4280.38), ('labor', 4057.55)]
Topic 2: [('australian', 11121.65), ('plan', 6039.68), ('interview', 5926.19), ('change', 5863.79), ('home', 5677.19)]
Topic 3: [('police', 12099.7), ('sydney', 8417.37), ('kill', 5852.89), ('test', 5064.43), ('drug', 4291.04)]
Topic 4: [('report', 5621.96), ('rural', 5500.28), ('death', 4978.13), ('state', 4928.71), ('hospital', 4331.42)]
Topic 5: [('attack', 6967.31), ('market', 5546.53), ('coast', 5434.03), ('tasmanian', 4866.34), ('shoot', 4499.39)]
Topic 6: [('australia', 13740.9), ('trump', 11955.89), ('charge', 8437.42), ('murder', 6268.67), ('house', 6131.75)]
Topic 7: [('crash', 5284.45), ('north', 5147.84), ('woman', 4568.87), ('west', 4004.58), ('budget', 3672.16)]
Topic 8: [('queensland', 7740.25), ('canberra', 6117.06), ('live', 5544.74), ('rise', 4045.23), ('national', 4040.47)]
Topic 9: [('election', 7824.88), ('melbourne', 7567.88), ('court', 7548.24), ('adelaide'