# Topic Modeling
- **토픽 모델링(Topic Modeling)**이란 기계 학습 및 자연어 처리 분야에서 토픽이라는 문서 집합의 추상적인 주제를 발견하기 위한 통계적 모델 중 하나로, 텍스트 본문의 숨겨진 의미 구조를 발견하기 위해 사용되는 텍스트 마이닝 기법을 말한다.

## 잠재 의미 분석(Latent Semantic Analysis, LSA)
- 전체 코퍼스에서 문자 속 단어들 간의 관계를 찾아내는 자연어 처리 정보 검색 기술.
- 단어와 단어 사이, 문서와 문서 사이, 단어와 문서 사이의 의미적 유사성 점수를 찾아낸다.

In [1]:
import pandas as pd
import numpy as np
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv",
                           filename="/content/abcnews-data-text.csv")

('/content/abcnews-data-text.csv', <http.client.HTTPMessage at 0x7fb9d6586e50>)

In [5]:
data = pd.read_csv('/content/abcnews-data-text.csv', error_bad_lines = False)
data

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1082163,20170630,when is it ok to compliment a womans smile a g...
1082164,20170630,white house defends trumps tweet
1082165,20170630,winter closes in on tasmania as snow ice falls
1082166,20170630,womens world cup australia wins despite atapat...


In [6]:
data.head()  # publish_date 열은 필요없음

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [7]:
text = data[['headline_text']]
text.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [9]:
text.nunique()

headline_text    1054983
dtype: int64

In [10]:
# 중복 샘플 제거

text.drop_duplicates(inplace = True)
text = text.reset_index(drop = True)

print(len(text))

1054983


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### 데이터 정제 및 정규화

In [11]:
# NLTK 토크나이저를 이용해서 토큰화

text['headline_text'] = text.apply(lambda row:nltk.word_tokenize(row['headline_text']), axis = 1)

In [12]:
# 불용어 제거
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words) ])

In [13]:
text.head()

Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [15]:
# 단어 정규화 과정 길이가 1~2인 단어는 제거하는 전처리
# 단어 정규화 3인칭 단수 표현 -> 1인칭 변환, 과거형 동사 -> 현재형 동사 등 수행
text['headline_text'] = text['headline_text'].apply(lambda x : [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])

In [16]:
# 길이가 1~2인 단어를 제거
text = text['headline_text'].apply(lambda x:[word for word in x if len(word) > 2])
print(text[:5])

0     [aba, decide, community, broadcast, licence]
1    [act, fire, witness, must, aware, defamation]
2       [call, infrastructure, protection, summit]
3            [air, staff, aust, strike, pay, rise]
4    [air, strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [17]:
# 역토큰화 (토큰화 작업을 역으로 수행)
detokenized_doc=[]
for i in range(len(text)):
  t = ' '.join(text[i])
  detokenized_doc.append(t)

train_data = detokenized_doc

In [18]:
train_data[:5]

['aba decide community broadcast licence',
 'act fire witness must aware defamation',
 'call infrastructure protection summit',
 'air staff aust strike pay rise',
 'air strike affect australian travellers']

In [20]:
# 상위 5000개의 단어만 사용
c_vectorizer = CountVectorizer(stop_words='english', max_features= 5000)
document_term_matrix = c_vectorizer.fit_transform(train_data)

In [21]:
# DTM의 크기
print('행렬의 크기 : ', document_term_matrix.shape)  # 문서의 수 * 단어 집합의 크기

행렬의 크기 :  (1054983, 5000)


### scikit-learn Truncated SVD 활용

In [22]:
from sklearn.decomposition import TruncatedSVD

n_topics = 10
lsa_model = TruncatedSVD(n_components = n_topics)
lsa_model.fit_transform(document_term_matrix)

array([[ 1.20578344e-02, -3.56134717e-03,  1.83290592e-02, ...,
         5.11364099e-03, -2.45189575e-03,  1.08896067e-02],
       [ 2.91251012e-02, -1.05870080e-02,  1.82923347e-02, ...,
         1.64724595e-03, -7.60829386e-03, -1.09781827e-03],
       [ 5.05213693e-03, -1.93264128e-03,  9.78853158e-03, ...,
        -2.30888752e-03,  9.00708510e-05,  2.26573620e-03],
       ...,
       [ 2.97811088e-02,  5.14607131e-03,  2.47392911e-02, ...,
         3.57545741e-02,  4.65769188e-03,  2.78202203e-02],
       [ 6.08535393e-02, -1.27092234e-02,  1.39536831e-01, ...,
         9.24501534e-01,  7.89413370e-01, -2.75091076e-01],
       [ 7.14542151e-02,  2.89557814e-02,  2.17020063e-03, ...,
        -5.85412514e-03, -2.65628005e-02, -3.25230833e-02]])

In [23]:
print(np.shape(lsa_model.components_))

(10, 5000)


In [26]:
terms = c_vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
  for idx, topic in enumerate(components):
    print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])

In [27]:
# LSA에 대한 결과물

get_topics(lsa_model.components_, terms)

Topic 1: [('police', 0.74636), ('man', 0.45348), ('charge', 0.21095), ('new', 0.1409), ('court', 0.1114)]
Topic 2: [('man', 0.69352), ('charge', 0.30054), ('court', 0.16851), ('face', 0.11393), ('murder', 0.10706)]
Topic 3: [('new', 0.83689), ('plan', 0.23654), ('say', 0.18263), ('council', 0.11012), ('govt', 0.10903)]
Topic 4: [('say', 0.73969), ('plan', 0.35838), ('govt', 0.16256), ('council', 0.12831), ('urge', 0.0763)]
Topic 5: [('plan', 0.73566), ('council', 0.1768), ('govt', 0.13211), ('urge', 0.08888), ('water', 0.06645)]
Topic 6: [('govt', 0.49438), ('urge', 0.27076), ('court', 0.23967), ('fund', 0.20937), ('win', 0.1717)]
Topic 7: [('charge', 0.49961), ('court', 0.47329), ('face', 0.35458), ('plan', 0.13056), ('murder', 0.12154)]
Topic 8: [('win', 0.62941), ('court', 0.30909), ('kill', 0.22776), ('crash', 0.17665), ('australia', 0.11596)]
Topic 9: [('win', 0.61847), ('charge', 0.4712), ('open', 0.07569), ('sydney', 0.07024), ('cup', 0.06183)]
Topic 10: [('council', 0.48057), (

## 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)
- LSA에 사용한 train data를 그대로 사용할 것임

### TF-IDF 행렬 생성

In [28]:
# 상위 5000개의 단어만 사용
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features = 5000)
tf_idf_matrix = tfidf_vectorizer.fit_transform(train_data)

# TF-IDF행렬의 크기를 확인
print('행렬의 크기 : ', tf_idf_matrix.shape)

행렬의 크기 :  (1054983, 5000)


### scikit-learn LDA model활용

In [29]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components = 10, learning_method = 'online', random_state = 777, max_iter=1)
lda_model.fit_transform(tf_idf_matrix)

array([[0.0335099 , 0.0335099 , 0.0335099 , ..., 0.17024867, 0.0335099 ,
        0.0335099 ],
       [0.03365631, 0.03365631, 0.03365631, ..., 0.03365631, 0.03365631,
        0.03365631],
       [0.25184095, 0.0366096 , 0.0366096 , ..., 0.0366096 , 0.0366096 ,
        0.0366096 ],
       ...,
       [0.26687206, 0.02914502, 0.02914502, ..., 0.13007484, 0.02916018,
        0.28739608],
       [0.10378115, 0.02637829, 0.12325014, ..., 0.02637829, 0.02637829,
        0.02637829],
       [0.03376055, 0.03376055, 0.2255442 , ..., 0.03376055, 0.03376055,
        0.03376055]])

In [30]:
print(np.shape(lda_model.components_))

(10, 5000)


In [31]:
# 함수 짜 보기 : def get_topics(components, feature_names, n = 5)

# LDA의 결과 토픽과 각 단어의 비중을 출력하자
def get_topics(components, feature_names, n=5):
  for idx, topic in enumerate(components):
    print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])

In [32]:
get_topics(lda_model.components_, terms)

Topic 1: [('australia', 9359.06334), ('sydney', 5854.97288), ('attack', 4784.76322), ('change', 4193.63035), ('year', 3924.88997)]
Topic 2: [('government', 6344.07413), ('charge', 5947.12292), ('man', 4519.7974), ('state', 3658.16422), ('live', 3625.10473)]
Topic 3: [('australian', 7666.65651), ('say', 7561.01807), ('police', 5513.22932), ('home', 4048.38409), ('report', 3796.04446)]
Topic 4: [('melbourne', 5298.35047), ('south', 4844.59835), ('death', 4281.78433), ('china', 3214.44581), ('women', 3029.28443)]
Topic 5: [('win', 5704.0914), ('canberra', 4322.0963), ('die', 4025.63057), ('open', 3771.65243), ('warn', 3577.47151)]
Topic 6: [('court', 5246.3124), ('world', 4536.86331), ('country', 4166.34794), ('woman', 3983.97748), ('crash', 3793.50267)]
Topic 7: [('election', 5418.5038), ('adelaide', 4864.95604), ('house', 4478.6135), ('school', 3966.82676), ('2016', 3955.11155)]
Topic 8: [('trump', 8189.58575), ('new', 6625.2724), ('north', 3705.40987), ('rural', 3521.42659), ('donald',