# 제 9 강 LLM으로 토픽 모델 설명하기

## 9.1. 패키지 가져오기

In [1]:
import pandas as pd
import numpy as np
import string
import re
# NLTK 금지어 가져오기
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# LDA 분석을 위한 패키지 가져오기
import gensim
import gensim, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# 그래프 생성을 위한 라이브러리
import matplotlib.pyplot as plt


In [2]:
# Extend the stop-word list
def set_stop_words(extended_words):
    """Return NLTK's English stop words followed by the caller's extra words.

    Args:
        extended_words: Iterable of additional words to append after the
            NLTK English list.

    Returns:
        A list holding the NLTK English stop words and then the items of
        ``extended_words`` in their original order.
    """
    return [*stopwords.words('english'), *extended_words]

# Domain-specific stop words
# NOTE(review): 'paypal', 'app' and 'money' look like leftovers from an
# app-review corpus; the data loaded later in this notebook are paper
# abstracts — confirm these are the intended domain words.
extended_words = ['paypal','app', 'money']
stop_words= set_stop_words(extended_words)


In [3]:
# 텍스트 전처리 함수
def preprocess_text(text_data, get_lemma=True, tags=(), stop_words=None):
    """Clean one raw text and return it as a single space-joined string.

    Steps, in order: lowercase; remove URLs; remove HTML/XML tags; remove
    every digit together with the rest of its whitespace-delimited token;
    remove ASCII punctuation; drop stop words; optionally lemmatize;
    optionally keep only words whose POS tag starts with one of ``tags``.

    Args:
        text_data: The raw text (a ``str``).
        get_lemma: When truthy, lemmatize each remaining word with the
            module-level ``lemmatizer``.
        tags: POS-tag prefixes to keep, e.g. ``("NN", "JJ")``. A single
            string or any iterable of strings is accepted. Empty (the
            default) disables POS filtering.
        stop_words: Collection of lowercase words to drop. ``None`` (the
            default) drops nothing.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    # Membership is tested once per word, so use a hashed container instead
    # of scanning a list every time.
    if stop_words is None:
        stop_set = frozenset()
    elif isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = frozenset(stop_words)

    text = text_data.lower()
    # URLs. The dot after "www" is escaped so that ordinary words which
    # merely start with "www" are left alone.
    text = re.sub(r'((www\.\S+)|(https?://\S+))', '', text)
    # HTML/XML tags
    text = re.sub(r'<[^>]+>', '', text)
    # A digit and whatever follows it up to the next whitespace. `\S*`
    # (rather than `\S+`) also removes a lone single digit such as "5".
    text = re.sub(r'[0-9]\S*', '', text)
    # ASCII punctuation, removed in one pass
    text = text.translate(str.maketrans('', '', string.punctuation))

    # The text is already lowercase, so words are compared as they are.
    words = [word for word in text.split() if word not in stop_set]
    if get_lemma:
        words = [lemmatizer.lemmatize(word) for word in words]
    text = " ".join(words)

    # POS filter
    if len(tags) > 0:
        # str.startswith accepts a str or a tuple only; normalise lists etc.
        prefixes = tags if isinstance(tags, str) else tuple(tags)
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        text = " ".join(word for word, pos in tagged if pos.startswith(prefixes))

    return text



In [4]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list
    is yielded per item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
        
# Calling the function only creates a generator object; nothing is
# tokenized until the generator is iterated.
sent_to_words(["sentences"])

<generator object sent_to_words at 0x000001C7A3841A50>

## 9.2. 데이터 읽기 및 표본추출

In [5]:
# Load the papers CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv("papers_1000.csv")

In [6]:
# Print column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      1000 non-null   int64 
 1   pmid            1000 non-null   int64 
 2   doi             977 non-null    object
 3   journal         994 non-null    object
 4   country         992 non-null    object
 5   title           994 non-null    object
 6   authors         988 non-null    object
 7   abstract        994 non-null    object
 8   citation_count  1000 non-null   int64 
 9   published_at    994 non-null    object
dtypes: int64(3), object(7)
memory usage: 78.2+ KB


In [7]:
# Preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,pmid,doi,journal,country,title,authors,abstract,citation_count,published_at
0,155763,34494017,10.21203/rs.3.rs-862572/v1,Research square,United States,Calibration of Two Validated SARS-CoV-2 Pseudo...,"Yunda Huang, Oleg Borisov, Jia Jin Kee, Lindsa...",<Abstract><AbstractText>Vaccine-induced neutra...,0,2021-09-26 00:00:00
1,19979,33772169,10.1038/s41415-021-2860-z,British dental journal,England,Phone call success.,"M Loh, R Smith, M Forde, D Mills","<?xml version=""1.0""?>\n<p/>\n",0,2021-04-26 00:00:00
2,29771,33613398,10.3389/fpsyg.2021.621633,Frontiers in psychology,Switzerland,"Self-Perceived Mental Health Status, Digital A...","Vanja Kopilaš, Anni M Hasratian, Lucia Martine...",<Abstract>\n <AbstractText>The ...,1,2021-02-23 00:00:00
3,65366,33178449,10.1136/bmjsem-2020-000943,BMJ open sport & exercise medicine,England,Could Virtual Reality play a role in the rehab...,"Merlijn Smits, J Bart Staal, Harry van Goor",<Abstract>\n <AbstractText>Post...,3,2020-11-13 00:00:00
4,5998,33865136,10.1016/j.scitotenv.2021.146967,The Science of the total environment,Netherlands,Detection of SARS-CoV-2 RNA in the Danube Rive...,"Stoimir Kolarević, Adrienn Micsinai, Réka Szán...",<Abstract>\n <AbstractText>In S...,4,2021-04-21 00:00:00


In [8]:
# Drop, in place, every row that has a missing value in any column.
# NOTE(review): the preprocessing below reads only the 'abstract' column, so
# this also discards rows that merely lack e.g. a DOI — consider
# dropna(subset=['abstract']) if those rows should be kept.
df.dropna(inplace=True)

## 9.3. 텍스트 전처리

In [9]:
# POS-tag prefixes to keep: adjectives (JJ), nouns (NN), adverbs (RB), verbs (VB)
TAGS = ("JJ", "NN", "RB", "VB")
# Apply the cleaner to the 'abstract' column directly. A row-wise
# DataFrame.apply(axis=1) would build a Series for every row just to read
# one field from it.
df['abstract2'] = df['abstract'].apply(
    preprocess_text,
    get_lemma=True,
    tags=TAGS,
    stop_words=stop_words,
)

In [10]:
# Preview the first rows, now including the cleaned 'abstract2' column
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,doi,journal,country,title,authors,abstract,citation_count,published_at,abstract2
0,155763,34494017,10.21203/rs.3.rs-862572/v1,Research square,United States,Calibration of Two Validated SARS-CoV-2 Pseudo...,"Yunda Huang, Oleg Borisov, Jia Jin Kee, Lindsa...",<Abstract><AbstractText>Vaccine-induced neutra...,0,2021-09-26 00:00:00,vaccineinduced neutralizing antibody nabs key ...
1,19979,33772169,10.1038/s41415-021-2860-z,British dental journal,England,Phone call success.,"M Loh, R Smith, M Forde, D Mills","<?xml version=""1.0""?>\n<p/>\n",0,2021-04-26 00:00:00,
2,29771,33613398,10.3389/fpsyg.2021.621633,Frontiers in psychology,Switzerland,"Self-Perceived Mental Health Status, Digital A...","Vanja Kopilaš, Anni M Hasratian, Lucia Martine...",<Abstract>\n <AbstractText>The ...,1,2021-02-23 00:00:00,novelty coronavirus disease covid pandemic occ...
3,65366,33178449,10.1136/bmjsem-2020-000943,BMJ open sport & exercise medicine,England,Could Virtual Reality play a role in the rehab...,"Merlijn Smits, J Bart Staal, Harry van Goor",<Abstract>\n <AbstractText>Post...,3,2020-11-13 00:00:00,postcovid patient particularly needed high car...
4,5998,33865136,10.1016/j.scitotenv.2021.146967,The Science of the total environment,Netherlands,Detection of SARS-CoV-2 RNA in the Danube Rive...,"Stoimir Kolarević, Adrienn Micsinai, Réka Szán...",<Abstract>\n <AbstractText>In S...,4,2021-04-21 00:00:00,serbia le collected municipal wastewater treat...


In [11]:
# Keep only the rows whose cleaned abstract is not an empty string
df = df.loc[df['abstract2'].ne('')]

In [13]:
# Collect the cleaned abstracts into a list and tokenize each one
dataset = df['abstract2'].tolist()
data_words = [*sent_to_words(dataset)]

In [14]:
# Inspect the first five tokenized documents
data_words[:5]

[['vaccineinduced',
  'neutralizing',
  'antibody',
  'nabs',
  'key',
  'biomarkers',
  'considered',
  'associated',
  'vaccine',
  'efficacy',
  'united',
  'state',
  'phase',
  'efficacy',
  'trial',
  'covid',
  'vaccine',
  'nabs',
  'measured',
  'different',
  'validated',
  'sarscov',
  'neutralization',
  'assay',
  'trial',
  'using',
  'assay',
  'describe',
  'compare',
  'nab',
  'titer',
  'obtained',
  'assay',
  'observe',
  'assay',
  'consistently',
  'yielded',
  'higher',
  'nab',
  'titer',
  'assay',
  'performed',
  'world',
  'health',
  'organizationa',
  'antisarscov',
  'immunoglobulin',
  'international',
  'standard',
  'covid',
  'convalescent',
  'serum',
  'vaccinee',
  'serum',
  'overcome',
  'challenge',
  'difference',
  'readout',
  'pose',
  'data',
  'assay',
  'evaluate',
  'calibration',
  'approach',
  'show',
  'assay',
  'calibrated',
  'common',
  'scale',
  'result',
  'aid',
  'decisionmaking',
  'based',
  'data',
  'assay',
  'evaluati

In [15]:
# Discard documents that ended up with no tokens
data_words = list(filter(None, data_words))

In [16]:
# Inspect the first five tokenized documents after removing empty ones
data_words[:5]

[['vaccineinduced',
  'neutralizing',
  'antibody',
  'nabs',
  'key',
  'biomarkers',
  'considered',
  'associated',
  'vaccine',
  'efficacy',
  'united',
  'state',
  'phase',
  'efficacy',
  'trial',
  'covid',
  'vaccine',
  'nabs',
  'measured',
  'different',
  'validated',
  'sarscov',
  'neutralization',
  'assay',
  'trial',
  'using',
  'assay',
  'describe',
  'compare',
  'nab',
  'titer',
  'obtained',
  'assay',
  'observe',
  'assay',
  'consistently',
  'yielded',
  'higher',
  'nab',
  'titer',
  'assay',
  'performed',
  'world',
  'health',
  'organizationa',
  'antisarscov',
  'immunoglobulin',
  'international',
  'standard',
  'covid',
  'convalescent',
  'serum',
  'vaccinee',
  'serum',
  'overcome',
  'challenge',
  'difference',
  'readout',
  'pose',
  'data',
  'assay',
  'evaluate',
  'calibration',
  'approach',
  'show',
  'assay',
  'calibrated',
  'common',
  'scale',
  'result',
  'aid',
  'decisionmaking',
  'based',
  'data',
  'assay',
  'evaluati

In [17]:
# !python3 -m spacy download en  # run in terminal once
def process_ngram(data_words):
    """Remove stop words and merge frequent collocations into single tokens.

    Bigram and trigram phrase models are trained on ``data_words`` as
    given. Each document is then stop-word filtered and passed through the
    bigram model and the trigram model.

    Args:
        data_words: Tokenized documents — presumably the token lists
            produced by ``sent_to_words``. The argument is iterated several
            times, so it must be re-iterable (a list, not a one-shot
            generator).

    Returns:
        A list with one phrase-merged token sequence per input document.

    Note:
        Stop words are read from the module-level ``stop_words``.
    """
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    # The trigram model is trained on bigram-transformed documents.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Hash the stop words once instead of scanning the list for every token.
    stop_set = set(stop_words)
    # NOTE(review): str(doc) turns each token list into its repr and
    # simple_preprocess re-tokenizes that text. Kept unchanged to preserve
    # the existing output; confirm whether filtering `doc` directly is
    # what was intended.
    data_words = [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in data_words
    ]
    data_words = [bigram_mod[doc] for doc in data_words]
    # bigram_mod is applied again here before trigram_mod, exactly as in
    # the original pipeline.
    data_words = [trigram_mod[bigram_mod[doc]] for doc in data_words]
    return data_words


In [18]:
# Process ngram
documents = process_ngram(data_words)

In [19]:
# Build the gensim Dictionary (token <-> id mapping) from the documents
dictionary = corpora.Dictionary(documents)
# Print the tokens stored under ids 0 through 9
for i in range(10):
    print(dictionary.get(i))

adapted
aid
antisarscov
approach
assay
associated
based
biomarkers
calibrated
calibration


In [20]:
# Number of entries in the dictionary before any filtering
len(dictionary)

10443

In [21]:
def filter_dictionary_by_count(min_count, documents, dictionary):
    """Remove tokens whose total count over ``documents`` is below ``min_count``.

    Counts are total occurrences across all documents, not the number of
    documents a token appears in. The dictionary is modified in place, its
    ids are compacted, its new size is printed, and it is returned.

    Args:
        min_count: Tokens occurring fewer than this many times are removed.
        documents: Iterable of token sequences.
        dictionary: Object exposing ``token2id``, ``filter_tokens`` and
            ``compactify`` (in this notebook, a gensim ``Dictionary``).

    Returns:
        The same ``dictionary`` object, after filtering.
    """
    from collections import Counter
    from itertools import chain

    word_counter = Counter(chain.from_iterable(documents))
    token2id = dictionary.token2id
    # Tokens absent from the dictionary are skipped. After an earlier
    # filtering pass the documents still contain the removed words, and
    # indexing token2id with them would raise KeyError on a re-run.
    removal_word_idxs = {
        token2id[word]
        for word, count in word_counter.items()
        if count < min_count and word in token2id
    }
    dictionary.filter_tokens(removal_word_idxs)
    dictionary.compactify()
    print('dictionary size : %d' % len(dictionary))
    return dictionary


In [22]:
# Shrink the dictionary: drop tokens counted fewer than 10 times in total
dictionary = filter_dictionary_by_count(
    min_count=10, documents=documents, dictionary=dictionary
)

dictionary size : 1570


In [23]:
# Bag-of-words corpus: one doc2bow result per document
corpus = list(map(dictionary.doc2bow, documents))
# Show at most the first 30 entries of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 8), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 3), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


In [None]:
#시험에 나옴
# Coherence가 왜 중요한가?
# Coherence는 토픽의 일관성을 측정하는 방법입니다.
# 이것은 한 토픽을 이루는 상위 단어들이 의미적으로 서로 얼마나 잘 어울리는지를 나타내는 점수입니다.
# 높은 일관성은 토픽의 단어들이 하나의 주제로 잘 묶인다는 것을 의미합니다.
# 일관성이 낮은 토픽은 서로 관련 없는 단어가 섞여 있어 해석하기 어렵습니다.
# Coherence 값의 범위는 측정 방식에 따라 다릅니다. c_v는 대체로 0에서 1 사이, u_mass는 음수 값으로 나타납니다.
# 일반적으로 (c_v 기준) 0.4 이상의 값은 좋은 토픽을 나타냅니다.
# Coherence 값이 높을수록 토픽 내 단어들이 서로 잘 맞는다는 것을 의미합니다.

In [None]:
#시험에 나옴
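# Perplexity는 모델이 데이터를 설명하는 능력을 측정하는 방법입니다.
# Perplexity 값은 낮을수록 더 좋습니다.
# Perplexity는 모델이 데이터를 얼마나 잘 설명하는지를 나타내는 값입니다.
# Perplexity 값이 낮을수록 모델이 데이터를 더 잘 설명한다고 볼 수 있습니다.
# Perplexity 값은 1 이상이며 상한은 없습니다.
# 단, gensim의 log_perplexity()는 perplexity 자체가 아니라 단어당 로그 가능도 하한(bound, 음수)을 반환합니다.
# perplexity = 2^(-bound) 이므로, 아래 출력의 음수 값은 0에 가까울수록(값이 클수록) 더 좋은 모델입니다.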

In [31]:
def compute_perplexity_values(dictionary, corpus, k_values, passes_list):
    """Train one LDA model per (passes, num_topics) setting and score each.

    Args:
        dictionary: Mapping handed to gensim as ``id2word``.
        corpus: Corpus used both to train each model and to score it.
        k_values: Iterable of topic counts to try.
        passes_list: Either a single pass count (the way the notebook calls
            this function) or an iterable of pass counts, as the parameter
            name suggests. With an iterable, every pass count is combined
            with every topic count.

    Returns:
        A list of ``(num_topics, passes, score)`` tuples, ordered by pass
        count first and topic count second.

    Note:
        ``score`` is the raw return value of ``log_perplexity``. The gensim
        documentation describes it as a per-word likelihood bound, with
        perplexity = 2 ** (-bound), so it is negative and closer to zero is
        better. The printed label still says "Perplexity Score".
    """
    from numbers import Integral

    # Accept a bare integer as well as a collection of pass counts.
    if isinstance(passes_list, Integral):
        all_passes = (passes_list,)
    else:
        all_passes = tuple(passes_list)
    # k_values is replayed once per pass count, so freeze it in case the
    # caller handed in a one-shot iterator.
    topic_counts = tuple(k_values)

    result = []
    for passes in all_passes:
        for num_topics in topic_counts:
            lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                                   id2word=dictionary,
                                                   num_topics=num_topics,
                                                   passes=passes,
                                                   random_state=42)
            perplexity_score = lda_model.log_perplexity(corpus)
            result.append((num_topics, passes, perplexity_score))
            print(f"Topics: {num_topics}, Passes:{passes}, Perplexity Score: {perplexity_score:.4f}")
    return result

In [36]:
# Sweep topic counts 2, 4, 6, 8 once for each number of training passes
k_values = range(2, 10, 2)
passes_list = [10, 20, 30]
perplexity_scores = []

for passes in passes_list:
    perplexity_scores += compute_perplexity_values(dictionary, corpus, k_values, passes)

Topics: 2, Passes:10, Perplexity Score: -6.7021
Topics: 4, Passes:10, Perplexity Score: -6.6665
Topics: 6, Passes:10, Perplexity Score: -6.6973
Topics: 8, Passes:10, Perplexity Score: -6.7203
Topics: 2, Passes:20, Perplexity Score: -6.6972
Topics: 4, Passes:20, Perplexity Score: -6.6476
Topics: 6, Passes:20, Perplexity Score: -6.6691
Topics: 8, Passes:20, Perplexity Score: -6.6794
Topics: 2, Passes:30, Perplexity Score: -6.6950
Topics: 4, Passes:30, Perplexity Score: -6.6408
Topics: 6, Passes:30, Perplexity Score: -6.6579
Topics: 8, Passes:30, Perplexity Score: -6.6649
