In [99]:
import pandas as pd
import numpy as np
import string
import re
# NLTK 금지어 가져오기
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# LDA 분석을 위한 패키지 가져오기
import gensim
import gensim, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# 그래프 생성을 위한 라이브러리
import matplotlib.pyplot as plt

In [100]:
# Extend the stop-word list
def set_stop_words(extended_words):
    """Return NLTK's English stop words followed by ``extended_words``.

    Args:
        extended_words: Iterable of additional words to append.

    Returns:
        list: The NLTK English stop words with ``extended_words`` appended,
        in that order.
    """
    return [*stopwords.words('english'), *extended_words]

# Extra words to discard on top of NLTK's English list; `stop_words` is the
# combined list that later cells read as a module-level global.
extended_words = ['abstract','abstracttext']
stop_words= set_stop_words(extended_words)

In [101]:
def preprocess_text(text_data, get_lemma=True, tags=(), stop_words=[]):
    """Clean one raw document and return it as a space-separated token string.

    Steps, in order: lowercase, strip URLs, strip HTML/XML tags, strip
    digit-led character runs, strip ASCII punctuation, drop stop words,
    optionally lemmatize, optionally keep only tokens with selected POS tags.

    Args:
        text_data: Raw document text (``str``).
        get_lemma: If truthy, lemmatize every token with the module-level
            ``lemmatizer``.
        tags: POS-tag prefix(es) to keep, e.g. ``('NN',)``. A single string
            or any iterable of strings is accepted; empty disables the filter.
        stop_words: Words to discard (compared against the lowercased
            tokens). The argument is never mutated.

    Returns:
        str: The surviving tokens joined by single spaces; ``''`` when
        nothing survives.
    """
    # str.startswith only accepts a str or a tuple, so normalise lists/sets.
    allowed_tags = (tags,) if isinstance(tags, str) else tuple(tags)
    # Hash lookup instead of a linear scan of the list for every token.
    blocked = frozenset(stop_words)

    # Lowercase
    text_data = text_data.lower()
    # Remove URLs (the dot after "www" is escaped so that ordinary words that
    # merely start with "www" are not deleted)
    text_data = re.sub(r'((www\.\S+)|(https?://\S+))', r"", text_data)
    # Remove HTML tags; substitute a space so that words on either side of
    # adjacent tags ("</b><i>") do not fuse into one token
    text_data = re.sub(r'<[^>]+>', ' ', text_data)
    # Remove numbers: a digit plus whatever non-space characters follow it.
    # `\S*` (not `\S+`) so that standalone single digits are removed too.
    text_data = re.sub(r'[0-9]\S*', r'', text_data)
    # Remove punctuation
    text_data = text_data.translate(str.maketrans('', '', string.punctuation))
    # Remove stop words (text is already lowercase at this point)
    tokens = [word for word in text_data.split() if word not in blocked]
    # Lemmatize
    if get_lemma:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # POS filter: keep tokens whose tag starts with one of the given prefixes
    if allowed_tags:
        tagged = nltk.pos_tag(nltk.word_tokenize(" ".join(tokens)))
        tokens = [word for word, pos in tagged if pos.startswith(allowed_tags)]
    return " ".join(tokens)

In [102]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: Iterable of documents.

    Yields:
        The ``simple_preprocess`` result for each item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

In [103]:
# !python3 -m spacy download en # run in terminal once
def process_ngram(data_words):
    """Merge frequent bigrams/trigrams in ``data_words`` into single tokens.

    Phrase models are trained on ``data_words`` as given; the documents are
    then filtered against the module-level ``stop_words`` and passed through
    the bigram model followed by the trigram model.

    Args:
        data_words: Re-iterable collection of token lists (it is iterated
            several times, so a one-shot generator will not work).

    Returns:
        list: One phrase-merged token sequence per input document.
    """
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Hash lookup instead of scanning the stop-word list for every token.
    stop_set = set(stop_words)
    # str(doc) serialises the token list and simple_preprocess re-tokenizes
    # it, so each token is normalised by gensim again before filtering.
    filtered_docs = [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in data_words
    ]
    # Apply the bigram model exactly once, then the trigram model on top.
    # (Previously bigram_mod was applied in a separate pass and then again
    # inside this expression, which only cost an extra pass over the corpus.)
    return [trigram_mod[bigram_mod[doc]] for doc in filtered_docs]

In [104]:
def filter_dictionary_by_count(min_count, documents, dictionary):
    """Remove rare tokens from ``dictionary`` in place.

    A token is removed when its total number of occurrences across all of
    ``documents`` is below ``min_count``. Words that occur in ``documents``
    but are not (or no longer) present in ``dictionary`` are ignored, so the
    function can safely be called again on an already-filtered dictionary.

    Args:
        min_count: Minimum total occurrence count a token needs to be kept.
        documents: Iterable of token iterables.
        dictionary: Object exposing ``token2id``, ``filter_tokens``,
            ``compactify`` and ``__len__``.

    Returns:
        The same ``dictionary`` object, after filtering and compaction.
    """
    from collections import Counter

    word_counter = Counter(word for words in documents for word in words)
    token2id = dictionary.token2id
    removal_word_idxs = {
        token2id[word]
        for word, count in word_counter.items()
        # Guard the lookup: a rare word missing from the dictionary used to
        # raise KeyError here.
        if count < min_count and word in token2id
    }
    dictionary.filter_tokens(removal_word_idxs)
    dictionary.compactify()
    print('dictionary size : %d' % len(dictionary))
    return dictionary

In [105]:
# Load the papers CSV into `df`.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere until this points at a local copy of papers_1000.csv.
df = pd.read_csv('C:/Users/Edward/Downloads/papers_1000.csv')

# (rows, columns) of the loaded frame
df.shape

(1000, 10)

In [106]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      1000 non-null   int64 
 1   pmid            1000 non-null   int64 
 2   doi             977 non-null    object
 3   journal         994 non-null    object
 4   country         992 non-null    object
 5   title           994 non-null    object
 6   authors         988 non-null    object
 7   abstract        994 non-null    object
 8   citation_count  1000 non-null   int64 
 9   published_at    994 non-null    object
dtypes: int64(3), object(7)
memory usage: 78.2+ KB


In [107]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,pmid,doi,journal,country,title,authors,abstract,citation_count,published_at
0,155763,34494017,10.21203/rs.3.rs-862572/v1,Research square,United States,Calibration of Two Validated SARS-CoV-2 Pseudo...,"Yunda Huang, Oleg Borisov, Jia Jin Kee, Lindsa...",<Abstract><AbstractText>Vaccine-induced neutra...,0,2021-09-26 00:00:00
1,19979,33772169,10.1038/s41415-021-2860-z,British dental journal,England,Phone call success.,"M Loh, R Smith, M Forde, D Mills","<?xml version=""1.0""?>\n<p/>\n",0,2021-04-26 00:00:00
2,29771,33613398,10.3389/fpsyg.2021.621633,Frontiers in psychology,Switzerland,"Self-Perceived Mental Health Status, Digital A...","Vanja Kopilaš, Anni M Hasratian, Lucia Martine...",<Abstract>\n <AbstractText>The ...,1,2021-02-23 00:00:00
3,65366,33178449,10.1136/bmjsem-2020-000943,BMJ open sport & exercise medicine,England,Could Virtual Reality play a role in the rehab...,"Merlijn Smits, J Bart Staal, Harry van Goor",<Abstract>\n <AbstractText>Post...,3,2020-11-13 00:00:00
4,5998,33865136,10.1016/j.scitotenv.2021.146967,The Science of the total environment,Netherlands,Detection of SARS-CoV-2 RNA in the Danube Rive...,"Stoimir Kolarević, Adrienn Micsinai, Réka Szán...",<Abstract>\n <AbstractText>In S...,4,2021-04-21 00:00:00


In [108]:
# Drop every row that has a missing value in any column, then remove the
# 'Unnamed: 0' column.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# tolerates the column already being gone, so this cell can be re-run without
# raising KeyError.
df = df.dropna().drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,pmid,doi,journal,country,title,authors,abstract,citation_count,published_at
0,34494017,10.21203/rs.3.rs-862572/v1,Research square,United States,Calibration of Two Validated SARS-CoV-2 Pseudo...,"Yunda Huang, Oleg Borisov, Jia Jin Kee, Lindsa...",<Abstract><AbstractText>Vaccine-induced neutra...,0,2021-09-26 00:00:00
1,33772169,10.1038/s41415-021-2860-z,British dental journal,England,Phone call success.,"M Loh, R Smith, M Forde, D Mills","<?xml version=""1.0""?>\n<p/>\n",0,2021-04-26 00:00:00
2,33613398,10.3389/fpsyg.2021.621633,Frontiers in psychology,Switzerland,"Self-Perceived Mental Health Status, Digital A...","Vanja Kopilaš, Anni M Hasratian, Lucia Martine...",<Abstract>\n <AbstractText>The ...,1,2021-02-23 00:00:00
3,33178449,10.1136/bmjsem-2020-000943,BMJ open sport & exercise medicine,England,Could Virtual Reality play a role in the rehab...,"Merlijn Smits, J Bart Staal, Harry van Goor",<Abstract>\n <AbstractText>Post...,3,2020-11-13 00:00:00
4,33865136,10.1016/j.scitotenv.2021.146967,The Science of the total environment,Netherlands,Detection of SARS-CoV-2 RNA in the Danube Rive...,"Stoimir Kolarević, Adrienn Micsinai, Réka Szán...",<Abstract>\n <AbstractText>In S...,4,2021-04-21 00:00:00


In [110]:
# POS-tag prefixes handed to preprocess_text as its `tags` filter.
TAGS = ('NN', 'NNS', 'NNP', 'NNPS')
# Series.apply forwards the keyword arguments to preprocess_text for each abstract.
df['abstract2'] = df['abstract'].apply(
    preprocess_text, get_lemma=True, tags=TAGS, stop_words=stop_words
)
df.head()

Unnamed: 0,pmid,doi,journal,country,title,authors,abstract,citation_count,published_at,abstract2
0,34494017,10.21203/rs.3.rs-862572/v1,Research square,United States,Calibration of Two Validated SARS-CoV-2 Pseudo...,"Yunda Huang, Oleg Borisov, Jia Jin Kee, Lindsa...",<Abstract><AbstractText>Vaccine-induced neutra...,0,2021-09-26 00:00:00,antibody key biomarkers vaccine efficacy state...
1,33772169,10.1038/s41415-021-2860-z,British dental journal,England,Phone call success.,"M Loh, R Smith, M Forde, D Mills","<?xml version=""1.0""?>\n<p/>\n",0,2021-04-26 00:00:00,
2,33613398,10.3389/fpsyg.2021.621633,Frontiers in psychology,Switzerland,"Self-Perceived Mental Health Status, Digital A...","Vanja Kopilaš, Anni M Hasratian, Lucia Martine...",<Abstract>\n <AbstractText>The ...,1,2021-02-23 00:00:00,novelty coronavirus disease society capability...
3,33178449,10.1136/bmjsem-2020-000943,BMJ open sport & exercise medicine,England,Could Virtual Reality play a role in the rehab...,"Merlijn Smits, J Bart Staal, Harry van Goor",<Abstract>\n <AbstractText>Post...,3,2020-11-13 00:00:00,patient care cognitive rehabilitation resource...
4,33865136,10.1016/j.scitotenv.2021.146967,The Science of the total environment,Netherlands,Detection of SARS-CoV-2 RNA in the Danube Rive...,"Stoimir Kolarević, Adrienn Micsinai, Réka Szán...",<Abstract>\n <AbstractText>In S...,4,2021-04-21 00:00:00,serbia le wastewater release environment waste...


In [111]:
# preprocess_text returns '' (never NaN) when no token survives cleaning, so
# dropna alone leaves those rows in place; also drop blank strings so that
# empty documents do not reach the tokenisation step.
df = df.dropna(subset=['abstract2'])
df = df[df['abstract2'].str.strip().ne('')].copy()

In [112]:
# Tokenize every cleaned abstract: `data_words` holds one token list per row.
dfs = df['abstract2'].tolist()
data_words = [tokens for tokens in sent_to_words(dfs)]
# Number of tokens in the first document
len(data_words[0])

45