In [1]:
# Library
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from konlpy.tag import Okt

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [46]:
# Load the curated vocabulary table and the pre-processed abstracts
# (in that order) from the preprocessing output directory.
words_data, abs_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/Preprocess_Data/up20_words_완료.csv',
        './Data/Preprocess_Data/Abstract3.csv',
    )
)

In [3]:
words_data.head()

Unnamed: 0,word,count,제거
0,a,217,x
1,b,72,x
2,c,202,x
3,d,30,x
4,e,23,x


In [4]:
words_data['제거'].value_counts()

x    25258
Name: 제거, dtype: int64

In [5]:
abs_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Abstract3
0,0,2013,a method for peer_to_peer_streaming of video_o...
1,1,2013,in this paper we discus the bacterial network_...
2,2,2013,this article treat a digital_humanity work in ...
3,3,2013,this work describes preliminary step towards n...
4,4,2013,goal extraction in learning_by_demonstration i...


In [47]:
# Keep only the vocabulary rows that are NOT flagged with 'x' in the '제거' column.
words_data = words_data.loc[words_data['제거'].ne('x')]

In [48]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
words_data = words_data.reset_index(drop=True)

In [49]:
# The flag column has served its purpose once the rows are filtered.
words_data = words_data.drop(columns='제거')

In [50]:
len(words_data)

9187

In [10]:
# Trim leading and trailing whitespace from every vocabulary term.
stripped_terms = words_data['word'].str.strip()
words_data['word'] = stripped_terms

In [11]:
# Build 'word2': the vocabulary term with inner spaces replaced by
# underscores, so that a multi-word term becomes a single token.
#
# This is done as one vectorised string operation instead of a row loop with
# chained indexing (`words_data['word2'][i] = ...`). Chained assignment is
# not guaranteed to write into the frame (it is a no-op under pandas
# copy-on-write), and pre-filling the column with the integer 0 forced a
# dtype change on the first string write.
# Terms without a space are returned unchanged; missing values stay missing.
words_data['word2'] = words_data['word'].str.replace(' ', '_', regex=False)

In [12]:
# Character length of each term, later used as the sort key.
# The value is cast to str first, exactly like the previous `len(str(...))`
# loop, so a missing term is measured as the text 'nan'.
# Vectorised to avoid chained-index assignment (`df[col][i] = ...`), which
# does not reliably write into the frame.
words_data['length'] = words_data['word'].astype(str).str.len()

In [13]:
# 결과 확인
words_data.head()

Unnamed: 0,word,count,word2,length
0,3dgis,21,3dgis,5
1,c4isr,21,c4isr,5
2,bacnet,21,bacnet,6
3,hadith,21,hadith,6
4,cog,22,cog,3


In [14]:
# Order the vocabulary from longest to shortest term (the original note says
# this is to keep the later conversion from going wrong), then renumber rows.
words_data = (
    words_data
    .sort_values(by='length', ascending=False)
    .reset_index(drop=True)
)

In [15]:
# For terms that collapse to the same 'word2', keep the first occurrence only.
words_data = words_data.drop_duplicates(subset='word2', keep='first')

In [16]:
# Work on an independent copy so that abs_df itself is left untouched.
t = abs_df.copy(deep=True)

In [18]:
t.head()

Unnamed: 0.1,Unnamed: 0,Year,Abstract3
0,0,2013,a method for peer_to_peer_streaming of video_o...
1,1,2013,in this paper we discus the bacterial network_...
2,2,2013,this article treat a digital_humanity work in ...
3,3,2013,this work describes preliminary step towards n...
4,4,2013,goal extraction in learning_by_demonstration i...


In [19]:
# Last two characters of the publication year (e.g. 2013 -> '13').
year_text = t['Year'].astype(str)
t['year_suffix'] = year_text.str.slice(start=-2)
# Split every abstract on whitespace into a list of tokens.
word_lists = t['Abstract3'].str.split()

In [20]:
word_lists

0          [a, method, for, peer_to_peer_streaming, of, v...
1          [in, this, paper, we, discus, the, bacterial, ...
2          [this, article, treat, a, digital_humanity, wo...
3          [this, work, describes, preliminary, step, tow...
4          [goal, extraction, in, learning_by_demonstrati...
                                 ...                        
2267158    [human_centered, development, of, information_...
2267159    [the, computing, device, in, cloud, or, fog, d...
2267160    [mobile_technology, are, becoming, more, and, ...
2267161    [development, of, intelligent_system, with, th...
2267162    [in, this, paper, we, gauge, the, utility, of,...
Name: Abstract3, Length: 2267163, dtype: object

In [35]:
# First 100 abstracts, used as a small sample for trying out the tagging step.
# An explicit copy is taken because a new column is assigned to t2 later on;
# assigning into a bare slice of `t` triggers pandas' SettingWithCopy
# behaviour and leaves it ambiguous which frame is being modified.
t2 = t.iloc[:100].copy()

In [22]:
# Unique vocabulary terms as a set (fast membership tests).
words_subset = set(words_data['word2'])

# Compile one alternation over all (escaped) terms up front so the loop
# below can reuse it instead of rebuilding it per token.
escaped_terms = [re.escape(term) for term in words_subset]
pattern = re.compile(r'\b({})\b'.format('|'.join(escaped_terms)))

In [23]:
len(words_subset)

9155

In [40]:
# Prefix every token that is a vocabulary term with the two-digit year of its
# abstract, e.g. 'machine_learning' in a 2013 abstract -> '13_machine_learning'.
#
# Membership is tested against the `words_subset` set rather than with
# `pattern.match`: `match` only anchors the start of the token, so a token
# such as '40%' was tagged whenever its leading part ('40') was a vocabulary
# term. The set lookup is an exact whole-token test and is O(1) per token.
new_word_lists = []  # one list of (possibly prefixed) tokens per abstract

# zip() stops at the shorter input, so only the rows present in t2 are processed.
for words, suffix in tqdm(zip(word_lists, t2['year_suffix']), total=len(t2)):
    # A missing abstract yields a non-list value from .str.split(); treat it as empty.
    tokens = words if isinstance(words, list) else []
    prefix = str(suffix) + '_'
    new_word_lists.append(
        [prefix + word if word in words_subset else word for word in tokens]
    )

100%|████████████████████████████████████████| 100/100 [00:00<00:00, 212.63it/s]


In [53]:
# Turn each token list back into a single space-separated string.
new_word_strings = list(map(' '.join, new_word_lists))

In [56]:
new_word_strings[43]

'given a connected undirected_graph whose edge are labelled the minimum labelling spanning_tree problem seek a spanning_tree whose edge have the smallest number of distinct label in recent work the mlst problem ha been shown to be np_hard and some effective heuristic have been proposed and analysed in this paper we present preliminary result of a currently on going project regarding the implementation of an intelligent_optimization_algorithm to solve the mlst problem this algorithm is obtained by the basic variable_neighbourhood_search_heuristic with the integration of other 13_complement from 13_machine_learning statistic and experimental_algorithmics in order to produce high quality performance and to completely automate the resulting optimization_strategy © 2013 springer verlag'

In [58]:
# Store the year-tagged abstracts next to the originals.
t2 = t2.assign(reAbs=new_word_strings)

In [59]:
t2

Unnamed: 0.1,Unnamed: 0,Year,Abstract3,year_suffix,reAbs
0,0,2013,a method for peer_to_peer_streaming of video_o...,13,a method for peer_to_peer_streaming of video_o...
1,1,2013,in this paper we discus the bacterial network_...,13,in this paper we discus the bacterial network_...
2,2,2013,this article treat a digital_humanity work in ...,13,this article treat a digital_humanity work in ...
3,3,2013,this work describes preliminary step towards n...,13,this work describes preliminary step towards n...
4,4,2013,goal extraction in learning_by_demonstration i...,13,goal extraction in learning_by_demonstration i...
...,...,...,...,...,...
95,95,2013,identification of cancer associated protein is...,13,identification of 13_cancer associated protein...
96,96,2013,a new on_chip ca5nb2tio12 dielectric_resonator...,13,a new on_chip ca5nb2tio12 13_dielectric_resona...
97,97,2013,building account for roughly 40% of all u ener...,13,13_building account for roughly 40% of all u e...
98,98,2013,this paper present a simple interval type 2 fu...,13,this paper present a simple interval type 2 13...


In [16]:
# Remove possessive "'s" and every character that is not an ASCII letter,
# digit, underscore or whitespace.
#
# The possessive alternative must come FIRST in the alternation: regex
# alternation is tried left to right, and with the character class first the
# apostrophe was consumed on its own, leaving the trailing "s" behind
# ("user's" -> "users" instead of "user"). The typographic apostrophe (’)
# is handled as well.
# NOTE(review): astype(str) turns missing abstracts into the literal text
# 'nan' - confirm whether such rows should be dropped or blanked upstream.
abs_df['Abstract3'] = abs_df['Abstract3'].astype(str)
abs_df['Abstract3'] = abs_df['Abstract3'].apply(lambda x: re.sub(r"['’]s\b|[^a-zA-Z0-9_\s]", '', x))

In [17]:
# Collapse every run of two or more whitespace characters into one space.
multi_space = re.compile(r'\s{2,}')
abs_df['Abstract3'] = abs_df['Abstract3'].map(lambda text: multi_space.sub(' ', text))

In [23]:
import logging
from tqdm import tqdm
tqdm.pandas()  # registers Series.progress_apply

abs_df['Abstract3'] = abs_df['Abstract3'].astype(str)

# English stop words (NLTK corpus) as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# Number of abstracts tokenised per chunk; each chunk shows its own progress bar.
chunk_size = 300000


def _tokens_without_stop_words(text):
    """Tokenise `text` with NLTK and return the tokens that are not stop words."""
    return [word for word in word_tokenize(text) if word not in stop_words]


# 'tokens' column: tokenised abstract with stop words removed.
# Chunks are taken by position (iloc) and the results are collected in row
# order, then attached in one assignment. The previous version sliced with
# iloc but wrote back with the label-based, end-inclusive .loc[start:end],
# and advanced its manual progress bar by a full chunk even on the shorter
# final chunk.
total_rows = len(abs_df)
token_rows = []
for start in range(0, total_rows, chunk_size):
    chunk_data = abs_df['Abstract3'].iloc[start:start + chunk_size]
    token_rows.extend(chunk_data.progress_apply(_tokens_without_stop_words).tolist())

# Reusing abs_df's own index keeps the assignment purely positional.
abs_df['tokens'] = pd.Series(token_rows, index=abs_df.index, dtype=object)


  0%|                                               | 0/1000000 [01:34<?, ?it/s][A
100%|█████████████████████████████████| 300000/300000 [01:53<00:00, 2640.44it/s]

 10%|███▍                              | 30637/300000 [00:11<01:47, 2507.62it/s][A
100%|█████████████████████████████████| 300000/300000 [01:52<00:00, 2670.62it/s][A

 16%|█████▍                            | 47909/300000 [00:19<01:52, 2239.83it/s][A
100%|█████████████████████████████████| 300000/300000 [02:04<00:00, 2413.04it/s][A

 13%|████▎                             | 38267/300000 [00:15<01:35, 2727.69it/s][A
100%|█████████████████████████████████| 300000/300000 [02:13<00:00, 2248.63it/s][A

 13%|████▎                             | 38061/300000 [00:14<01:55, 2276.83it/s][A
100%|█████████████████████████████████| 300000/300000 [01:55<00:00, 2592.52it/s][A

  3%|█                                  | 8621/300000 [00:28<01:50, 2629.42it/s][A
100%|█████████████████████████████████| 300000/300000 [02:16<00:00, 2190.

In [15]:
# Persist the frame (without the index) as UTF-8 with BOM.
abs_df.to_csv(path_or_buf='원본불용어제거.csv', index=False, encoding='utf-8-sig')

In [25]:
abs_df['tokens'][0]

['method',
 'peer_to_peer_streaming',
 'video_on_demand',
 'residential',
 'node',
 'described',
 'possible',
 'problem',
 'peer_to_peer',
 'video_on_demand',
 'streaming',
 'necessity',
 'storing',
 'disk',
 'residential',
 'user',
 'content',
 'streamed',
 'allowing',
 'malicious_user',
 'distribute',
 'illegally',
 'content',
 'proposed',
 'method',
 'ha',
 'advantage',
 'storing',
 'users',
 'disk',
 'reduced',
 'version',
 'content',
 'although',
 'reduced',
 'version',
 'stored',
 'disk',
 'still',
 'used',
 'proposed',
 'peer_to_peer',
 'scheme',
 'sufficient',
 'recover',
 'original',
 'content',
 'preventing',
 'unauthorized',
 'distribution',
 '2013',
 'ieee']

In [26]:
abs_df['Abstract3'][0]

'a method for peer_to_peer_streaming of video_on_demand with residential node is described a possible problem with doing peer_to_peer video_on_demand streaming is the necessity of storing on the disk of the residential user the content to be streamed allowing a malicious_user to distribute illegally the content the proposed method ha the advantage of storing on the users disk only a reduced version of the content although the reduced version stored in disk can still be used in the proposed peer_to_peer scheme it is not sufficient to recover the original content preventing an unauthorized distribution of it 2013 ieee'

In [27]:
# Convert each token list into one space-separated string.
# Only list/tuple values are joined: if this cell is re-run (or the column
# was reloaded from CSV and already holds strings), joining a str would put
# a space between every character ('abc' -> 'a b c'). Such values are left
# unchanged, which makes the cell safe to re-run.
abs_df['tokens'] = abs_df['tokens'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, (list, tuple)) else tokens
)

In [19]:
# Reload the saved stop-word-free abstracts and preview the first rows.
abs_df = pd.read_csv(filepath_or_buffer='원본불용어제거.csv')
abs_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year,Abstract3,tokens
0,0,2013,a method for peer_to_peer_streaming of video_o...,method peer_to_peer_streaming video_on_demand ...
1,1,2013,in this paper we discus the bacterial network_...,paper discus bacterial network_communication_d...
2,2,2013,this article treat a digital_humanity work in ...,article treat digital_humanity work classical ...
3,3,2013,this work describes preliminary step towards n...,work describes preliminary step towards nano s...
4,4,2013,goal extraction in learning_by_demonstration i...,goal extraction learning_by_demonstration comp...


2. 'modelling'을 'modeling'으로 수정
3. 'ada boost'를 'adaboost'로 수정
4. 'zig bee'를 'zigbee'로 수정
5. 'alzheimer’s disease'를 'alzheimer disease'로 수정
6. 'optimisation'을 'optimization'으로 수정
7. 'neural networks'를 'neural network'로 수정
8. 'audiovisual'을 'audio visual'로 수정
9. 'authorisation'을 'authorization'으로 수정
10. 'chatbots'를 'chatbot'으로 수정
11. 'conceptualisation'을 'conceptualization'으로 수정

In [13]:
# Spelling / variant normalisation table: each key is replaced by its value.
replacement_dict = dict([
    ('ada_boost', 'adaboost'),
    ('zig_bee', 'zigbee'),
    ('optimisation', 'optimization'),
    ('neural_networks', 'neural_network'),
    ('audiovisual', 'audio_visual'),
    ('authorisation', 'authorization'),
    ('chatbots', 'chatbot'),
    ('conceptualisation', 'conceptualization'),
])

def replace_words(text):
    """Return `text` with every key of `replacement_dict` replaced by its value.

    Replacements are plain substring substitutions, applied one after the
    other in the dictionary's insertion order.
    """
    normalised = text
    for variant_and_target in replacement_dict.items():
        normalised = normalised.replace(*variant_and_target)
    return normalised

# Hook tqdm into pandas so that Series.progress_apply is available.
tqdm.pandas()

# Normalise spelling variants in every abstract, with a progress bar.
normalised_abstracts = abs_df['Abstract3'].progress_apply(replace_words)
abs_df['Abstract3'] = normalised_abstracts

100%|█████████████████████████████| 2267163/2267163 [00:13<00:00, 169149.55it/s]


In [7]:
# Case-insensitive replacement of 'ada_boost' by 'adaboost' in every abstract.
abs_df['Abstract3'] = abs_df['Abstract3'].str.replace(pat='ada_boost', repl='adaboost', case=False)

Unnamed: 0.1,Unnamed: 0,Year,Abstract3,tokens
29057,29057,2011,prediction of nuclear protein is one of the ma...,prediction nuclear protein one major challenge...
71440,71440,2020,a module for assessing the investment risk of ...,module assessing investment risk virtual compa...
84086,84086,2008,promoter recognition ha been attempted using d...,promoter recognition ha attempted using differ...
106850,106850,2008,in this paper chromatic information is integra...,paper chromatic information integrated ada_boo...
115092,115092,2015,in this paper a moving vehicle_detection_algor...,paper moving vehicle_detection_algorithm based...
...,...,...,...,...
2226517,2226517,2019,people are careful when they are trying to buy...,people careful trying buy new house budget mar...
2257305,2257305,2021,in recent year due to the exponential increase...,recent year due exponential increase usage mul...
2259924,2259924,2021,in this paper we describe our submission for t...,paper describe submission hasoc 2021 contest t...
2262452,2262452,2021,computer_aided_diagnosis_system have become a ...,computer_aided_diagnosis_system become signifi...


In [14]:
abs_df[abs_df['Abstract3'].str.contains('ada_boost', case=False)]

Unnamed: 0.1,Unnamed: 0,Year,Abstract3,tokens


In [None]:
# 단어 대체

In [20]:
abs_df[abs_df['Year']>=1994]

Unnamed: 0.1,Unnamed: 0,Year,Abstract3,tokens
0,0,2013,a method for peer_to_peer_streaming of video_o...,method peer_to_peer_streaming video_on_demand ...
1,1,2013,in this paper we discus the bacterial network_...,paper discus bacterial network_communication_d...
2,2,2013,this article treat a digital_humanity work in ...,article treat digital_humanity work classical ...
3,3,2013,this work describes preliminary step towards n...,work describes preliminary step towards nano s...
4,4,2013,goal extraction in learning_by_demonstration i...,goal extraction learning_by_demonstration comp...
...,...,...,...,...
2267158,2267158,2021,human_centered development of information_syst...,human_centered development information_system ...
2267159,2267159,2021,the computing device in cloud or fog data_cent...,computing device cloud fog data_center remain ...
2267160,2267160,2021,mobile_technology are becoming more and more a...,mobile_technology becoming accepted used pedag...
2267161,2267161,2021,development of intelligent_system with the pur...,development intelligent_system pursuit detecti...


--- 7/10 여기까지 진행했음

In [16]:
# Start again from the saved stop-word-free abstracts; show the first rows.
abs_df = pd.read_csv(filepath_or_buffer='원본불용어제거.csv')
abs_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year,Abstract3,tokens
0,0,2013,a method for peer_to_peer_streaming of video_o...,method peer_to_peer_streaming video_on_demand ...
1,1,2013,in this paper we discus the bacterial network_...,paper discus bacterial network_communication_d...
2,2,2013,this article treat a digital_humanity work in ...,article treat digital_humanity work classical ...
3,3,2013,this work describes preliminary step towards n...,work describes preliminary step towards nano s...
4,4,2013,goal extraction in learning_by_demonstration i...,goal extraction learning_by_demonstration comp...


In [None]:
# Independent working copy; abs_df itself is not modified below.
t = abs_df.copy(deep=True)

In [None]:
# Last two characters of the publication year (e.g. 2013 -> '13').
# The column is named 'Year' (capital Y) in this frame; the lower-case
# 'year' used before raised a KeyError.
t['year_suffix'] = t['Year'].astype(str).str[-2:]
# Split the stop-word-free text of each abstract on whitespace into tokens.
word_lists = t['tokens'].str.split()

In [None]:
# First 1000 abstracts as a trial sample.
t2 = t.head(1000)

In [None]:
# Unique vocabulary terms as a set (fast membership tests).
words_subset = {*words_data['word2']}

# Pre-compile a single alternation over all escaped terms for reuse in the loop.
alternation = '|'.join(re.escape(term) for term in words_subset)
pattern = re.compile(r'\b({})\b'.format(alternation))

In [None]:
# Prefix every token that is a vocabulary term with the two-digit year of
# its abstract (e.g. 'cancer' in a 2013 abstract -> '13_cancer').
new_word_lists = []  # one list of (possibly prefixed) tokens per abstract

# zip() stops at the shorter input, i.e. at the rows of t2, so the progress
# bar total must be len(t2); len(t) made the bar stop far short of 100%.
for words, suffix in tqdm(zip(word_lists, t2['year_suffix']), total=len(t2)):
    # A missing value yields a non-list from .str.split(); treat it as empty.
    tokens = words if isinstance(words, list) else []
    prefix = str(suffix) + '_'
    # Exact whole-token lookup in the set: `pattern.match` only anchored the
    # start of the token and therefore also tagged tokens whose leading part
    # happened to be a vocabulary term.
    new_word_lists.append(
        [prefix + word if word in words_subset else word for word in tokens]
    )

In [None]:
import logging
tqdm.pandas()  # registers Series.progress_apply

abs_df['Abstract3'] = abs_df['Abstract3'].astype(str)

# English stop words (NLTK corpus) as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# 'tokens' column: tokenised abstract with stop words removed.
# Progress is reported once per abstract through progress_apply. Wrapping
# the token list in tqdm(...) inside the lambda created a separate progress
# bar for every single abstract, flooding the output and slowing the run.
abs_df['tokens'] = abs_df['Abstract3'].progress_apply(
    lambda x: [word for word in word_tokenize(x) if word not in stop_words]
)

In [13]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r