# 2. Keywords

[수행 과정]
- Abstract에 포함되는 키워드 찾아서 변환하기 (언더바 처리)
- Keyword에 포함되는 것만 남기기

In [1]:
# Library
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from konlpy.tag import Okt

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [2]:
# Load the author-keyword table; later cells read its 'word' and 'count' columns
words_data = pd.read_csv('./Data/Preprocess_Data/Author_Keywords.csv')
# NOTE(review): the abstract table load below is commented out, yet later cells use
# `abstract_data` -- confirm where that frame is created.
# abstract_data = pd.read_csv('./Data/Preprocess_Data/Abstract_preprocessing.csv')


## 1. Abstract에 포함되는 키워드 찾아서 저장하기

In [3]:
# Keep only keywords whose 'count' is at least MIN_KEYWORD_COUNT.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments in the following cells modify this frame itself instead of a
# slice of the loaded one (the situation SettingWithCopyWarning reports).
MIN_KEYWORD_COUNT = 20
words_data = words_data.loc[words_data['count'] >= MIN_KEYWORD_COUNT].copy()
# words_data.to_csv('Data/Preprocess_Data/up20_words.csv', index=False, encoding='utf-8-sig')

In [4]:
# Renumber the rows 0..n-1 and discard the old index labels
words_data = words_data.reset_index(drop=True)

### Abstract에 키워드와 완전히 일치하는 부분이 있으면 공백을 언더바(_)로 바꿔주기
- ex) neural network -> neural_network

In [5]:
# Trim leading and trailing whitespace from every keyword
stripped_words = words_data['word'].str.strip()
words_data['word'] = stripped_words

In [6]:
# Build 'word2': the keyword with every space replaced by an underscore
# (e.g. "neural network" -> "neural_network"); keywords without a space are
# copied unchanged and missing values stay missing.
# One vectorized call replaces the row-by-row loop, which wrote through chained
# indexing (`words_data['word2'][i] = ...`) into an integer placeholder column;
# that form can end up writing to a temporary copy instead of the frame.
# regex=False makes ' ' a literal string rather than a pattern.
words_data['word2'] = words_data['word'].str.replace(' ', '_', regex=False)

In [7]:
# 'length' = number of characters in the string form of each keyword
# (a missing value is measured as the text of str(value), as before).
# Computed in a single pass instead of assigning row by row through chained
# indexing (`words_data['length'][i] = ...`), which can write to a temporary copy.
words_data['length'] = words_data['word'].map(lambda keyword: len(str(keyword)))

In [8]:
# Inspect the result: first rows with the new 'word2' and 'length' columns
words_data.head()

Unnamed: 0,word,count,word2,length
0,machine learning,39529,machine_learning,16
1,deep learning,36992,deep_learning,13
2,neural network,18899,neural_network,14
3,cloud computing,17700,cloud_computing,15
4,classification,17340,classification,14


In [9]:
# Sanity check: list keywords whose underscore form ends with '_'
# (the displayed result below has no rows)
words_data[words_data['word2'].str.endswith('_')]

Unnamed: 0,word,count,word2,length


In [10]:
# Order keywords from longest to shortest (guards against incorrect conversion),
# renumber the rows, then preview the result
words_data = (
    words_data
    .sort_values(by='length', ascending=False)
    .reset_index(drop=True)
)
words_data.head()

Unnamed: 0,word,count,word2,length
0,h.5.1 [information interfaces and presentation...,44,h.5.1_[information_interfaces_and_presentation...,122
1,human centered computing human computer intera...,37,human_centered_computing_human_computer_intera...,89
2,i.3.7 [computer graphics]: three dimensional g...,29,i.3.7_[computer_graphics]:_three_dimensional_g...,81
3,i.3.6 [computer graphics]: methodology and tec...,30,i.3.6_[computer_graphics]:_methodology_and_tec...,75
4,h.5.2 [information interfaces and presentation...,20,h.5.2_[information_interfaces_and_presentation...,75


In [11]:
# When several rows share the same 'word2', keep only the first of them
words_data = words_data.drop_duplicates(subset='word2', keep='first')

In [93]:
# Spot check: keywords whose underscore form contains 'e_research'
words_data[words_data['word2'].str.contains('e_research')]

Unnamed: 0,word,count,word2,length
261,design science research methodology,23,design_science_research_methodology,35
431,undergraduate research experience,22,undergraduate_research_experience,33
2233,qualitative research method,23,qualitative_research_method,27
3759,user experience research,24,user_experience_research,24
5431,design science research,467,design_science_research,23
5705,collaborative research,31,collaborative_research,22
6129,undergraduate research,100,undergraduate_research,22
7546,reproducible research,82,reproducible_research,21
7874,quantitative research,44,quantitative_research,21
8361,qualitative research,344,qualitative_research,20


In [62]:
# Number of keyword rows currently in words_data
len(words_data)

35402

In [129]:
# Debugging run: apply the keyword -> underscore replacement to the first
# DEBUG_ROWS abstracts only.
# NOTE(review): `words_data2` and `abstract_data` are not created by any cell
# visible in this notebook (the `abstract_data` load is commented out in the
# loading cell) -- confirm where they come from before a fresh-kernel run.
# k = 251232
DEBUG_ROWS = 10000

abstract_data['Abstract3'] = ""

# Map each keyword to its underscore form. Pairs whose two forms are equal are
# skipped because replacing a string with itself changes nothing. Dict order
# follows the row order of the frame, so replacement precedence is unchanged.
words_dict = {
    word: word2
    for word, word2 in zip(words_data2['word'], words_data2['word2'])
    if word != word2
}

for k in tqdm(range(DEBUG_ROWS)):
    replaced_abstract = str(abstract_data['Abstract2'][k])

    # NOTE(review): plain substring replacement -- a keyword also matches
    # inside a longer word; confirm this is intended.
    for word, word2 in words_dict.items():
        replaced_abstract = replaced_abstract.replace(word, word2)

    # .at writes into the frame itself; the chained form
    # `abstract_data['Abstract3'][k] = ...` can write to a temporary copy.
    abstract_data.at[k, 'Abstract3'] = replaced_abstract

100%|█████████████████████████████████████| 10000/10000 [03:06<00:00, 53.66it/s]


In [139]:
# Method 1: collect every keyword occurrence (repeats included) for the first
# SAMPLE_ROWS abstracts.
# NOTE(review): `abs_df` is loaded in a cell further down this notebook.
SAMPLE_ROWS = 10000

in_abstracts = []
words_subset = set(words_data['word2'])  # set gives constant-time membership tests

# Compile one alternation pattern up front and reuse it for every abstract.
# Alternatives are ordered longest-first (ties alphabetical): the regex engine
# takes the first alternative that matches, and a set has no stable iteration
# order, so without sorting overlapping keywords could resolve differently
# from run to run. Non-string entries (e.g. missing values) cannot be escaped
# and are left out of the pattern.
ordered_keywords = sorted(
    (keyword for keyword in words_subset if isinstance(keyword, str)),
    key=lambda keyword: (-len(keyword), keyword),
)
pattern = re.compile(r'\b({})\b'.format('|'.join(map(re.escape, ordered_keywords))))

for k in tqdm(range(SAMPLE_ROWS)):
    abstract = abs_df['Abstract3'][k]
    in_words = pattern.findall(abstract)
    in_abstracts.append([word for word in in_words if word in words_subset])

100%|█████████████████████████████████████| 10000/10000 [04:46<00:00, 34.93it/s]


In [133]:
# Method 2 (ignores frequency): for each of the first 10000 abstracts, keep the
# keywords that appear among its whitespace-separated tokens, once each.
# NOTE(review): `abs_df` is loaded in a cell further down this notebook.
in_abstracts2 = []
words_subset = set(words_data['word2'])  # set gives constant-time membership tests

for k in tqdm(range(10000)):
    abstract_tokens = set(abs_df['Abstract3'][k].split())
    words_in_abstract = words_subset.intersection(abstract_tokens)
    # Store into this method's own list; the results were previously appended
    # to `in_abstracts`, leaving `in_abstracts2` empty.
    in_abstracts2.append(list(words_in_abstract))


100%|██████████████████████████████████| 10000/10000 [00:00<00:00, 31486.83it/s]


In [140]:
# Keywords extracted from the first abstract
in_abstracts[0]

['a',
 'method',
 'peer_to_peer_streaming',
 'node',
 'is',
 'a',
 'problem',
 'is',
 'disk',
 'user',
 'content',
 'be',
 'a',
 'malicious_user',
 'content',
 'method',
 'ha',
 'advantage',
 'user',
 's',
 'disk',
 'a',
 'version',
 'content',
 'version',
 'in',
 'disk',
 'can',
 'be',
 'in',
 'it',
 'is',
 'content',
 'distribution',
 'it',
 'ieee']

In [137]:
# Number of keyword hits stored for the third abstract
len(in_abstracts[2])

61

In [141]:
# Replace every occurrence of a 'word' keyword in 'Abstract2' with its 'word2'
# (underscore) form and store the result in a new 'Abstract3' column.
# NOTE(review): `words_data2` and `abstract_data` are not created by any cell
# visible in this notebook -- confirm where they come from.

# Pairs whose two forms are equal are skipped because replacing a string with
# itself changes nothing. Dict order follows the row order of the frame, so
# replacement precedence is unchanged.
words_dict = {
    word: word2
    for word, word2 in zip(words_data2['word'], words_data2['word2'])
    if word != word2
}

replaced_abstracts = []
for abstract in tqdm(abstract_data['Abstract2'], total=len(abstract_data)):
    replaced_abstract = str(abstract)

    # NOTE(review): plain substring replacement -- a keyword also matches
    # inside a longer word; confirm this is intended.
    for word, word2 in words_dict.items():
        replaced_abstract = replaced_abstract.replace(word, word2)

    replaced_abstracts.append(replaced_abstract)

# Assign the whole column at once; the per-row chained form
# `abstract_data['Abstract3'][k] = ...` can write to a temporary copy.
abstract_data['Abstract3'] = replaced_abstracts

100%|██████████████████████████████| 2267163/2267163 [13:21:24<00:00, 47.15it/s]


In [142]:
# Keep only the 'Year' and 'Abstract3' columns and write them (with the index) to CSV
new_abstract = abstract_data.loc[:, ['Year', 'Abstract3']]
new_abstract.to_csv(
    'Data/Preprocess_Data/Abstract3.csv',
    index=True,
    encoding='utf-8-sig',
)

In [14]:
# First abstract after keyword replacement
new_abstract['Abstract3'][0]

"a method for peer_to_peer_streaming of video_on_demand_with residential node is described a possible problem with doing peer_to_peer_video_on_demand_streaming is the necessity of storing on the disk of the residential user the content to be streamed allowing a malicious_user to distribute illegally the content the proposed method ha the advantage of storing on the user's disk only a reduced version of the content although the reduced version stored in disk can still be used in the proposed peer_to_peer_scheme it is not sufficient to recover the original content preventing an unauthorized distribution of it © 2013 ieee"

## --------

In [12]:
# Read back the Abstract3.csv file that an earlier cell writes with to_csv
abs_df = pd.read_csv('./Data/Preprocess_Data/Abstract3.csv')

In [13]:
# Display the loaded frame
abs_df

Unnamed: 0.1,Unnamed: 0,Year,Abstract3
0,0,2013,a method for peer_to_peer_streaming of video_o...
1,1,2013,in this paper we discus the bacterial network_...
2,2,2013,this article treat a digital_humanity work in ...
3,3,2013,this work describes preliminary step towards n...
4,4,2013,goal extraction in learning_by_demonstration i...
...,...,...,...
2267158,2267158,2021,human_centered development of information_syst...
2267159,2267159,2021,the computing device in cloud or fog data_cent...
2267160,2267160,2021,mobile_technology are becoming more and more a...
2267161,2267161,2021,development of intelligent_system with the pur...


In [14]:
# Extract keyword occurrences (repeats included) for one batch of abstracts,
# identified by index labels BATCH_START .. BATCH_STOP - 1.
BATCH_START = 1000000
BATCH_STOP = 1100000

in_abstracts = []
words_subset = set(words_data['word2'])  # set gives constant-time membership tests

# Compile one alternation pattern up front and reuse it for every abstract.
# Alternatives are ordered longest-first (ties alphabetical): the regex engine
# takes the first alternative that matches, and a set has no stable iteration
# order, so without sorting overlapping keywords could resolve differently
# from run to run. Non-string entries (e.g. missing values) cannot be escaped
# and are left out of the pattern.
ordered_keywords = sorted(
    (keyword for keyword in words_subset if isinstance(keyword, str)),
    key=lambda keyword: (-len(keyword), keyword),
)
pattern = re.compile(r'\b({})\b'.format('|'.join(map(re.escape, ordered_keywords))))

for k in tqdm(range(BATCH_START, BATCH_STOP)):
    abstract = abs_df['Abstract3'][k]
    in_words = pattern.findall(abstract)
    in_abstracts.append([word for word in in_words if word in words_subset])

100%|███████████████████████████████████| 100000/100000 [52:14<00:00, 31.91it/s]


In [17]:
# Next batch (index labels 1100000 .. 1299999), reusing the `pattern` and
# `words_subset` built in the previous cell
abstract_column = abs_df['Abstract3']
in_abstracts = [
    [hit for hit in pattern.findall(abstract_column[k]) if hit in words_subset]
    for k in tqdm(range(1100000,1300000))
]

100%|█████████████████████████████████| 200000/200000 [1:37:48<00:00, 34.08it/s]


In [20]:
# Number of abstracts processed in the batch
len(in_abstracts)

200000

In [21]:
# Turn each abstract's keyword list into one string, with keywords separated
# by ', ', and store the strings in a single-column frame named 'list'.
# Joining the lists directly avoids first building a wide frame padded with
# missing values (one column per keyword position) just to drop the padding
# again row by row.
abs_df2 = pd.DataFrame({
    'list': [
        ', '.join(str(word) for word in words if word is not None)
        for words in in_abstracts
    ]
})
abs_df2

Unnamed: 0,list
0,"paper, address, problem, congestion_control, q..."
1,"software_testing, approach, type, code, testin..."
2,"in, online, activity, e, transaction, at, peak..."
3,"researcher, use, test_suite, s, reduction, tec..."
4,"paper, reconfigurable, distributed_arithmetic,..."
...,...
199995,"a, chaotic_system, three_dimensional, mathemat..."
199996,"pest, image_recognition, technology, is, point..."
199997,"logic, block, field_programmable_gate_array, u..."
199998,"decomposition, algorithm, a, moea, d, transfor..."


In [22]:
# Write this batch's joined keyword strings to CSV, without the index column
abs_df2.to_csv('Data/Preprocess_Data/추출본(1100000~1300000).csv', encoding='utf-8-sig', index=False)

In [23]:
# Row count of the joined-keyword frame
len(abs_df2)

200000

In [None]:
df2 = abs_df2.copy()
# Drop the first two comma-separated fields of each entry and trim whitespace
# from the remainder, storing the result back in 'list'.
# The .str accessors yield a missing value for entries with fewer than two
# commas; indexing the split result with [2] raised IndexError on such rows.
# NOTE(review): confirm that a missing value is the desired result for short rows.
df2['list'] = df2['list'].str.split(',', n=2).str[2].str.strip()

In [None]:
# Write the processed frame to CSV, without the index column
df2.to_csv('Data/Preprocess_Data/추출본.csv', encoding='utf-8-sig', index=False)

In [11]:
# Load the saved extraction file into df2 (rebinds any df2 already in the kernel)
df2 = pd.read_csv('./Data/Preprocess_Data/추출본.csv')

In [13]:
# First entry of the 'list' column
df2['list'][0]

"a method for peer_to_peer_streaming of video_on_demand_with residential node is described a possible problem with doing peer_to_peer_video_on_demand_streaming is the necessity of storing on the disk of the residential user the content to be streamed allowing a malicious_user to distribute illegally the content the proposed method ha the advantage of storing on the user's disk only a reduced version of the content although the reduced version stored in disk can still be used in the proposed peer_to_peer_scheme it is not sufficient to recover the original content preventing an unauthorized distribution of it © 2013 ieee"