In [1]:
# Library
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from konlpy.tag import Okt

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from networkx.algorithms.community import greedy_modularity_communities
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [3]:
# Load the two input CSVs: the paper records (`t`, which carries the
# 'Author Keywords' and 'Year' columns used below) and the word list (`words_data`).
t = pd.read_csv('./Data/Preprocess_Data/Total_data.csv')
words_data = pd.read_csv('./Data/Preprocess_Data/up20_words_완료.csv')

In [10]:
# Work on just the author-keyword and year columns of `t`.
ak = t[['Author Keywords','Year']]

In [11]:
# Discard records that have no author keywords.
ak = ak.dropna(subset=['Author Keywords'])

In [12]:
# Renumber the remaining rows 0..n-1.
ak = ak.reset_index(drop=True)

In [13]:
ak

Unnamed: 0,Author Keywords,Year
0,ancient roman law; bibliotheca iuris antiqui; ...,2013
1,Diffusion-based Molecular Communication Channe...,2013
2,Coevolution; Evaluation process; Game's strate...,2013
3,bias; density profile; insect; processing yiel...,2013
4,Applied mathematic; Operations research; Playe...,2013
...,...,...
1574349,Bounded lattice; Closure operator; Constructio...,2021
1574350,Cognitive maps; Decision making; E–learning; I...,2021
1574351,Indoor location systems; Type-2 Intuitionistic...,2021
1574352,Fog computing; Green community; Green data cen...,2021


- Author Keywords 전처리

In [14]:
# Strip every parenthesised segment from an 'Author Keywords' string.
def remove_brackets(text):
    """Return `text` without '(...)' segments and with runs of spaces collapsed.

    Each '(' up to the nearest following ')' is deleted together with the
    parentheses themselves; afterwards every run of consecutive spaces is
    replaced by a single space.
    """
    without_parens = re.sub(r'\([^)]*\)', '', text)
    return re.sub(' +', ' ', without_parens)

In [15]:
# Remove parenthesised text from every keyword string.
ak['Author Keywords'] = ak['Author Keywords'].map(remove_brackets)

In [16]:
# Normalise one keyword string: lower-case it and replace '-' with a space.
def process_keywords(keyword):
    """Return `keyword` lower-cased with every hyphen turned into a space."""
    lowered = keyword.lower()
    return lowered.replace('-', ' ')

# Lower-case every keyword and turn hyphens into spaces, with a tqdm progress bar.
# The keywords of a record are re-joined with '; ' -- the same delimiter they were
# split on -- so that later cells can still split a record into individual keywords;
# joining with a bare space would merge all keywords of a paper into one string.
# The column is rebuilt in one assignment because element-wise chained assignment
# (ak[col][i] = ...) may write to a temporary copy instead of the frame.
ak['Author Keywords'] = [
    '; '.join(process_keywords(word) for word in record.split('; '))
    for record in tqdm(ak['Author Keywords'])
]

100%|██████████████████████████████| 1574354/1574354 [02:34<00:00, 10178.05it/s]


In [22]:
# Keep only the 2022 and 2023 records: the name `ak2223` and the per-year tables
# built from this frame cover exactly those two years, so the lower bound is 2022.
ak2223 = ak[ak['Year'] >= 2022]

In [23]:
# Renumber rows from 0 after the year filter, then preview the first rows.
ak2223 = ak2223.reset_index(drop=True)
ak2223.head()

Unnamed: 0,Author Keywords,Year
0,3d mapping; calibration; lidar; mining; photog...,2022
1,low cost sensors; ostia antica; sculpture twin...,2022
2,3d scanner metrology; 3d vision; iso 10360 13;...,2022
3,3d vision; camera calibration; design of exper...,2022
4,case law; machine learning; natural language p...,2022


In [24]:
# Build a long-format frame with one row per (keyword, year) pair.
# The rows are collected in a plain list and converted to a DataFrame once:
# growing a DataFrame with `append` inside a loop copies the whole frame on every
# iteration, and `DataFrame.append` no longer exists in pandas 2.x.
keyword_year_rows = [
    (keyword, year)
    for record, year in zip(tqdm(ak2223['Author Keywords']), ak2223['Year'])
    for keyword in record.split('; ')  # records are '; '-delimited keyword lists
]
new_df = pd.DataFrame(keyword_year_rows, columns=['Author Keywords', 'Year'])

100%|█████████████████████████████████| 156657/156657 [1:03:47<00:00, 40.93it/s]


In [25]:
new_df

Unnamed: 0,Author Keywords,Year
0,3d mapping,2022
1,calibration,2022
2,lidar,2022
3,mining,2022
4,photogrammetry,2022
...,...,...
723414,movement,2022
723415,reinforcement learning,2022
723416,contrastive learning,2022
723417,dialogue system,2022


In [33]:
# NLTK's WordNet lemmatizer, used below to singularise keywords.
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [36]:
# Singularise each keyword by lemmatising (as a noun) only its last word.
lemma = WordNetLemmatizer()

# The column is rebuilt in one assignment: only the final space-separated word of
# each keyword is passed to the lemmatizer, the leading words are kept verbatim.
# This avoids element-wise chained assignment (new_df[col][i] = ...), which may
# write to a temporary copy, and no longer reuses the name `a` as scratch space.
new_df['Author Keywords'] = [
    ' '.join(words[:-1] + [lemma.lemmatize(words[-1], 'n')])
    for words in (keyword.split(' ') for keyword in tqdm(new_df['Author Keywords']))
]

100%|████████████████████████████████| 723419/723419 [01:01<00:00, 11750.81it/s]


In [87]:
new_df

Unnamed: 0,Author Keywords,Year,keywords
0,3d mapping,2022,3d_mapping
1,calibration,2022,calibration
2,lidar,2022,lidar
3,mining,2022,mining
4,photogrammetry,2022,photogrammetry
...,...,...,...
723414,movement,2022,movement
723415,reinforcement learning,2022,reinforcement_learning
723416,contrastive learning,2022,contrastive_learning
723417,dialogue system,2022,dialogue_system


In [38]:
# Trim leading/trailing whitespace from every keyword.
new_df['Author Keywords'] = new_df['Author Keywords'].str.strip()

In [40]:
# Underscore-joined form of each keyword (spaces -> '_'), stored in a new `keywords` column.
new_df['keywords'] = new_df['Author Keywords'].str.replace(' ', '_')

In [138]:
new_df

Unnamed: 0,Author Keywords,Year,keywords,keywords2
0,3d mapping,2022,3d_mapping,3d_mapping
1,calibration,2022,calibration,calibration
2,lidar,2022,lidar,lidar
3,mining,2022,mining,mining
4,photogrammetry,2022,photogrammetry,photogrammetry
...,...,...,...,...
723414,movement,2022,movement,movement
723415,reinforcement learning,2022,reinforcement_learning,reinforcement_learning
723416,contrastive learning,2022,contrastive_learning,contrastive_learning
723417,dialogue system,2022,dialogue_system,dialogue_system


In [98]:
# Copy of `keywords`; this is the column the groupby(...).count() calls tally.
new_df['keywords2'] = new_df['keywords']

In [99]:
# Goal: find out whether each keyword matches the word file provided by the professor,
# and how many times it occurs.

a = new_df.groupby('keywords')[['keywords2']].count()

In [101]:
# Move the `keywords` group labels out of the index into a regular column.
a.reset_index(drop=False, inplace=True)

In [103]:
# Name the two columns `keywords` and `count`.
a.columns = ['keywords','count']

In [106]:
# Most frequent keywords first.
a.sort_values(by='count',ascending=False, inplace=True)

In [108]:
# Renumber rows 0..n-1 in the sorted order.
a.reset_index(drop=True, inplace=True)

In [109]:
# Full list of author keywords appearing in 2022-2023
a

Unnamed: 0,keywords,count
0,deep_learning,10636
1,machine_learning,10202
2,convolutional_neural_network,3452
3,artificial_intelligence,3132
4,neural_network,2656
...,...,...
229926,group_navigation,1
229927,group_non_interactive_key_exchange,1
229928,group_object_alignment,1
229929,group_of_automorphisms,1


In [111]:
# Flag each keyword 'o' when it is in the vocabulary `words_subset`, otherwise 'x'.
# Done as one vectorised assignment: the previous row loop used chained assignment
# (a[col][i] = ...), which may write to a temporary copy, and re-scanned
# `words_subset` for every row.
# NOTE(review): `words_subset` is assigned in a cell further down this notebook,
# so that cell has to be run before this one.
a['포함여부'] = a['keywords'].isin(words_subset).map({True: 'o', False: 'x'})

100%|█████████████████████████████████| 229931/229931 [00:23<00:00, 9665.80it/s]


In [112]:
# NOTE(review): the target path is an empty string, so this call has no usable file
# name to write to -- supply the intended CSV path (the other to_csv calls in this
# notebook also pass encoding='utf-8-sig' and index=False).
a.to_csv('')

Unnamed: 0,keywords,count,포함여부
0,deep_learning,10636,o
1,machine_learning,10202,o
2,convolutional_neural_network,3452,o
3,artificial_intelligence,3132,o
4,neural_network,2656,x
...,...,...,...
229926,group_navigation,1,x
229927,group_non_interactive_key_exchange,1,x
229928,group_object_alignment,1,x
229929,group_of_automorphisms,1,x


In [114]:
# Row count per (keyword, year) pair, flattened to columns and ordered by count.
b = new_df.groupby(['keywords', 'Year'])[['keywords2']].count().reset_index()
b.columns = ['keywords', 'year', 'count']
b = b.sort_values(by='count', ascending=False)

In [118]:
b

Unnamed: 0,keywords,year,count
56822,deep_learning,2022,8236
135074,machine_learning,2022,7706
46351,convolutional_neural_network,2022,2643
135075,machine_learning,2023,2496
56823,deep_learning,2023,2400
...,...,...,...
97975,green_function,2022,1
97976,green_function,2023,1
97977,green_game,2022,1
97978,green_gas,2022,1


In [119]:
# Wide table: one row per keyword, one column per year, 0 where a keyword has no
# rows for that year.
pivot_df = b.pivot_table(index='keywords', columns='year', values='count', aggfunc='sum', fill_value=0)

In [121]:
# Turn the `keywords` index back into a regular column.
pivot_df.reset_index(drop=False, inplace=True)

In [124]:
pivot_df.columns

Index(['keywords', 2022, 2023], dtype='object', name='year')

In [125]:
# Preview ordered by the 2022 count, then the 2023 count, both descending
# (the sorted result is only displayed, not stored).
pivot_df.sort_values(by=[2022,2023], ascending=False)

year,keywords,2022,2023
50285,deep_learning,8236,2400
119463,machine_learning,7706,2496
41176,convolutional_neural_network,2643,809
12542,artificial_intelligence,2332,800
139209,neural_network,2067,589
...,...,...,...
229898,“xun_translation”_application,0,1
229925,∪,0,1
229928,─swarm_control,0,1
229929,使_shǐ,0,1


In [126]:
# Replace the index with a fresh 0..n-1 range.
pivot_df.reset_index(drop=True, inplace=True)

In [132]:
# Flag each keyword 'o' when it is in the vocabulary `words_subset`, otherwise 'x'.
# Done as one vectorised assignment: the previous row loop used chained assignment
# (pivot_df[col][i] = ...), which may write to a temporary copy, and re-scanned
# `words_subset` for every row.
# NOTE(review): `words_subset` is assigned in a cell further down this notebook,
# so that cell has to be run before this one.
pivot_df['포함여부'] = pivot_df['keywords'].isin(words_subset).map({True: 'o', False: 'x'})

100%|█████████████████████████████████| 229931/229931 [00:23<00:00, 9680.53it/s]


In [134]:
# Sort in place by the 2022 count, then the 2023 count, both descending.
pivot_df.sort_values(by=[2022,2023], ascending=False, inplace=True)

In [135]:
pivot_df

year,keywords,2022,2023,포함여부
50285,deep_learning,8236,2400,o
119463,machine_learning,7706,2496,o
41176,convolutional_neural_network,2643,809,o
12542,artificial_intelligence,2332,800,o
139209,neural_network,2067,589,x
...,...,...,...,...
229898,“xun_translation”_application,0,1,x
229925,∪,0,1,x
229928,─swarm_control,0,1,x
229929,使_shǐ,0,1,x


In [136]:
# Export the 2022/2023 per-keyword table (including the '포함여부' flag) without the index column.
pivot_df.to_csv('2223연도별단어수및포함단어.csv',encoding='utf-8-sig',index=False)

In [137]:
# NOTE(review): `t2` is only assigned in the cell below, so on a top-to-bottom run
# this cell raises NameError (as the recorded output shows).
t2

NameError: name 't2' is not defined

In [7]:
# Records with Year 1994 or later.
t2 = t[t['Year']>=1994]

In [8]:
# Renumber rows 0..n-1 after the filter.
t2.reset_index(drop=True, inplace=True)

- word 데이터

In [42]:
# Keep the vocabulary rows whose '제거' column is not 'x', renumber them,
# and drop the '제거' column once it has served as the filter.
words_data = (
    words_data[words_data['제거'] != 'x']
    .reset_index(drop=True)
    .drop('제거', axis=1)
)
len(words_data)

9187

In [43]:
# Strip whitespace from both ends of each word
words_data['word'] = words_data['word'].str.strip()

In [44]:
# Add `word2`: the vocabulary entry with spaces replaced by underscores, i.e. the
# same form as the underscore-joined `keywords` columns it is compared against.
# Entries without a space come out unchanged, and non-string values (e.g. NaN) are
# passed through as they are.
# Computed in one pass instead of a row loop with chained assignment
# (words_data[col][i] = ...), which may write to a temporary copy.
words_data['word2'] = words_data['word'].map(
    lambda entry: entry.replace(' ', '_') if isinstance(entry, str) else entry
)

In [45]:
# Character length of each entry's string form (non-strings are measured via str()).
# Computed in one pass instead of a row loop with chained assignment
# (words_data[col][i] = ...), which may write to a temporary copy.
words_data['length'] = words_data['word'].map(lambda entry: len(str(entry)))

In [46]:
# For repeated `word2` values keep only the first occurrence.
words_data = words_data.drop_duplicates(subset='word2', keep='first')

In [47]:
len(words_data)

9155

- 포함되는 단어만 찾기

In [50]:
# Keep the vocabulary as a set so that `in` checks are hash lookups rather than
# scans of the whole collection. Converting the set back to a list (as before)
# dropped that benefit. len(), iteration, `in` and Series.isin() all work on a set.
words_subset = set(words_data['word2'])

In [51]:
len(words_subset)

9155

In [56]:
# Occurrence count of every vocabulary entry among the whitespace-separated tokens
# of the 'Author Keywords' column.
word_counts = {}

# Hash-based copy of the vocabulary: membership is tested once per token, so a
# linear scan of `words_subset` inside the double loop would dominate the run time.
vocabulary = set(words_subset)

for keyword in tqdm(new_df['Author Keywords']):
    for word in keyword.split():
        if word in vocabulary:
            word_counts[word] = word_counts.get(word, 0) + 1

# Tabulate the tallies as a two-column frame.
# NOTE(review): the tokens here are single space-free words, while `word2` joins
# multi-word entries with '_', so those entries presumably never match in this
# cell; the next cell counts whole `keywords` values and overwrites `word_counts_df`.
word_counts_df = pd.DataFrame(list(word_counts.items()), columns=['Word', 'Count'])

100%|█████████████████████████████████| 723419/723419 [02:19<00:00, 5196.60it/s]


In [69]:
# Concatenate all underscore-joined keywords into one whitespace-separated string.
keywords_combined = ' '.join(new_df['keywords'])

# One zero-initialised counter per vocabulary entry.
count_dict = dict.fromkeys(words_subset, 0)

# Tally every token of the combined string that is a vocabulary entry.
for token in tqdm(keywords_combined.split()):
    if token in count_dict:
        count_dict[token] = count_dict[token] + 1

# Tabulate the counters; an empty vocabulary yields an empty frame with the same columns.
if not count_dict:
    word_counts_df = pd.DataFrame(columns=['Word', 'Count'])
else:
    word_counts_df = pd.DataFrame(count_dict.items(), columns=['Word', 'Count'])

100%|██████████████████████████████| 723491/723491 [00:00<00:00, 3452504.89it/s]


In [70]:
# Highest counts first.
word_counts_df.sort_values(by='Count',ascending=False, inplace=True)

In [73]:
# Vocabulary entries counted fewer than 20 times (entries with a count of 0 included).
under20 = word_counts_df[word_counts_df['Count']<20]

In [74]:
# Export the under-20 list without the index column.
under20.to_csv('2223년20번미만.csv',encoding='utf-8-sig', index=False)

In [75]:
under20

Unnamed: 0,Word,Count
6593,branch_and_bound,19
1062,benefit,19
7294,auditing,19
2995,gaussian_naive_bayes,19
4262,counterfactuals,19
...,...,...
1906,behaviour_modelling,0
6406,audio_compression,0
1911,asynchronous_distributed_system,0
1913,scene_categorization,0


In [80]:
# Vocabulary entries that occur at least once, renumbered from 0.
# NOTE(review): despite the `up20` in the name, the filter is Count > 0, not Count >= 20.
word_up20 = word_counts_df[word_counts_df['Count']>0]
word_up20.reset_index(drop=True, inplace=True)

In [82]:
# Export the entries that occur at least once, without the index column.
word_up20.to_csv('2223등장단어.csv',encoding='utf-8-sig', index=False)

In [83]:
word_up20

Unnamed: 0,Word,Count
0,deep_learning,10636
1,machine_learning,10202
2,convolutional_neural_network,3452
3,artificial_intelligence,3132
4,blockchain,2638
...,...,...
8242,real_time_pricing,1
8243,collision_free,1
8244,deep_recurrent_neural_network,1
8245,grid_technology,1


In [85]:
'grid_technology' in words_subset

True

In [140]:
ak['Year'].value_counts()

2019    141634
2021    133162
2020    123714
2022    123047
2018    118762
2017     98026
2016     90904
2015     81400
2011     80179
2012     80055
2010     77527
2013     77453
2014     77097
2009     61875
2008     44761
2007     36114
2023     33610
2006     25036
2005     18491
2003     13142
2004     11357
2002      6663
2000      3939
2001      3814
1998      2647
1999      1971
1997      1955
1996      1159
1994       793
1995       787
1993       666
1991       555
1992       444
1990       240
1989       200
1988       198
1987       155
1978       116
1986       105
1976        73
1979        65
1974        58
1975        56
1981        56
1980        50
1982        49
1977        44
1972        34
1983        29
1971        25
1984        23
1985        21
1973        15
1970         2
1954         1
Name: Year, dtype: int64

# 14~23년도 10년치 데이터만 사용하여 각 단어가 연도마다 몇번씩 나타났는지

In [143]:
# Records from 2014 onwards, renumbered from 0; preview the first rows.
ak_new = ak[ak['Year'] >= 2014].reset_index(drop=True)
ak_new.head()

Unnamed: 0,Author Keywords,Year
0,ecg; gain effect; qrs detection,2017
1,frame difference; mixture of gaussians model; ...,2017
2,canny; fusion algorithm; sobel,2017
3,automatic monitoring system; borehole image; g...,2017
4,complex technical route; cross entropy; steelm...,2017


In [144]:
len(ak_new)

1021356

In [146]:
# Create a new, empty DataFrame with the two target columns
ak_1423 = pd.DataFrame(columns=['Author Keywords', 'Year'])

# Split each record on '; ' into a list of keywords; keep the matching years alongside
keywords = ak_new['Author Keywords'].str.split('; ')
years = ak_new['Year']

In [148]:
# Flat output lists: one entry per individual keyword
keywords_expanded = []
years_expanded = []

# Walk the per-record keyword lists together with their years, appending every
# keyword and repeating the record's year once per keyword
for record_keywords, record_year in zip(tqdm(keywords), years):
    keywords_expanded.extend(record_keywords)
    years_expanded.extend([record_year] * len(record_keywords))

100%|█████████████████████████████| 1021356/1021356 [00:05<00:00, 182739.71it/s]


In [149]:
# Assign the expanded lists as the columns of the new DataFrame
ak_1423['Author Keywords'] = keywords_expanded
ak_1423['Year'] = years_expanded

In [150]:
ak_1423

Unnamed: 0,Author Keywords,Year
0,ecg,2017
1,gain effect,2017
2,qrs detection,2017
3,frame difference,2017
4,mixture of gaussians model,2017
...,...,...
4693917,acoustic anomaly detection,2021
4693918,audio feature extraction,2021
4693919,convolutional autoencoder,2021
4693920,convolutional long short term memory autoencoder,2021


In [151]:
# Singularise each keyword by lemmatising (as a noun) only its last word.
lemma = WordNetLemmatizer()

# The column is rebuilt in one assignment: only the final space-separated word of
# each keyword is passed to the lemmatizer, the leading words are kept verbatim.
# This avoids element-wise chained assignment (ak_1423[col][i] = ...), which may
# write to a temporary copy, and no longer reuses the name `a` as scratch space.
ak_1423['Author Keywords'] = [
    ' '.join(words[:-1] + [lemma.lemmatize(words[-1], 'n')])
    for words in (keyword.split(' ') for keyword in tqdm(ak_1423['Author Keywords']))
]

100%|███████████████████████████████| 4693922/4693922 [08:12<00:00, 9530.10it/s]


In [152]:
# Trim leading/trailing whitespace from every keyword.
ak_1423['Author Keywords'] = ak_1423['Author Keywords'].str.strip()

# Underscore-joined form of each keyword (spaces -> '_').
ak_1423['keywords'] = ak_1423['Author Keywords'].str.replace(' ', '_')

# Copy of `keywords`; this is the column the groupby(...).count() call tallies.
ak_1423['keywords2'] = ak_1423['keywords']

In [153]:
# Row count per (keyword, year) pair, flattened to columns and ordered by count.
b = ak_1423.groupby(['keywords', 'Year'])[['keywords2']].count().reset_index()
b.columns = ['keywords', 'year', 'count']
b = b.sort_values(by='count', ascending=False)

In [154]:
b.head()

Unnamed: 0,keywords,year,count
366415,deep_learning,2022,8236
366414,deep_learning,2021,7996
879559,machine_learning,2022,7706
879558,machine_learning,2021,7314
366412,deep_learning,2019,6478


In [155]:
# Wide table: one row per keyword, one column per year, 0 where a keyword has no
# rows for that year.
pivot_df = b.pivot_table(index='keywords', columns='year', values='count', aggfunc='sum', fill_value=0)

# Turn the `keywords` index back into a regular column.
pivot_df.reset_index(drop=False, inplace=True)

In [156]:
pivot_df

year,keywords,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,,3,9,4,6,13,8,8,20,11,10
1,!model_repair,0,0,0,0,1,0,0,0,0,0
2,"""1+x""_certificate_system",0,0,0,0,0,0,0,1,0,0
3,"""1_norm",0,0,0,0,0,0,0,0,1,0
4,"""3d_models""",0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1007145,�heart_rate_variability,0,0,0,0,1,0,0,0,0,0
1007146,�iot,0,0,0,0,1,0,0,0,0,0
1007147,�premature_beat_classification,0,0,0,0,1,0,0,0,0,0
1007148,�time_series_data,0,0,0,0,1,0,0,0,0,0


In [157]:
# Sort in place, descending, by the 2023 count, with 2022, 2021 and 2020 as tie-breakers.
pivot_df.sort_values(by=[2023,2022,2021,2020], ascending=False, inplace=True)

In [158]:
# Flag each keyword 'o' when it is in the vocabulary `words_subset`, otherwise 'x'.
# Done as one vectorised assignment: the previous row loop used chained assignment
# (pivot_df[col][i] = ...), which may write to a temporary copy, re-scanned
# `words_subset` for every row, and looked rows up by label on a frame whose index
# is no longer in 0..n-1 order after the sort.
pivot_df['포함여부'] = pivot_df['keywords'].isin(words_subset).map({True: 'o', False: 'x'})

100%|███████████████████████████████| 1007150/1007150 [01:45<00:00, 9547.26it/s]


In [160]:
# Export the 2014-2023 per-keyword table (including the '포함여부' flag) without the index column.
pivot_df.to_csv('14~23년단어빈도.csv',encoding='utf-8-sig', index=False)

In [164]:
pivot_df

year,keywords,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,포함여부
517060,machine_learning,732,935,1229,1918,3253,5418,5672,7314,7706,2496,o
216030,deep_learning,127,271,640,1454,3319,6478,6266,7996,8236,2400,o
179995,convolutional_neural_network,46,152,445,992,1990,3091,2728,3141,2643,809,o
54844,artificial_intelligence,147,174,202,292,580,1167,1531,2177,2332,800,o
96546,blockchain,0,3,37,174,729,1701,1923,2092,1991,647,o
...,...,...,...,...,...,...,...,...,...,...,...,...
1007145,�heart_rate_variability,0,0,0,0,1,0,0,0,0,0,x
1007146,�iot,0,0,0,0,1,0,0,0,0,0,x
1007147,�premature_beat_classification,0,0,0,0,1,0,0,0,0,0,x
1007148,�time_series_data,0,0,0,0,1,0,0,0,0,0,x


In [168]:
pivot_df[pivot_df['keywords'].str.contains('algorithm')]

year,keywords,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,포함여부
357720,genetic_algorithm,735,832,867,876,989,1031,884,847,761,209,x
40221,and_algorithms,0,0,0,0,0,0,0,0,0,192,x
33892,algorithm,234,235,279,270,329,432,358,415,373,123,x
34529,algorithms:_machine_learning_architecture,0,0,0,0,0,0,0,0,0,94,x
34527,algorithms:_image_recognition_and_understanding,0,0,0,0,0,0,0,0,0,73,x
...,...,...,...,...,...,...,...,...,...,...,...,...
1006328,θ_curvature_algorithm,1,0,0,0,0,0,0,0,0,0,x
1006607,ϵ_greedy_algorithm,0,0,0,0,0,1,0,0,0,0,x
1006613,ϵ_preferred_evolutionary_algorithm,0,0,1,0,0,0,0,0,0,0,x
1006650,–_covering_algorithm,0,0,1,0,0,0,0,0,0,0,x


# 논문 편수 (정확한 데이터)

In [162]:
# Load Abstract_preprocessing.csv into `df` and display it.
df = pd.read_csv('./Data/Preprocess_Data/Abstract_preprocessing.csv')
df

Unnamed: 0,Title,Year,Abstract,Author Keywords,Index Keywords,Abstract2
0,Copyright protection in Peer-to-Peer networks ...,2013,a method for peer to peer streaming of video o...,,Housing; Peer to peer networks; Peer to peer; ...,a method for peer to peer streaming of video o...
1,Multi objective design for bacterial communica...,2013,in this paper we discuss the bacterial network...,,Communication; Data processing; Design; Econom...,in this paper we discus the bacterial network ...
2,Classical antiquity and semantic content manag...,2013,this article treats a digital humanities work ...,ancient roman law; bibliotheca iuris antiqui; ...,Bibliographic retrieval systems; Semantics; Th...,this article treat a digital humanity work in ...
3,Nano-scale reservoir computing,2013,this work describes preliminary steps towards ...,,Cadmium compounds; Computer software; Nanotech...,this work describes preliminary step towards n...
4,Temporal emphasis for goal extraction in task ...,2013,goal extraction in learning by demonstration i...,,Anthropomorphic robots; Extraction; Learning a...,goal extraction in learning by demonstration i...
...,...,...,...,...,...,...
2267158,Business process-based IS development as a nat...,2021,human centered development of information syst...,,Computer architecture; Information systems; In...,human centered development of information syst...
2267159,Green fog: cost efficient real time power mana...,2021,the computing devices in cloud or fog data cen...,Fog computing; Green community; Green data cen...,Complex networks; Cooling systems; Electric po...,the computing device in cloud or fog data cent...
2267160,Ethical use of mobile technology in the academ...,2021,mobile technologies are becoming more and more...,,Philosophical aspects; Telecommunication equip...,mobile technology are becoming more and more a...
2267161,Real time detection of acoustic anomalies in i...,2021,development of intelligent systems with the pu...,acoustic anomaly detection; audio feature extr...,Audio signal processing; Convolution; Intellig...,development of intelligent system with the pur...


In [163]:
# Keep only the author-keyword and year columns of `df` (this rebinds `ak`).
ak = df[['Author Keywords','Year']]

In [None]:
# Drop the rows without author keywords first: the 'Author Keywords' column of `df`
# contains missing values, and remove_brackets() only accepts strings (re.sub
# raises TypeError on NaN).
ak = ak[ak['Author Keywords'].notnull()].reset_index(drop=True)

# Remove parenthesised text from every remaining keyword string.
ak['Author Keywords'] = ak['Author Keywords'].apply(remove_brackets)