In [1]:
# Library
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from konlpy.tag import Okt

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

from networkx.algorithms.community import greedy_modularity_communities
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
t = pd.read_csv('./Data/Preprocess_Data/Abstract_preprocessing.csv')
words_data = pd.read_csv('./Data/Preprocess_Data/up20_words_완료.csv')

In [3]:
ak = t[['Author Keywords','Year']]

In [4]:
ak = ak[ak['Author Keywords'].notnull()]

In [7]:
ak = ak[ak['Year']>=2014]

In [16]:
a = ak.groupby('Year')[['Author Keywords']].count()
a.reset_index(drop=False, inplace=True)

In [18]:
a.to_csv('14~23년도total_Author_keywords.csv', index=False, encoding='utf-8-sig')

In [9]:
# Author Keywords 칼럼에서 괄호 안의 내용은 모두 제거하는 함수
def remove_brackets(text):
    # 괄호와 괄호 안의 내용을 제거하는 정규 표현식
    pattern = r'\([^)]*\)'
    result = re.sub(pattern, '', text)

    # 공백이 두 개 이상 연속으로 있는 경우, 이를 하나의 공백으로 바꿈
    result = re.sub(' +', ' ', result)

    return result

In [10]:
ak['Author Keywords'] = ak['Author Keywords'].apply(remove_brackets)

In [12]:
ak.reset_index(drop=True, inplace=True)

In [13]:
# ';'를 기준으로 키워드 분할 후 소문자로 변환하고 '-'를 공백으로 대체하는 함수
def process_keywords(keyword):
    return keyword.lower().replace('-', ' ')

# tqdm을 사용하여 진행 상황 표시
for i in tqdm(range(len(ak))):
    keywords = []
    for word in ak['Author Keywords'][i].split('; '):
        keywords.append(process_keywords(word))
    ak['Author Keywords'][i] = '; '.join(keywords)

100%|███████████████████████████████| 1021356/1021356 [01:59<00:00, 8581.76it/s]


In [15]:
# 새로운 DataFrame 생성
ak_1423 = pd.DataFrame(columns=['Author Keywords', 'Year'])

# ';'를 기준으로 행을 구분하여 데이터프레임 생성
keywords = ak['Author Keywords'].str.split('; ')
years = ak['Year']

# 행 복제를 위한 리스트 생성
keywords_expanded = []
years_expanded = []

# ';'로 분할된 키워드와 연도를 각각 확장하여 리스트에 추가
for i in tqdm(range(len(keywords))):
    keywords_expanded.extend(keywords[i])
    years_expanded.extend([years[i]] * len(keywords[i]))
    
# 확장된 데이터를 새로운 DataFrame에 할당
ak_1423['Author Keywords'] = keywords_expanded
ak_1423['Year'] = years_expanded

100%|█████████████████████████████| 1021356/1021356 [00:05<00:00, 183933.09it/s]


In [20]:
# lemmatizer 실행
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

for i in tqdm(range(len(ak_1423))):
    w = ak_1423['Author Keywords'][i]
    for j in range(len(w.split(' '))):
        a = w.split(' ')
        if j == len(a)-1:
            word = a[j] # 마지막 단어만 추출
            word = lemma.lemmatize(word,'n')# 단수형태로 변환
#             print(word)
            a[-1] = word
            a = " ".join(a)# 다시 합치기
    ak_1423['Author Keywords'][i] = a

100%|███████████████████████████████| 4693922/4693922 [08:12<00:00, 9526.07it/s]


In [21]:
ak_1423['Author Keywords'] = ak_1423['Author Keywords'].str.strip()

ak_1423['keywords'] = ak_1423['Author Keywords'].str.replace(' ', '_')

ak_1423['keywords2'] = ak_1423['keywords']

In [22]:
b = ak_1423.groupby(['keywords','Year'])[['keywords2']].count()
b.reset_index(drop=False, inplace=True)
b.columns = ['keywords','year','count']
b.sort_values(by='count',ascending=False, inplace=True)

In [23]:
pivot_df = b.pivot_table(index='keywords', columns='year', values='count', aggfunc='sum', fill_value=0)
pivot_df.reset_index(drop=False, inplace=True)

In [24]:
pivot_df.sort_values(by=[2023,2022,2021,2020], ascending=False, inplace=True)

In [26]:
words_data = words_data[words_data['제거']!='x']
words_data.reset_index(drop=True, inplace=True)
words_data.drop('제거',axis=1, inplace=True)
len(words_data)

9187

In [27]:
# 양쪽 공백 제거
words_data['word'] = words_data['word'].str.strip()

In [28]:
# words_data에 공백을 언더바(_)로 대체
words_data['word2'] = 0
for i in range(len(words_data)):
    if len(str(words_data['word'][i]).split(' ')) >1:
        words_data['word2'][i] = words_data['word'][i].replace(' ','_')
    else:
        words_data['word2'][i] = words_data['word'][i]

In [29]:
words_data['length'] = 0
for i in range(len(words_data)):
    words_data['length'][i] = len(str(words_data['word'][i]))

In [30]:
# 중복되는 값 중 첫번째만 남기고 제거
words_data.drop_duplicates(subset='word2', keep='first', inplace=True)

In [31]:
# 단어 집합을 집합(Set)으로 변환하여 탐색 시간을 단축
words_subset = list(set(words_data['word2']))

In [35]:
pivot_df[pivot_df['keywords'].str.contains('algorithm')]

year,keywords,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,포함여부
357720,genetic_algorithm,735,832,867,876,989,1031,884,847,761,209,x
40221,and_algorithms,0,0,0,0,0,0,0,0,0,192,x
33892,algorithm,234,235,279,270,329,432,358,415,373,123,x
34529,algorithms:_machine_learning_architecture,0,0,0,0,0,0,0,0,0,94,x
34527,algorithms:_image_recognition_and_understanding,0,0,0,0,0,0,0,0,0,73,x
...,...,...,...,...,...,...,...,...,...,...,...,...
1006328,θ_curvature_algorithm,1,0,0,0,0,0,0,0,0,0,x
1006607,ϵ_greedy_algorithm,0,0,0,0,0,1,0,0,0,0,x
1006613,ϵ_preferred_evolutionary_algorithm,0,0,1,0,0,0,0,0,0,0,x
1006650,–_covering_algorithm,0,0,1,0,0,0,0,0,0,0,x


In [32]:
pivot_df['포함여부'] = 'x'
for i in tqdm(range(len(pivot_df))):
    if pivot_df['keywords'][i] in words_subset:
        pivot_df['포함여부'][i] = 'o'

100%|███████████████████████████████| 1007150/1007150 [01:41<00:00, 9878.94it/s]


In [None]:
pivot_df.to_csv('14~23년단어빈도.csv',encoding='utf-8-sig', index=False)

In [41]:
c = ak[ak['Author Keywords'].str.contains('and algorithm')]
c.reset_index(drop=True, inplace=True)

In [47]:
c

Unnamed: 0,Author Keywords,Year
0,lte baseband algorithms; slv; vrf,2015
1,disorder and forest; distribution of ancestral...,2015
2,batch of queries; gpus; top k query processing...,2020
3,guidelines and algorithm; haptics; human robot...,2017
4,frequent pattern mining; methods and algorithm...,2015
...,...,...
267,machine learning theory; models of learning; t...,2019
268,aircraft design; computer modeling; integro di...,2021
269,competitive learning; ensemble learning; mixtu...,2021
270,classification; fisher’s type criteria; method...,2021


In [48]:
pivot_df

year,keywords,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,포함여부
517060,machine_learning,732,935,1229,1918,3253,5418,5672,7314,7706,2496,o
216030,deep_learning,127,271,640,1454,3319,6478,6266,7996,8236,2400,o
179995,convolutional_neural_network,46,152,445,992,1990,3091,2728,3141,2643,809,o
54844,artificial_intelligence,147,174,202,292,580,1167,1531,2177,2332,800,o
96546,blockchain,0,3,37,174,729,1701,1923,2092,1991,647,o
...,...,...,...,...,...,...,...,...,...,...,...,...
1007145,�heart_rate_variability,0,0,0,0,1,0,0,0,0,0,x
1007146,�iot,0,0,0,0,1,0,0,0,0,0,x
1007147,�premature_beat_classification,0,0,0,0,1,0,0,0,0,0,x
1007148,�time_series_data,0,0,0,0,1,0,0,0,0,0,x


In [50]:
ab = pivot_df[(pivot_df[2022]!=0)&(pivot_df[2023]!=0)]

In [51]:
ab.head()

year,keywords,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,포함여부
517060,machine_learning,732,935,1229,1918,3253,5418,5672,7314,7706,2496,o
216030,deep_learning,127,271,640,1454,3319,6478,6266,7996,8236,2400,o
179995,convolutional_neural_network,46,152,445,992,1990,3091,2728,3141,2643,809,o
54844,artificial_intelligence,147,174,202,292,580,1167,1531,2177,2332,800,o
96546,blockchain,0,3,37,174,729,1701,1923,2092,1991,647,o


In [58]:
ab2 = ab.loc[ab[2019] == 0]
ab2.to_csv('19년도0인버전.csv',encoding='utf-8-sig', index=False)

In [59]:
ab3 = ab.loc[ab.loc[:, 2014:2019].eq(0).all(axis=1)]
ab3.to_csv('19년도까지0인버전.csv',encoding='utf-8-sig', index=False)

In [61]:
filtered_df = pivot_df[
    (pivot_df.loc[:, 2014:2019].sum(axis=1) <= 10) &
    (pivot_df.loc[:, 2020:2023].sum(axis=1) >= 5) &
    (pivot_df.loc[:, 2022:2023].sum(axis=1) >= 3)
]

In [62]:
filtered_df.to_csv('조건에해당하는단어들.csv',encoding='utf-8-sig', index=False)