# キーワード抽出

In [1]:
# import libraries

import numpy as np
import pandas as pd
import gc
import os
import json
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import plotly.express as px
import re
import datetime as dt
import time
import japanize_matplotlib
year_pattern = r'([1-2][0-9]{3})'
import difflib

In [2]:
DATAPATH = "../../../data/"
START_YEAR = 2000
END_YEAR = 2015
YEAR_STD = END_YEAR - START_YEAR

In [3]:
def get_metadata(filename):
    with open(DATAPATH + filename, encoding="utf-8", mode='r') as f:
        for line in f:
            yield line

## キーワードの抽出

In [4]:
%%time
# 2min30s needed
keywords = {}
count, count2, count3 = 0, 0, 0

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        k = data.get('keywords')
        y = data.get('year')
        if len(k) == 0:
            pass
        else:
            if y >= START_YEAR and  END_YEAR >= y :
                for i in k:
                    i = i.lower().strip()
                    if i == "":
                        pass
                    elif i not in keywords:
                        keywords[i] = 1
                    else:
                        keywords[i] = keywords[i] + 1
                count2 += 1
            else:
                count3 += 1
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
print("キーワードの数:{}, キーワードのない論文数:{},\n 期間が妥当である論文数{}, 期間外の論文数{}".format(len(keywords), count, count2, count3))

0it [00:00, ?it/s]

キーワードの数:6272597, キーワードのない論文数:1171591,
 期間が妥当である論文数2324357, 期間外の論文数1089509
CPU times: user 2min 2s, sys: 13.7 s, total: 2min 16s
Wall time: 2min 5s


In [5]:
%%time
important_keywords = []
M = 1000
for keyword in tqdm(keywords):
    # keyword の出現回数が、Mを超えるとき
    appear_count = keywords[keyword]
    if appear_count >= M:
        important_keywords.append(keyword)
print("出現数が" + str(M) + "以上のキーワード数:{}".format(len(important_keywords)))

  0%|          | 0/6272597 [00:00<?, ?it/s]

出現数が1000以上のキーワード数:2533
CPU times: user 3.44 s, sys: 88 ms, total: 3.53 s
Wall time: 3.49 s


In [6]:
important_keywords.sort()

In [7]:
# important_keywords = np.array(important_keywords)
# np.save(DATAPATH + "DBLP/keywords.npy", important_keywords)

In [8]:
with open(DATAPATH + "DBLP/keywords.txt", mode="a") as f:
    for i in important_keywords:
        f.write(i + "\n")

In [9]:
N = len(important_keywords)
N

2533

In [10]:
%%time
# 平均0.260 の類似度
# 8minほどかかる
similar_pair = []
start_time = time.time()
for i in tqdm(range(N)):
    if i % 1000 == 0:
        end_time = time.time()
        print("達成率: {}, 経過時間: {}".format(str(i // 100) + "%", end_time-start_time))
    for j in range(i+1,N):
        a = important_keywords[i]
        b = important_keywords[j]
        r = difflib.SequenceMatcher(isjunk=None, a=a, b=b, autojunk=True).ratio()
        if r > 0.9:
            similar_pair.append([keywords[a],keywords[b],a,b,round(r,3)])
len(similar_pair)

  0%|          | 0/2533 [00:00<?, ?it/s]

達成率: 0%, 経過時間: 0.029930830001831055
達成率: 10%, 経過時間: 55.98400354385376
達成率: 20%, 経過時間: 84.11372375488281
CPU times: user 1min 28s, sys: 588 ms, total: 1min 28s
Wall time: 1min 28s


253

In [11]:
for num, sim_pair in enumerate(similar_pair):
    if sim_pair[0] >= sim_pair[1]:
        continue
    else:
        similar_pair[num][0], similar_pair[num][1] = similar_pair[num][1], similar_pair[num][0]
        similar_pair[num][2], similar_pair[num][3] = similar_pair[num][3], similar_pair[num][2]
        

In [12]:
similar_pair = sorted(similar_pair, key=lambda x:(x[2], x[3]))

In [13]:
similar_pair = np.array(similar_pair)
for num, sim_pair in enumerate(similar_pair):
    if sim_pair[2] in similar_pair[:,3]:
        print(num, np.where(similar_pair[:,3]==sim_pair[2]), sim_pair)

0 (array([144]),) ['2831' '1046' '3g mobile communication' 'mobile communications' '0.909']
22 (array([21]),) ['1515' '1417' 'case-based reasoning' 'case based reasoning' '0.95']
92 (array([91]),) ['18051' '1471' 'genetic algorithms' 'generic algorithm' '0.914']
96 (array([98]),) ['2218' '1436' 'graphic user interface' 'graphical user interface' '0.957']
152 (array([156]),) ['6596' '3035' 'multi agent system' 'multi-agent system' '0.944']
153 (array([156]),) ['6596' '5862' 'multi agent system' 'multi-agent systems' '0.919']
154 (array([156]),) ['6596' '1283' 'multi agent system' 'multiagent system' '0.971']
155 (array([156]),) ['6596' '2976' 'multi agent system' 'multiagent systems' '0.944']
161 (array([170]),) ['1924' '1535' 'multi objective optimization'
 'multi-objective optimization' '0.964']
163 (array([152, 157, 165]),) ['3035' '1283' 'multi-agent system' 'multiagent system' '0.971']
164 (array([152, 157, 165]),) ['3035' '2976' 'multi-agent system' 'multiagent systems' '0.944']
1

In [14]:
with open(DATAPATH + "DBLP/sim_keywords.txt", mode="a") as f:
    for i in similar_pair:
        f.write(i[2] + "," + i[3] + "\n")

In [16]:
# TODO 現在は手動でやっているが、類義語同士の重複などを消去する必要がある
あ

In [17]:
with open(DATAPATH + "DBLP/sim_keywords.txt") as f:
    sim_keywords = list(map(lambda x: x.replace("\n", ""), f.readlines()))
    sim_keywords = list(map(lambda x: list(x.split(",")), sim_keywords))
len(sim_keywords), sim_keywords[0]

(235, ['3g mobile communication', 'mobile communications'])

In [18]:
sim_keywords_dic = {}
for i in sim_keywords:
    sim_keywords_dic[i[0]] = i[1]
sim_keywords_dic

{'3g mobile communication': 'mobile communications',
 'ad hoc networks': 'ad hoc network',
 'adaptive filters': 'adaptive filter',
 'adaptive systems': 'adaptive system',
 'agents': 'agent',
 'algorithms': 'algorithm',
 'analytical model': 'analytical models',
 'antenna arrays': 'antenna array',
 'approximation algorithms': 'approximation algorithm',
 'artificial intelligence': 'artificial intelligent',
 'artificial neural networks': 'artificial neural network',
 'association rule': 'association rules',
 'authorisation': 'authorization',
 'backpropagation': 'back propagation',
 'base station': 'base stations',
 'bayesian methods': 'bayesian method',
 'bayesian network': 'bayesian networks',
 'biomedical imaging': 'medical imaging',
 'boolean functions': 'boolean function',
 'brain computer interface': 'brain computer interfaces',
 'case base reasoning': 'case based reasoning',
 'case-based reasoning': 'case based reasoning',
 'cellular automata': 'cellular automaton',
 'cellular networ

In [34]:
def delete_similar_words( keyword_list: list, sim_keywords_dict: dict):
    ans = []
    if len(keyword_list) == 0:
        return keyword_list
    else:
        for keyword_ in keyword_list:
            keyword_ = keyword_.lower().strip()
            if keyword_ in sim_keywords_dict:
                ans.append(sim_keywords_dict[keyword_])
            else:
                ans.append(keyword_)
    return list(set(ans))

In [35]:
%%time
metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
# for paper in metadata:
    data = json.loads(paper)
    try:
        keyword = data["keywords"]
        modified_keywords = delete_similar_words(keyword, sim_keywords_dic)
        data["keywords"] = modified_keywords
    except Exception as e:
        pass
    with open(DATAPATH + 'dblpv13_delete_similar_words.txt', 'a') as f:
        f.write(json.dumps(data)+"\n")

0it [00:00, ?it/s]

CPU times: user 5min 39s, sys: 56.6 s, total: 6min 36s
Wall time: 6min 35s


In [36]:
%%time
# 2min30s needed
keywords = {}
count, count2, count3 = 0, 0, 0

metadata = get_metadata("dblpv13_delete_similar_words.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        k = data.get('keywords')
        y = data.get('year')
        if len(k) == 0:
            pass
        else:
            if y >= START_YEAR and  END_YEAR >= y :
                for i in k:
                    i = i.lower().strip()
                    if i == "":
                        pass
                    elif i not in keywords:
                        keywords[i] = 1
                    else:
                        keywords[i] = keywords[i] + 1
                count2 += 1
            else:
                count3 += 1
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
print("キーワードの数:{}, キーワードのない論文数:{},\n 期間が妥当である論文数{}, 期間外の論文数{}".format(len(keywords), count, count2, count3))

0it [00:00, ?it/s]

キーワードの数:6272362, キーワードのない論文数:1171591,
 期間が妥当である論文数2324357, 期間外の論文数1089509
CPU times: user 1min 34s, sys: 8.69 s, total: 1min 42s
Wall time: 1min 36s


In [37]:
%%time
important_keywords = []
M = 1000
for keyword in tqdm(keywords):
    # keyword の出現回数が、Mを超えるとき
    appear_count = keywords[keyword]
    if appear_count >= M:
        important_keywords.append(keyword)
print("出現数が" + str(M) + "以上のキーワード数:{}".format(len(important_keywords)))

  0%|          | 0/6272362 [00:00<?, ?it/s]

出現数が1000以上のキーワード数:2276
CPU times: user 3.54 s, sys: 128 ms, total: 3.67 s
Wall time: 3.63 s


In [40]:
important_keywords.sort()

In [38]:
for i in important_keywords:
    if i in sim_keywords_dic:
        print(i)

In [41]:
with open(DATAPATH + "DBLP/keywords_delete_similar_words.txt", mode="a") as f:
    for i in important_keywords:
        f.write(i + "\n")