# Aminerデータの基本操作

In [2]:
# import libraries

import numpy as np
import pandas as pd
import gc
import os
import json
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import plotly.express as px
import re
import datetime as dt
import japanize_matplotlib
year_pattern = r'([1-2][0-9]{3})'

In [73]:
DATAPATH = "../../data/"
START_YEAR = 2008
END_YEAR = 2020
YEAR_STD = END_YEAR - START_YEAR

In [4]:
def get_metadata(filename):
    with open(DATAPATH + filename, encoding="utf-8", mode='r') as f:
        for line in f:
            yield line

## データ整形
元データ：dbplv13.json (17GB)<br>
https://www.aminer.org/citation でダウンロードし、解凍

データ中身：
```
[
{
    "_id": "53e99784b7602d9701f3e3f4",
    "title": "360degree",
    "authors": [
        {
        "_id": "53f46946dabfaec09f24b4ed",
        "name": "Miguel Palma",
        "sid": "8515747"
        }
    ],
    "venue": {
        "_id": "53a72b2d20f7420be8c1c5a8",
        "raw": "SIGGRAPH ASIA Art Gallery & Emerging Technologies"
    },
    "year": 2009.0,
    "keywords": [],
    "n_citation": 0.0,
    "page_start": "39",
    "lang": "en",
    "volume": "",
    "issue": "",
    "doi": "10.1145/1665137.1665166",
    "url": [
        "http://doi.acm.org/10.1145/1665137.1665166",
        "db/conf/siggraph/siggraph2009asiaart.html#Palma09",
        "https://doi.org/10.1145/1665137.1665166"
    ],
    "abstract": ""
},
....
{
    各論文データ
}
]
```

使いづらい点：
- 一行ずつ読み込んでいけない
    - データが大きい
    - 改行
- intデータをNumberInt()を用いている
    - 例：　1 -> NumberInt(1) // NumberInt(1)はエラーを引き起こす

下記の形式のテキストファイルに変換 -> 整形後 dblpv13.txt
```
{ 論文データ1 }
{ 論文データ2 }
...
{ 論文データN }
```

In [14]:
%%time
with open(DATAPATH + 'dblpv13.json') as fin:
    d = ""
    for line in tqdm(fin):
        if line == "[\n":
            pass
        elif line == "},\n":
            d += "}"
            d = d.replace("\n", "")
            d = re.sub("NumberInt\(|\)", "", d)
            with open(DATAPATH + 'dbplv13.txt', 'a') as f:
                f.write(d+"\n")
            d = ""
        elif line == "}\n":
            d += "}"
            d = d.replace("\n", "")
            d = re.sub("NumberInt\(|\)", "", d)
            with open(DATAPATH + 'dbplv13.txt', 'a') as f:
                f.write(d+"\n")
            break
        else:
            d += line

0it [00:00, ?it/s]

FIN
CPU times: user 7min 38s, sys: 55.6 s, total: 8min 34s
Wall time: 8min 33s


In [15]:
%%time

titles = []

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        t = paper_dict.get('title')   
        titles.append(t)

    except Exception as e:
        print(e)
        print(paper_dict)
        break
len(titles)

0it [00:00, ?it/s]

CPU times: user 1min 50s, sys: 9.51 s, total: 2min
Wall time: 1min 53s


5354309

データセットのメタデータにある論文数と一致

In [11]:
%%time

categories = {}
count = 0

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        k = data.get('fos')
        if len(k) == 0:
            pass
        else:
            for i in k:
                if i not in categories:
                    categories[i] = 1
                else:
                    categories[i] = categories[i] + 1
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
len(categories), count

0it [00:00, ?it/s]

CPU times: user 2min 9s, sys: 14 s, total: 2min 23s
Wall time: 2min 11s


(166003, 350925)

In [17]:
important_keywords = []
for keyword in tqdm(categories):
    if categories[keyword] > 10000:
        important_keywords.append(keyword)
important_keywords

  0%|          | 0/166003 [00:00<?, ?it/s]

['Environmental science',
 'Pattern recognition',
 'Computer science',
 'Feature (computer vision',
 'Feature extraction',
 'Artificial intelligence',
 'Monte Carlo method',
 'Algorithm',
 'Statistics',
 'World Wide Web',
 'Programming language',
 'Software engineering',
 'XML',
 'Virtualization',
 'Virtual machine',
 'Testbed',
 'Quality of service',
 'Resource allocation',
 'Web application',
 'Operating system',
 'Distributed computing',
 'Schedule',
 'Engineering',
 'Information system',
 'Mobile device',
 'Scalability',
 'The Internet',
 'Decision support system',
 'Process management',
 'Business',
 'Mobile computing',
 'Computer network',
 'Real-time computing',
 'Latency (engineering',
 'Mobile telephony',
 'Computer architecture',
 'Memory management',
 'Embedded system',
 'Heuristic',
 'Microeconomics',
 'Concurrency',
 'Pure mathematics',
 'Mathematics',
 'Component-based software engineering',
 'Software construction',
 'Software development',
 'Health care',
 'Population',

## キーワードの解析

In [72]:
%%time

keywords = {}
count, count2, count3 = 0, 0, 0

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        k = data.get('keywords')
        y = data.get('year')
        if len(k) == 0 or len(k) == 1:
            pass
        else:
            if y >= START_YEAR and  END_YEAR >= y :
                for i in k:
                    if i not in keywords:
                        keywords[i] = 1
                    else:
                        keywords[i] = keywords[i] + 1
                count2 += 1
            else:
                count3 += 1
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
len(keywords), count, count2, count3

0it [00:00, ?it/s]

CPU times: user 2min 6s, sys: 12.5 s, total: 2min 19s
Wall time: 2min 10s


(5537095, 1171589, 2009801, 1209909)

In [74]:
important_keywords = []
for keyword in tqdm(keywords):
    if keywords[keyword] >= 200:
        important_keywords.append(keyword)
len(important_keywords)

  0%|          | 0/5537095 [00:00<?, ?it/s]

8672

In [76]:
important_keywords = list(map(lambda x: x.lower(), important_keywords))
important_keywords = list(map(lambda x: x.strip(), important_keywords))
important_keywords = list(set(important_keywords))
len(important_keywords)

7076

In [77]:
important_keywords.remove("")

In [79]:
with open(DATAPATH + "DBLP/keywords.txt", mode="a") as f:
    for i in important_keywords:
        f.write(i + "\n")