# Aminerデータの基本操作

In [16]:
# import libraries

import numpy as np
import pandas as pd
import gc
import os
import json
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import plotly.express as px
import re
import datetime as dt
import japanize_matplotlib
year_pattern = r'([1-2][0-9]{3})'

In [2]:
DATAPATH = "../../data/"

In [3]:
def get_metadata(filename):
    with open(DATAPATH + filename, encoding="utf-8", mode='r') as f:
        for line in f:
            yield line

## データ整形
元データ：dbplv13.json (17GB)<br>
https://www.aminer.org/citation でダウンロードし、解凍

データ中身：
```
[
{
    "_id": "53e99784b7602d9701f3e3f4",
    "title": "360degree",
    "authors": [
        {
        "_id": "53f46946dabfaec09f24b4ed",
        "name": "Miguel Palma",
        "sid": "8515747"
        }
    ],
    "venue": {
        "_id": "53a72b2d20f7420be8c1c5a8",
        "raw": "SIGGRAPH ASIA Art Gallery & Emerging Technologies"
    },
    "year": 2009.0,
    "keywords": [],
    "n_citation": 0.0,
    "page_start": "39",
    "lang": "en",
    "volume": "",
    "issue": "",
    "doi": "10.1145/1665137.1665166",
    "url": [
        "http://doi.acm.org/10.1145/1665137.1665166",
        "db/conf/siggraph/siggraph2009asiaart.html#Palma09",
        "https://doi.org/10.1145/1665137.1665166"
    ],
    "abstract": ""
},
....
{
    各論文データ
}
]
```

使いづらい点：
- 一行ずつ読み込んでいけない
    - データが大きい
    - 改行
- intデータをNumberInt()を用いている
    - 例：　1 -> NumberInt(1) // NumberInt(1)はエラーを引き起こす

下記の形式のテキストファイルに変換 -> 整形後 dblpv13.txt
```
{ 論文データ1 }
{ 論文データ2 }
...
{ 論文データN }
```

In [14]:
%%time
with open(DATAPATH + 'dblpv13.json') as fin:
    d = ""
    for line in tqdm(fin):
        if line == "[\n":
            pass
        elif line == "},\n":
            d += "}"
            d = d.replace("\n", "")
            d = re.sub("NumberInt\(|\)", "", d)
            with open(DATAPATH + 'dbplv13.txt', 'a') as f:
                f.write(d+"\n")
            d = ""
        elif line == "}\n":
            d += "}"
            d = d.replace("\n", "")
            d = re.sub("NumberInt\(|\)", "", d)
            with open(DATAPATH + 'dbplv13.txt', 'a') as f:
                f.write(d+"\n")
            break
        else:
            d += line

0it [00:00, ?it/s]

FIN
CPU times: user 7min 38s, sys: 55.6 s, total: 8min 34s
Wall time: 8min 33s


In [15]:
%%time

titles = []

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        t = paper_dict.get('title')   
        titles.append(t)

    except Exception as e:
        print(e)
        print(paper_dict)
        break
len(titles)

0it [00:00, ?it/s]

CPU times: user 1min 50s, sys: 9.51 s, total: 2min
Wall time: 1min 53s


5354309

データセットのメタデータにある論文数と一致

## キーワードの解析

In [34]:
%%time

keywords = []
count = 0

metadata = get_metadata("dblpv13.txt")
for num, paper in enumerate(tqdm(metadata)):
    data = json.loads(paper)
    try:
        k = data.get('keywords')   
        keywords += k
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
    if num % 10000 == 9999:
        keywords = list(set(keywords))
len(keywords), count

0it [00:00, ?it/s]

CPU times: user 14min 53s, sys: 1min 42s, total: 16min 36s
Wall time: 16min 20s


(8102810, 1171587)

In [35]:
keywords[:100]

['',
 'foster writing skill',
 'binary serially-concatenated convolutional code',
 'strong subcontract pre-order',
 'policy-aware sender anonymity',
 'Java enabled Web browser',
 'graph-theoretical indices',
 'interpolative coding',
 'panorama video streaming',
 'wifi-plc dual-links',
 'statechart description',
 'degree of matching',
 'cooperative concatenated coding',
 'Stock return',
 'airborne multiple-input-multiple-output (mimo communications',
 'insulin pumps',
 'fast converging iteration technique',
 'spindle-shaped graph',
 'perceptual gap',
 'towards participatory design',
 'Asymmetric underlap',
 'heavy regular expression',
 'effective schur decomposition',
 'deep Q-networks',
 'complete sampling',
 'allocating user experience meta-data',
 'Ferrofluid jet',
 'handling synchronization problem',
 'small divide',
 'cubic fourfolds',
 'haf algorithm',
 'Answer Set Programming (ASP',
 'Network RAM',
 'active population',
 'fast marching farthest point',
 'QoE diagnostics',
 'inevi