# Aminerデータの基本操作

In [1]:
# import libraries

import numpy as np
import pandas as pd
import gc
import os
import json
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
import plotly.express as px
import re
import datetime as dt
import japanize_matplotlib
year_pattern = r'([1-2][0-9]{3})'

In [24]:
DATAPATH = "../../data/"
START_YEAR = 2010
END_YEAR = 2020
YEAR_STD = END_YEAR - START_YEAR

In [3]:
def get_metadata(filename):
    with open(DATAPATH + filename, encoding="utf-8", mode='r') as f:
        for line in f:
            yield line

## データ整形
元データ：dbplv13.json (17GB)<br>
https://www.aminer.org/citation でダウンロードし、解凍

データ中身：
```
[
{
    "_id": "53e99784b7602d9701f3e3f4",
    "title": "360degree",
    "authors": [
        {
        "_id": "53f46946dabfaec09f24b4ed",
        "name": "Miguel Palma",
        "sid": "8515747"
        }
    ],
    "venue": {
        "_id": "53a72b2d20f7420be8c1c5a8",
        "raw": "SIGGRAPH ASIA Art Gallery & Emerging Technologies"
    },
    "year": 2009.0,
    "keywords": [],
    "n_citation": 0.0,
    "page_start": "39",
    "lang": "en",
    "volume": "",
    "issue": "",
    "doi": "10.1145/1665137.1665166",
    "url": [
        "http://doi.acm.org/10.1145/1665137.1665166",
        "db/conf/siggraph/siggraph2009asiaart.html#Palma09",
        "https://doi.org/10.1145/1665137.1665166"
    ],
    "abstract": ""
},
....
{
    各論文データ
}
]
```

使いづらい点：
- 一行ずつ読み込んでいけない
    - データが大きい
    - 改行
- intデータをNumberInt()を用いている
    - 例：　1 -> NumberInt(1) // NumberInt(1)はエラーを引き起こす

下記の形式のテキストファイルに変換 -> 整形後 dblpv13.txt
```
{ 論文データ1 }
{ 論文データ2 }
...
{ 論文データN }
```

In [4]:
%%time
# 行数 409129300 8min30s程度かかる
with open(DATAPATH + 'dblpv13.json') as fin:
    d = ""
    for line in tqdm(fin):
        if line == "[\n":
            pass
        elif line == "},\n":
            d += "}"
            d = d.replace("\n", "")
            d = re.sub("NumberInt\(|\)", "", d)
            with open(DATAPATH + 'dblpv13.txt', 'a') as f:
                f.write(d+"\n")
            d = ""
        elif line == "}\n":
            d += "}"
            d = d.replace("\n", "")
            d = re.sub("NumberInt\(|\)", "", d)
            with open(DATAPATH + 'dblpv13.txt', 'a') as f:
                f.write(d+"\n")
            break
        else:
            d += line

0it [00:00, ?it/s]

CPU times: user 7min 22s, sys: 57 s, total: 8min 19s
Wall time: 8min 18s


In [6]:
%%time

titles = []

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        t = data.get('title')   
        titles.append(t)

    except Exception as e:
        print(e)
        print(data)
        break
len(titles)

0it [00:00, ?it/s]

CPU times: user 1min 51s, sys: 12.3 s, total: 2min 3s
Wall time: 1min 54s


5354309

データセットのメタデータにある論文数と一致

In [25]:
%%time

categories = {}
count = 0

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        k = data.get('fos')
        y = data.get('year')
        if len(k) == 0:
            pass
        else:
            if y >= START_YEAR and  END_YEAR >= y :
                for i in k:
                    if i not in categories:
                        categories[i] = 1
                    else:
                        categories[i] = categories[i] + 1
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
print("カテゴリー数:{}, カテゴリーを持たない論文数:{}".format(len(categories), count))

0it [00:00, ?it/s]

カテゴリー数:152178, カテゴリーを持たない論文数:350935
CPU times: user 2min 6s, sys: 14.5 s, total: 2min 20s
Wall time: 2min 7s


In [26]:
important_keywords = []
for keyword in tqdm(categories):
    if categories[keyword] > 200:
        important_keywords.append(keyword)
len(important_keywords)
print("出現数が200以上のカテゴリー数:{}".format(len(important_keywords)))

  0%|          | 0/152178 [00:00<?, ?it/s]

出現数が200以上のカテゴリー数:12604


In [27]:
important_keywords[:10]

['Agronomy',
 'Moisture',
 'Hydrology',
 'Environmental science',
 'Water content',
 'Irrigation',
 'Soil water',
 'Canopy',
 'Virtualization',
 'Service level objective']

## キーワードの解析

In [28]:
%%time

keywords = {}
count, count2, count3 = 0, 0, 0

metadata = get_metadata("dblpv13.txt")
for paper in tqdm(metadata):
    data = json.loads(paper)
    try:
        k = data.get('keywords')
        y = data.get('year')
        if len(k) == 0 or len(k) == 1:
            pass
        else:
            if y >= START_YEAR and  END_YEAR >= y :
                for i in k:
                    if i not in keywords:
                        keywords[i] = 1
                    else:
                        keywords[i] = keywords[i] + 1
                count2 += 1
            else:
                count3 += 1
    except Exception as e:
        # keywordがないものも存在
        count += 1
        pass
print("キーワードの数:{}, キーワードのない論文数:{},\n 期間が妥当である論文数{}, 期間街の論文数{}".format(len(keywords), count, count2, count3))

0it [00:00, ?it/s]

キーワードの数:4563126, キーワードのない論文数:1171589,
 期間が妥当である論文数1667206, 期間街の論文数1552504
CPU times: user 2min 5s, sys: 14.8 s, total: 2min 20s
Wall time: 2min 7s


In [38]:
important_keywords = []
for keyword in tqdm(keywords):
    # keyword の出現回数が、400を超えるとき
    if keywords[keyword] >= 400:
        important_keywords.append(keyword)
print("出現数が400以上のキーワード数:{}".format(len(important_keywords)))

  0%|          | 0/4563126 [00:00<?, ?it/s]

出現数が400以上のキーワード数:3388


In [39]:
important_keywords[:10]

['irrigation',
 'radiometry',
 'indexes',
 'vegetation',
 'indexation',
 'soil moisture',
 'indexing terms',
 'spectrum',
 'resource allocation',
 'quality of service']

In [40]:
important_keywords = list(map(lambda x: x.lower(), important_keywords))
important_keywords = list(map(lambda x: x.strip(), important_keywords))
important_keywords = list(set(important_keywords))
print("キーワードの個数: {}".format(len(important_keywords)))
try:
    important_keywords.remove("")
except Exception as e:
    print(e)


キーワードの個数: 2701
list.remove(x): x not in list


In [36]:
with open(DATAPATH + "DBLP/keywords.txt", mode="a") as f:
    for i in important_keywords:
        f.write(i + "\n")