In [2]:
import json
from collections import defaultdict
import re
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [99]:
data = pd.read_excel('origin/Meta-data/Data.xlsx')
data = json.loads(data.to_json(orient='index'))
data = list(data.values())

In [100]:
with open('label-description-with-example.jsonl') as f:
# with open('label-description.jsonl') as f:
    label_desc = f.readlines()
    label_desc = list(map(json.loads, label_desc))
    label_desc = {line['label']: line['description'] for line in label_desc}

In [101]:
for line in data:
    line['text'] = line['Abstract']
    line['label'] = f"{line['Domain'].strip()}.{line['area'].strip()}"
    for k in {'Y1', 'Y2', 'Y', 'Domain', 'area', 'keywords', 'Abstract'}:
        line.pop(k)

In [102]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [103]:
# count classes
train_class_count = defaultdict(int)
test_class_count = defaultdict(int)

for line in train:
    train_class_count[line['label']] += 1

for line in test:
    test_class_count[line['label']] += 1

unseen_class_count = train_class_count.keys() - test_class_count.keys()

In [104]:
print('Number of train data:', len(train))
print('Number of test data:', len(test))

print('Number of classes in train data:', len(train_class_count))
print('Number of classes in test data:', len(test_class_count))
print('Number of unseen classes:', len(unseen_class_count))

print('Count of train label:\n', sorted(train_class_count.items(), key=lambda x: -x[1]))
print('Count of test label:\n', sorted(test_class_count.items(), key=lambda x: -x[1]))

Number of train data: 37588
Number of test data: 9397
Number of classes in train data: 144
Number of classes in test data: 143
Number of unseen classes: 2
Count of train label:
 [('biochemistry.Molecular biology', 599), ('biochemistry.Polymerase chain reaction', 593), ('biochemistry.Northern blotting', 561), ('biochemistry.Immunology', 506), ('biochemistry.Human Metabolism', 502), ('biochemistry.Enzymology', 452), ('biochemistry.Genetics', 445), ('biochemistry.Cell biology', 440), ('biochemistry.Southern blotting', 403), ('Civil.Rainwater Harvesting', 362), ('ECE.Electricity', 358), ('Civil.Water Pollution', 356), ('ECE.Digital control', 352), ('CS.network security', 351), ('CS.Parallel computing', 345), ('Civil.Geotextile', 345), ('ECE.PID controller', 344), ('ECE.Operational amplifier', 344), ('Civil.Green Building', 340), ('ECE.System identification', 340), ('CS.Computer vision', 338), ('Psychology.Attention', 337), ('CS.Software engineering', 335), ('ECE.Analog signal processing', 

In [105]:
# label2id
label2id = dict()
cnt = 0

for line in train:
    if line['label'] not in label2id:
        label2id[line['label']] = cnt
        cnt += 1

for line in test:
    if line['label'] not in label2id:
        label2id[line['label']] = cnt
        cnt += 1

id2label = {v: k for k, v in label2id.items()}

In [106]:
# delete long tail data
label_collection = [[] for _ in range(len(train_class_count))]
for line in train:
    idx = label2id[line['label']]
    label_collection[idx].append(line)

threshold = int(np.percentile([len(t) for t in label_collection], 25))
label_collection = [random.sample(t, threshold) for t in label_collection if len(t) > threshold]

In [107]:
# refresh label2id
label2id = dict()
for idx, piece in enumerate(label_collection):
    label2id[piece[0]['label']] = idx

id2label = {v: k for k, v in label2id.items()}

In [108]:
# delete unseen test
idx = 0
while idx < len(test):
    if test[idx]['label'] not in label2id:
        test.pop(idx)
    else:
        idx += 1

In [109]:
with open('id2label.json', 'w') as f:
    json.dump(id2label, f, ensure_ascii=False, indent=2)

with open('label2id.json', 'w') as f:
    json.dump(label2id, f, ensure_ascii=False, indent=2)

In [110]:
# shuffle train dataset
collection_pointer = [0] * len(label_collection)

shuffle_train = []
global_idx, idx = 0, 0
while global_idx < sum(len(t) for t in label_collection):
    if collection_pointer[idx] < len(label_collection[idx]):
        shuffle_train.append(label_collection[idx][collection_pointer[idx]])
        collection_pointer[idx] += 1
        idx = (idx + 1) % len(label_collection)
        global_idx += 1
    else:
        idx = (idx + 1) % len(label_collection) 

In [111]:
print('Number of train data:', len(shuffle_train))
print('Number of test data:', len(test))

Number of train data: 22470
Number of test data: 8317


In [112]:
# add label description
for line in shuffle_train:
    label_text = line['label']
    line['label'] = label2id[line['label']]
    line['label_description'] = label_desc[label_text]

for line in test:
    label_text = line['label']
    line['label'] = label2id[line['label']]
    line['label_description'] = label_desc[label_text]

In [113]:
with open('train.json', 'w') as f:
    json.dump(shuffle_train, f, ensure_ascii=False, indent=2)

with open('test.json', 'w') as f:
    json.dump(test, f, ensure_ascii=False, indent=2)

In [3]:
with open('train.json') as f:
    t = json.load(f)