In [1]:
import json
import os
import random
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Name of the dataset variant built by this notebook; it selects the filtering
# branch further down and is used as the output directory for the JSON files.
state = '98%*1'

# Read the raw CSV (first column is the index) and turn each row into a record
# holding the question text and its whitespace-separated tags as a list.
data_df = pd.read_csv('origin/freecode_data.csv', index_col=0)
data = [
    {'text': question, 'labels': tags.split()}
    for question, tags in zip(data_df['question'], data_df['tag'])
]

In [3]:
# Hold out 20% of the records as the test split; the fixed random_state keeps
# the split identical between runs.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [4]:
# delete \n and \
# In both splits, newlines are turned into single spaces and backslashes are
# dropped from the text of every example.
for split in (train, test):
    for line in split:
        line['text'] = line['text'].replace('\n', ' ').replace('\\', '')

In [5]:
# count classes
# Tally how many examples carry each label, separately for train and test.
train_class_count, test_class_count = defaultdict(int), defaultdict(int)
for split, counter in ((train, train_class_count), (test, test_class_count)):
    for line in split:
        for label in line['labels']:
            counter[label] += 1

# Set of labels that occur in train but never in test.
unseen_class_count = set(train_class_count).difference(test_class_count)

In [6]:
# Report split sizes and label-vocabulary sizes.
summary_rows = (
    ('Number of train data:', len(train)),
    ('Number of test data:', len(test)),
    ('Number of classes in train data:', len(train_class_count)),
    ('Number of classes in test data:', len(test_class_count)),
    ('Number of unseen classes:', len(unseen_class_count)),
)
for caption, value in summary_rows:
    print(caption, value)

# Per-label frequencies, most frequent first (ties keep first-seen order).
train_label_freq = sorted(train_class_count.items(), key=lambda kv: kv[1], reverse=True)
test_label_freq = sorted(test_class_count.items(), key=lambda kv: kv[1], reverse=True)
print('Count of train label:\n', train_label_freq)
print('Count of test label:\n', test_label_freq)

Number of train data: 37596
Number of test data: 9399
Number of classes in train data: 8005
Number of classes in test data: 3411
Number of unseen classes: 5589
Count of train label:
Count of test label:
 [('Software-Development', 1906), ('Internet', 1607), ('Web', 1234), ('Libraries', 1137), ('multimedia', 966), ('Communications', 921), ('Utilities', 693), ('Dynamic-Content', 640), ('Networking', 567), ('Games/Entertainment', 522), ('Scientific/Engineering', 505), ('Desktop-Environment', 489), ('Text-Processing', 474), ('Graphics', 452), ('Database', 444), ('Sound/Audio', 415), ('Security', 404), ('Monitoring', 370), ('Systems-Administration', 369), ('Office/Business', 363), ('Email', 310), ('GPL', 278), ('Information-Management', 276), ('Site-Management', 264), ('Markup', 249), ('Java-Libraries', 225), ('Chat', 206), ('education', 198), ('Archiving', 179), ('Video', 179), ('php-classes', 160), ('Application-Frameworks', 159), ('Front-Ends', 154), ('XML', 152), ('Testing', 149), ('Play

In [7]:
# Total text length (via len()) per split, then the mean per example.
train_text_length = sum(len(example['text']) for example in train)
test_text_length = sum(len(example['text']) for example in test)

print('Avg length of train text:', train_text_length / len(train))
print('Avg length of test text:', test_text_length / len(test))

Avg length of train text: 317.38094478135974
Avg length of test text: 314.44260027662517


In [8]:
# label2id
# Assign consecutive integer ids in first-seen order. Train is scanned before
# test, so every label occurring in train gets a smaller id than any label
# that occurs only in test.
label2id = {}
for split in (train, test):
    for line in split:
        for label in line['labels']:
            # len(label2id) is the next free id; existing labels are left alone.
            label2id.setdefault(label, len(label2id))

# Reverse lookup: id -> label string.
id2label = {label_id: label for label, label_id in label2id.items()}

In [9]:
# to one label, labels[0] is the rarest one
# For every train example, move its least frequent label (by train count) to
# the front of the label list; the other labels keep their relative order.
for line in train:
    labels = line['labels']
    # min() returns the first label among ties, exactly like a strict "<" scan,
    # and does not depend on a hard-coded upper bound for the label counts.
    rarest_label = min(labels, key=lambda label: train_class_count[label])
    if labels[0] != rarest_label:
        labels.remove(rarest_label)
        labels.insert(0, rarest_label)

In [10]:
# delete long tail data
# One bucket per train label, addressed by label id; each train example goes
# into the bucket of its first label (labels[0]).
num_train_labels = len(train_class_count)
label_collection = [[] for _ in range(num_train_labels)]
for line in train:
    label_collection[label2id[line['labels'][0]]].append(line)

if state == '98%*1':
    # Threshold = 98th percentile of the bucket sizes, truncated to an int.
    bucket_sizes = [len(bucket) for bucket in label_collection]
    threshold = int(np.percentile(bucket_sizes, 98))
    # Keep only buckets that reach the threshold and cap each at threshold * 1.
    kept_buckets = []
    for bucket in label_collection:
        if len(bucket) >= threshold:
            kept_buckets.append(bucket[:threshold * 1])
    label_collection = kept_buckets

In [11]:
# refresh label2id
# Rebuild the mapping from the surviving buckets: a bucket's position becomes
# the id of the first label of its first example.
label2id = {
    bucket[0]['labels'][0]: position
    for position, bucket in enumerate(label_collection)
}

# Reverse lookup: id -> label string.
id2label = {label_id: label for label, label_id in label2id.items()}

In [12]:
# delete unseen test
# Keep only the test examples whose labels are all present in label2id.
# A single filtering pass replaces repeated pop() calls from the middle of the
# list, which cost quadratic time. Slice assignment updates the same list
# object in place, as the pop()-based version did.
test[:] = [
    line
    for line in test
    if all(label in label2id for label in line['labels'])
]

In [13]:
# Persist both label mappings into the directory named after `state`.
# The directory is created first so the writes do not fail on a fresh checkout.
os.makedirs(state, exist_ok=True)

# ensure_ascii=False emits non-ASCII characters verbatim, so the files are
# opened explicitly as UTF-8 instead of relying on the platform default.
# NOTE: JSON object keys are always strings, so the integer keys of id2label
# are written as strings.
with open(f'{state}/id2label.json', 'w', encoding='utf-8') as f:
    json.dump(id2label, f, ensure_ascii=False, indent=2)

with open(f'{state}/label2id.json', 'w', encoding='utf-8') as f:
    json.dump(label2id, f, ensure_ascii=False, indent=2)

In [14]:
# shuffle train dataset
# Despite the name this is a deterministic round-robin interleave, not a
# random shuffle: take the first example of every bucket, then the second of
# every bucket, and so on, skipping buckets that have already run out.
# Written as a single pass so the total size is not recomputed per element.
longest_bucket = max((len(bucket) for bucket in label_collection), default=0)
shuffle_train = [
    bucket[position]
    for position in range(longest_bucket)
    for bucket in label_collection
    if position < len(bucket)
]

In [15]:
# subset of test
# Cap the test split at three times the number of interleaved train examples.
max_test_examples = 3 * len(shuffle_train)
test = test[:max_test_examples]

In [16]:
# Sizes of the two splits after filtering.
n_train, n_test = len(shuffle_train), len(test)
print('Number of train data:', n_train)
print('Number of test data:', n_test)

Number of train data: 10854
Number of test data: 5417


In [17]:
# delete unseen train labels
# Remove, in place, every label of a train example that is not a key of
# label2id; the remaining labels keep their order.
for line in shuffle_train:
    line['labels'][:] = [label for label in line['labels'] if label in label2id]

In [18]:
# count classes
# Recount label frequencies, this time on the filtered train/test splits.
train_class_count, test_class_count = defaultdict(int), defaultdict(int)
for split, counter in ((shuffle_train, train_class_count), (test, test_class_count)):
    for line in split:
        for label in line['labels']:
            counter[label] += 1

# Set of labels that occur in train but never in test.
unseen_class_count = set(train_class_count).difference(test_class_count)

n_train, n_test = len(shuffle_train), len(test)
print('Number of train data:', n_train)
print('Number of test data:', n_test)

print('Number of classes in train data:', len(train_class_count))
print('Number of classes in test data:', len(test_class_count))
print('Number of unseen classes:', len(unseen_class_count))

# Per-label frequencies, most frequent first (ties keep first-seen order).
train_label_freq = sorted(train_class_count.items(), key=lambda kv: kv[1], reverse=True)
test_label_freq = sorted(test_class_count.items(), key=lambda kv: kv[1], reverse=True)
print('Count of train label:\n', train_label_freq)
print('Count of test label:\n', test_label_freq)

# Total text length (via len()) per split, then the mean per example.
train_text_length = sum(len(example['text']) for example in shuffle_train)
test_text_length = sum(len(example['text']) for example in test)

print('Avg length of train text:', train_text_length / n_train)
print('Avg length of test text:', test_text_length / n_test)

Number of train data: 10854
Number of test data: 5417
Number of classes in train data: 162
Number of classes in test data: 147
Number of unseen classes: 15
Count of train label:
 [('Software-Development', 2684), ('Internet', 2338), ('Web', 1747), ('Communications', 1534), ('Libraries', 1492), ('Dynamic-Content', 996), ('Scientific/Engineering', 975), ('Utilities', 888), ('Games/Entertainment', 754), ('Office/Business', 718), ('Graphics', 707), ('Text-Processing', 691), ('Desktop-Environment', 663), ('Email', 627), ('Sound/Audio', 622), ('Database', 605), ('Networking', 574), ('Information-Management', 571), ('Systems-Administration', 464), ('Security', 437), ('Site-Management', 416), ('Archiving', 414), ('Monitoring', 387), ('education', 350), ('Java-Libraries', 325), ('Chat', 264), ('CGI-Tools/Libraries', 250), ('Video', 234), ('Visualization', 231), ('Financial', 230), ('Front-Ends', 225), ('Players', 216), ('Testing', 215), ('Application-Frameworks', 215), ('php-classes', 206), ('Te

In [19]:
# add label description
# Load the label -> description table (one JSON object per line). The file is
# read as UTF-8 explicitly, and blank lines are skipped instead of crashing
# json.loads.
with open('label-description.jsonl', encoding='utf-8') as f:
    label_desc = {}
    for raw_line in f:
        if raw_line.strip():
            entry = json.loads(raw_line)
            label_desc[entry['label']] = entry['description']

# For both splits: remember the name of labels[0], replace every label name by
# its id (in place), and attach the description of that first label.
# This step is not idempotent: once labels are ids, running it again raises
# KeyError because label2id is keyed by label names.
# NOTE(review): only train examples had their rarest label moved to index 0;
# for test examples labels[0] is simply the first tag in the original order —
# confirm this is intended.
for split in (shuffle_train, test):
    for line in split:
        label_text = line['labels'][0]
        line['labels'][:] = [label2id[label] for label in line['labels']]
        line['label_description'] = label_desc[label_text]

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 rather than left to the platform default.
with open(f'{state}/train.json', 'w', encoding='utf-8') as f:
    json.dump(shuffle_train, f, ensure_ascii=False, indent=2)

with open(f'{state}/test.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=2)

In [20]:
# add label description (with-example variant)
# Load the alternative label -> description table from the "with-example"
# file (one JSON object per line). The file is read as UTF-8 explicitly, and
# blank lines are skipped instead of crashing json.loads.
with open('label-description-with-example.jsonl', encoding='utf-8') as f:
    label_desc = {}
    for raw_line in f:
        if raw_line.strip():
            entry = json.loads(raw_line)
            label_desc[entry['label']] = entry['description']

# Labels are integer ids at this point, so labels[0] is mapped back to its
# name through id2label before overwriting 'label_description'.
for split in (shuffle_train, test):
    for line in split:
        label_text = id2label[line['labels'][0]]
        line['label_description'] = label_desc[label_text]

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# pinned to UTF-8 rather than left to the platform default.
with open(f'{state}/train-with-example.json', 'w', encoding='utf-8') as f:
    json.dump(shuffle_train, f, ensure_ascii=False, indent=2)

with open(f'{state}/test-with-example.json', 'w', encoding='utf-8') as f:
    json.dump(test, f, ensure_ascii=False, indent=2)