# AFP balanced dataset
The goal is to extract from this dataset a subset that is balanced with respect to the ground-truth categories.

In [1]:
# Input files. They are read line by line further down and indexed with the
# same positions, so line i of `labels` annotates line i of `dataset`.
labels = '../data/afp_labels.txt'
dataset = '../data/afp.txt'

In [2]:
# Load the ground-truth annotations, one entry per line of the labels file.
# Every line is kept (iterating a file never yields an empty string), so the
# positions in `true_topics` match the line numbers of the file.
with open(labels, "r") as label_file:
    true_topics = [raw_line.rstrip() for raw_line in label_file]
true_topics[:10]

[',,',
 '13000000,13007000 13006000 13008000,',
 '11000000,11014000 11001000 11002000 11006000,',
 '11000000,11014000 11001000 11002000 11006000,',
 '11000000,11014000 11001000 11002000 11006000,',
 '11000000,11014000 11001000 11002000 11006000,',
 '15000000,,',
 '11000000,11014000 11001000 11002000 11006000,',
 '02000000,02003000 02001000 16001000,02003001',
 '15000000,15054000,']

In [3]:
# Each annotation line holds three comma-separated fields; transpose the rows
# so that every field becomes its own tuple (true1 = first field, and so on).
# NOTE(review): presumably the fields are three levels of a topic hierarchy -
# confirm against the dataset documentation.
label_rows = (entry.split(',') for entry in true_topics)
true1, true2, true3 = zip(*label_rows)
true1[:10]

('',
 '13000000',
 '11000000',
 '11000000',
 '11000000',
 '11000000',
 '15000000',
 '11000000',
 '02000000',
 '15000000')

In [4]:
from collections import Counter

# A first field containing a space lists several codes: collapse all such
# entries into a single 'multiple' bucket and keep single codes unchanged.
categories = ['multiple' if ' ' in code else code for code in true1]

# Number of documents per category (the empty string marks unlabelled entries).
counts = Counter(categories)
counts

Counter({'': 3542,
         '13000000': 419,
         '11000000': 12703,
         '15000000': 34025,
         '02000000': 2161,
         'multiple': 52838,
         '03000000': 1606,
         '10000000': 61,
         '08000000': 260,
         '16000000': 5462,
         '04000000': 9778,
         '12000000': 196,
         '01000000': 1509,
         '06000000': 269,
         '05000000': 13,
         '17000000': 172,
         '07000000': 335,
         '09000000': 8,
         '14000000': 159})

In [5]:
# Load the documents, one per line, with trailing whitespace removed.
# No line is skipped, so positions in `text` match the file's line numbers.
with open(dataset, "r") as corpus_file:
    text = [raw_line.rstrip() for raw_line in corpus_file]

### Subset 1: Only mono-label entries

In [6]:
# Classes with fewer documents than this are dropped from the subset.
MIN_DOCS_PER_CLASS = 159
# Buckets that are not real single-label classes: unlabelled and multi-label.
NON_CLASS_BUCKETS = ('', 'multiple')

# Classes to keep, in the order in which `counts` lists them.
kept_classes = [
    label
    for label, size in counts.items()
    if label not in NON_CLASS_BUCKETS and size >= MIN_DOCS_PER_CLASS
]
num = len(kept_classes)

# Positions of all documents of the kept classes, grouped class by class.
selected_idx = [
    pos
    for label in kept_classes
    for pos, category in enumerate(categories)
    if category == label
]

print('Number of classes %d' % num)
print('Number of documents %d' % len(selected_idx))

Number of classes 14
Number of documents 69054


In [7]:
def extract_and_write(name, dataset, idx):
    """Write the entries of `dataset` selected by `idx` to a file, one per line.

    Parameters
    ----------
    name : path of the output file; opened in 'w' mode, so any existing
        content is overwritten.
    dataset : indexable collection of strings.
    idx : iterable of positions into `dataset`; entries are written in the
        order given, each followed by a newline.
    """
    with open(name, 'w') as out_file:
        out_file.writelines(dataset[pos] + '\n' for pos in idx)
            
# Write the mono-label subset: the documents and, line for line, their labels.
for out_path, source in (
    ('../data/afp_mono.txt', text),
    ('../data/afp_mono_label.txt', true1),
):
    extract_and_write(out_path, source, selected_idx)

### Subset 2: Only mono-label entries, balanced

In [8]:
from random import Random

# Every kept class contributes exactly this many documents. 159 is the size of
# the smallest class that passes the filter, according to `counts` above.
SAMPLES_PER_CLASS = 159
# Fixed seed so that Restart & Run All regenerates the same balanced subset.
rng = Random(42)

balanced_idx = []
num = 0
for c, size in counts.items():
    # Skip the unlabelled and multi-label buckets: they are not real classes.
    if c == '' or c == 'multiple':
        continue
    # Skip classes too small to provide SAMPLES_PER_CLASS distinct documents.
    if size < SAMPLES_PER_CLASS:
        continue
    num += 1
    indices = [i for i, x in enumerate(categories) if x == c]
    # Sample WITHOUT replacement. random.choices draws with replacement, which
    # put the same document into the subset several times and, for a class
    # with exactly SAMPLES_PER_CLASS documents, left some of them out.
    balanced_idx.extend(rng.sample(indices, k=SAMPLES_PER_CLASS))

print('Number of classes %d' % num)
print('Number of documents %d' % len(balanced_idx))

Number of classes 14
Number of documents 2226


In [9]:
# Write the balanced subset: the documents and, line for line, their labels.
for out_path, source in (
    ('../data/afp_balanced.txt', text),
    ('../data/afp_balanced_label.txt', true1),
):
    extract_and_write(out_path, source, balanced_idx)