In [8]:
import json

def load_student_data(data_path: str):
    """Load a student's JSONL submission and sanity-check its answer choices.

    Every non-blank line of ``data_path`` is parsed as a JSON object that must
    provide a ``'text'`` string and a ``'choices'`` sequence. An entry is
    reported (printed and counted) as an error when it does not have exactly
    four choices, or when two of its choices are equal once surrounding
    whitespace is ignored.

    Args:
        data_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        The set of whitespace-stripped ``'text'`` values of all entries,
        including the entries that were reported as errors.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks the ``'text'`` or ``'choices'`` key.
    """
    errors = []
    data = set()
    # Read as UTF-8 explicitly (same as load_original_data) so accented
    # characters do not depend on the platform's default encoding.
    with open(data_path, encoding='utf-8') as f:
        for line in f:
            # A blank (e.g. trailing) line is not an entry; json.loads would
            # raise on it.
            if not line.strip():
                continue
            elem = json.loads(line)
            choices = elem['choices']
            if len(choices) != 4:
                print(f"There are more/less than 4 choices found for: {elem}")
                errors.append(elem)
            # Compare choices with surrounding whitespace removed, otherwise
            # 'rete\n' and 'rete' would wrongly count as two distinct options.
            elif len({c.strip() if isinstance(c, str) else c for c in choices}) < 4:
                print(f"There are duplicated choices: {elem['choices']}")
                errors.append(elem)
            data.add(elem['text'].strip())
    print(f"{len(errors)} errors.")
    return data

def load_original_data(data_path: str):
    """Load the terms of an original dataset file.

    Each non-blank line is expected to hold a term, optionally followed by a
    tab and a type label (``Concept`` or ``Entity``). Only that trailing label
    is dropped; occurrences of the same words inside the term are preserved.

    Args:
        data_path: Path of the UTF-8 encoded data file to read.

    Returns:
        The set of whitespace-stripped terms found in the file.
    """
    type_labels = ('Concept', 'Entity')
    data = set()
    with open(data_path, "r", encoding='utf-8') as fin:
        for line in fin:
            stripped = line.strip()
            # Blank lines carry no term; adding '' would create a spurious entry.
            if not stripped:
                continue
            # Split on the LAST tab so only the final column can be taken as
            # the type label.
            term, sep, label = stripped.rpartition('\t')
            if sep and label.strip() in type_labels:
                # Strip the term completely so it is comparable with the
                # stripped 'text' values of the processed data.
                item = term.strip()
            else:
                item = stripped
            data.add(item)
    return data

def check_data(orig, processed):
    """Verify that two collections of entries contain exactly the same items.

    Both directions of the comparison are evaluated before failing, so one
    run reports everything that is missing from ``processed`` as well as
    everything it contains that ``orig`` does not.

    Args:
        orig: Set of entries from the original data.
        processed: Set of entries from the processed data.

    Raises:
        AssertionError: If either set has entries the other one lacks. The
            message lists each non-empty difference on its own line.
    """
    missing = orig.difference(processed)
    extra = processed.difference(orig)

    problems = []
    if missing:
        problems.append(f"{len(missing)} entries in original data and not in processed: {missing}")
    if extra:
        problems.append(f"{len(extra)} entries in processed data and not in original: {extra}")

    # Raise explicitly instead of using `assert` statements: those are removed
    # under `python -O`, which would silently disable the check.
    if problems:
        raise AssertionError("\n".join(problems))

# Reference term sets read from the original dataset files; the student
# submissions are compared against these.
original_train_data = load_original_data(
    'original_datasets/task-26-hypernym_discovery/training/data/1B.italian.training.data.txt'
)
original_test_data = load_original_data(
    'original_datasets/task-26-hypernym_discovery/test/data/1B.italian.test.data.txt'
)

In [16]:
# Folder of the submission under review. Both file paths below are derived
# from it, so checking another submission means editing this one line only.
STUDENT_SUBMISSION_DIR = 'students_submissions/cardullo.2127806/HM1_A-2127806/27_hypernym_discovery'
STUDENT_TRAIN_FILE = f'{STUDENT_SUBMISSION_DIR}/hypernym_discovery-task1-train-data.jsonl'
STUDENT_TEST_FILE = f'{STUDENT_SUBMISSION_DIR}/hypernym_discovery-task1-test-data.jsonl'

In [17]:
# Train split: load the student's file (choice problems are printed while
# loading), then require its texts to match the original training set.
student_train_data = load_student_data(data_path=STUDENT_TRAIN_FILE)
check_data(orig=original_train_data, processed=student_train_data)

There are duplicated choices: ['rischiaramento', 'trattato internazionale', 'condottiero', 'condottiero']
There are duplicated choices: ['matematica pura', 'circolazione', 'matematica pura', 'guru']
There are duplicated choices: ['pettegolezzo', 'pettegolezzo', 'struttura architettonica', 'malformazione']
There are duplicated choices: ['borgo\n', 'materiale illustrativo', 'seguace', 'materiale illustrativo']
There are duplicated choices: ['torta', 'discendenza', 'discendenza', 'tempo']
5 errors.


In [18]:
# Test split: load the student's file (choice problems are printed while
# loading), then require its texts to match the original test set.
student_test_data = load_student_data(data_path=STUDENT_TEST_FILE)
check_data(orig=original_test_data, processed=student_test_data)

There are duplicated choices: ['indicatore', 'indicatore', 'ragazza', 'asserzione']
There are duplicated choices: ['idrovia', 'idrovia', 'rete\n', 'insetto']
There are duplicated choices: ['beni culturali', 'nota musicale', 'spazio\n', 'nota musicale']
There are duplicated choices: ['bacchetta', 'epistola', 'indicatore', 'bacchetta']
4 errors.
