In [100]:
import re
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import combinations
from collections import defaultdict

def load_uses(filename='TRoTR/data/uses.tsv', sep='\t'):
    """Read a delimited text file of uses into a DataFrame of strings.

    The first line of the file supplies the column names. Every following
    line is stripped of trailing whitespace, split on ``sep`` and paired with
    the header positionally.

    Args:
        filename: Path of the file to read (UTF-8).
        sep: Field separator used for both the header and the data lines.

    Returns:
        pandas.DataFrame with one row per data line. All parsed values are
        strings; a row with fewer fields than the header leaves the missing
        columns as NaN, and fields beyond the header are ignored.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        header = handle.readline().rstrip().split(sep)
        # zip() pairs positionally and stops at the shorter of header/fields.
        records = [
            dict(zip(header, row.rstrip().split(sep)))
            for row in handle
        ]

    return pd.DataFrame(records)

def load_instances(filename, dirname='TRoTR/rounds', sep='\t'):
    """Read one file of annotation instances and split its pair of use IDs.

    The first line of the file supplies the column names. Each data line must
    provide a ``dataIDs`` field holding exactly two identifiers joined by a
    comma; they are exposed as the extra columns ``dataID1`` and ``dataID2``.

    Args:
        filename: Name of the file inside ``dirname``.
        dirname: Directory that contains ``filename``.
        sep: Field separator used for both the header and the data lines.

    Returns:
        pandas.DataFrame with one row per non-empty data line. All values are
        strings.

    Raises:
        KeyError: If a data line has no ``dataIDs`` field.
        ValueError: If ``dataIDs`` does not split into exactly two parts.
    """
    records = []
    with open(f'{dirname}/{filename}', mode='r', encoding='utf-8') as f:
        columns = f.readline().rstrip().split(sep) + ['dataID1', 'dataID2']
        for line in f:
            # Remove only the line terminator: slicing off the last character
            # unconditionally would truncate a final line that has no newline.
            line = line.rstrip('\n')
            if not line:
                # A blank line carries no instance (e.g. trailing newline at EOF).
                continue
            # Split on the caller-supplied separator, same as the header.
            record = dict(zip(columns, line.split(sep)))
            record['dataID1'], record['dataID2'] = record['dataIDs'].split(',')
            records.append(record)

    return pd.DataFrame(records)

def load_judgments(filename, dirname='TRoTR/judgments', sep='\t',
                   annotators=('iosakwe', 'shur', 'Nisha', 'AndreaMariaC')):
    """Read one file of annotator judgments into a DataFrame.

    Workaround parser: a record may span several physical lines, so the file
    cannot be split on newlines. A record is considered finished only where
    one of the ``annotators`` names is immediately followed by a newline; any
    other newline is kept inside the record and rewritten as ``--``.
    A record ending with a name that is not listed in ``annotators`` is
    therefore glued to the record that follows it.

    Args:
        filename: Name of the file inside ``dirname``.
        dirname: Directory that contains ``filename``.
        sep: Field separator used for both the header and the records.
        annotators: Names that terminate a record when they appear right
            before a newline.

    Returns:
        pandas.DataFrame with one row per record. Missing values are filled
        with the string ``'-1'``; the ``label`` column is converted to int,
        with ``'-'`` (and missing labels) mapped to -1 ("can not decide").

    Raises:
        KeyError: If the parsed data has no ``label`` column.
        ValueError: If a label is not ``'-'`` or an integer literal.
    """
    record_end = '@@@'

    with open(f'{dirname}/{filename}', mode='r', encoding='utf-8') as f:
        columns = f.readline().rstrip().split(sep)
        content = f.read()

    # Mark the true record boundaries first, then neutralise the remaining
    # newlines, which belong to the inside of a record.
    for annotator in annotators:
        content = content.replace(f'{annotator}\n', f'{annotator}{record_end}')
    content = content.replace('\n', '--')

    records = []
    for chunk in content.split(record_end):
        chunk = chunk.rstrip()
        if not chunk:
            # The text after the last boundary is empty when the file ends
            # with a complete record; it must not become a bogus row.
            continue
        records.append(dict(zip(columns, chunk.split(sep))))

    # -1: can not decide
    df = pd.DataFrame(records).fillna('-1')
    # Map exactly '-' to '-1'. A substring replace would also rewrite values
    # that already are '-1' (such as the fill value above) into '-11'.
    df['label'] = df['label'].apply(lambda x: '-1' if x == '-' else x).astype(int)

    return df

def merge_data(df_uses, df_instances, df_judgments):
    """Join judgments, instances and uses into one wide table.

    Judgments are first merged with instances using pandas' default merge
    (the columns the two frames share). The uses table is then joined twice:
    once on ``dataID1`` and once on ``dataID2``. The per-use columns
    ``context``, ``indices_target_token`` and ``indices_target_sentence`` get
    the suffix ``1`` or ``2`` accordingly. The ``lemma`` column of the result
    is the one brought in by the second join; the first one is dropped.

    Args:
        df_uses: Uses table keyed by ``dataID``.
        df_instances: Instances table providing ``dataID1`` and ``dataID2``.
        df_judgments: Judgments table.

    Returns:
        pandas.DataFrame restricted to a fixed column order.
    """
    per_use_columns = ['context', 'indices_target_token', 'indices_target_sentence']

    def suffixed(suffix):
        # Mapping that tags the per-use columns with the side they belong to.
        return {name: f'{name}{suffix}' for name in per_use_columns}

    merged = df_judgments.merge(df_instances)

    # First use of the pair.
    merged = merged.merge(df_uses, left_on='dataID1', right_on='dataID')
    merged = merged.drop(columns=['dataID', 'lemma']).rename(columns=suffixed(1))

    # Second use of the pair; its 'lemma' column is the one that is kept.
    merged = merged.merge(df_uses, left_on='dataID2', right_on='dataID')
    merged = merged.drop(columns=['dataID']).rename(columns=suffixed(2))

    ordered = [
        'instanceID', 'dataID1', 'dataID2', 'label', 'annotator', 'lemma',
        'context1', 'context2',
        'indices_target_token1', 'indices_target_sentence1',
        'indices_target_sentence2', 'indices_target_token2',
        'comment', 'label_set', 'non_label', 'dataIDs',
    ]
    return merged[ordered]

# Load the uses, instances and judgments tables and join them into `df`.
# The same file name is looked up in both the rounds and judgments directories
# (the default `dirname` of load_instances and load_judgments respectively).
round_ = 'TRoTR.tsv'

df_uses = load_uses()
df_instances = load_instances(filename=round_)
df_judgments = load_judgments(filename=round_)

df = merge_data(
    df_uses=df_uses,
    df_instances=df_instances,
    df_judgments=df_judgments,
)

In [104]:
# Export uses and judgments lemma by lemma, one directory per lemma:
#   TRoTR/DURel_data/<id_quote>/uses.tsv
#   TRoTR/DURel_data/<id_quote>/judgments.tsv
durel_root = Path('TRoTR/DURel_data')

# Output column layouts. They are defined once, under distinct names, so the
# judgments layout cannot overwrite the uses layout while the loop is running.
uses_columns = ['lemma', 'pos', 'date', 'grouping', 'identifier', 'description',
                'context', 'indexes_target_token', 'indexes_target_sentence']
judgment_columns = ['identifier1', 'identifier2', 'annotator', 'judgment', 'comment', 'lemma']

# `assign` returns a new frame, so `df_uses` itself is left untouched.
# The four added columns are written out as empty strings.
durel_uses = (
    df_uses
    .assign(description="", pos="", date="", grouping="")
    .rename(columns={'dataID': 'identifier',
                     'indices_target_sentence': 'indexes_target_sentence',
                     'indices_target_token': 'indexes_target_token'})
    .loc[:, uses_columns]
)

for lemma in durel_uses['lemma'].unique():
    lemma_uses = durel_uses[durel_uses['lemma'] == lemma]

    # The directory name is the parenthesised part of the lemma's first
    # identifier, with ':' replaced by a space.
    # NOTE(review): presumably ':' is replaced because it is not a safe
    # character in directory names on every platform - confirm.
    first_identifier = lemma_uses.iloc[0]['identifier']
    # Raw string: '\(' is not a valid escape sequence in a normal literal.
    match = re.search(r'\(.*\)', first_identifier)
    if match is None:
        raise ValueError(
            f'identifier {first_identifier!r} of lemma {lemma!r} has no parenthesised part'
        )
    id_quote = match.group(0).replace(':', ' ')

    out_dir = durel_root / id_quote
    out_dir.mkdir(parents=True, exist_ok=True)
    lemma_uses.to_csv(out_dir / 'uses.tsv', index=False, sep='\t')

    # Judgments of the same lemma, renamed to the exported column names.
    durel_judgments = (
        df[df['lemma'] == lemma]
        .rename(columns={'dataID1': 'identifier1', 'dataID2': 'identifier2', 'label': 'judgment'})
        .loc[:, judgment_columns]
    )
    durel_judgments.to_csv(out_dir / 'judgments.tsv', index=False, sep='\t')