In [1]:
from csv import DictReader
import json

In [2]:
def read_csv_file(filename):
    with open(filename) as csvfile:
        reader = DictReader(csvfile, delimiter=",")
        data = list(reader)
        return data 

In [3]:
# read community postedits
comm_pe = read_csv_file("community-2023-10-03.csv")
# read translation postedits
trans_pe = read_csv_file("translators-2023-10-03.csv")
users = read_csv_file("users-2023-10-19.csv")

In [4]:
# some basic stats
print('# community postedits = ', len(comm_pe))
print('# translator postedits = ', len(trans_pe))
print('# postedits (either community or translator)', len(comm_pe) + len(trans_pe))

# community postedits =  95
# translator postedits =  242
# postedits (either community or translator) 337


In [5]:
# store postedits in dictionary {idhal : {trans_sys_id: {'translators': [], 'community': []}}
idhal2pe = {}
idhal2pe_trans = {}
idhal2pe_comm = {}

comm_users = []
trans_users = []

for pe in trans_pe:
    if pe['id_hal'] not in idhal2pe:
        idhal2pe[pe['id_hal']] = {}
        idhal2pe_trans[pe['id_hal']] = {}
    if pe['trans_sys_id'] not in idhal2pe[pe['id_hal']]:
        idhal2pe[pe['id_hal']][pe['trans_sys_id']] = {'translators': [], 'community': []}
        idhal2pe_trans[pe['id_hal']][pe['trans_sys_id']] = {'translators': []}
    idhal2pe[pe['id_hal']][pe['trans_sys_id']]['translators'].append(pe)
    idhal2pe_trans[pe['id_hal']][pe['trans_sys_id']]['translators'].append(pe)
    
    if pe['user_id'] not in trans_users:
        trans_users.append(pe['user_id'])
    
for pe in comm_pe:
    if pe['id_hal'] not in idhal2pe:
        idhal2pe[pe['id_hal']] = {}
    if pe['id_hal'] not in idhal2pe_comm:
        idhal2pe_comm[pe['id_hal']] = {}
    if pe['trans_sys_id'] not in idhal2pe[pe['id_hal']]:
        idhal2pe[pe['id_hal']][pe['trans_sys_id']] = {'translators': [], 'community': []}
    if pe['trans_sys_id'] not in idhal2pe_comm[pe['id_hal']]:
        idhal2pe_comm[pe['id_hal']][pe['trans_sys_id']] = {'community': []}
    idhal2pe[pe['id_hal']][pe['trans_sys_id']]['community'].append(pe)
    idhal2pe_comm[pe['id_hal']][pe['trans_sys_id']]['community'].append(pe)
    
    if pe['user_id'] not in comm_users:
        comm_users.append(pe['user_id'])

    
transsys2num = {}    
# of the form (idhal, trans_sys_id)
intersec_comm_trans = []
pe_several_comm, pe_several_trans, pe_several_both = [], [], []
abs_several_comm, abs_several_trans, abs_several_both = [], [], []

# for each hal id
for idhal in idhal2pe:
    # are there several translations for this article?
    if len(idhal2pe[idhal]) > 1:
        abs_several_both.append((idhal, idhal2pe[idhal].keys()))
    pe_comm, pe_trans = [], []
    
    # for each different translation
    for trans_sys_id in idhal2pe[idhal]:
        if trans_sys_id not in transsys2num:
            transsys2num[trans_sys_id] = 0
        transsys2num[trans_sys_id] += len(idhal2pe[idhal][trans_sys_id]['community'])
        transsys2num[trans_sys_id] += len(idhal2pe[idhal][trans_sys_id]['translators'])
        if len(idhal2pe[idhal][trans_sys_id]['community']) > 0 and \
            len(idhal2pe[idhal][trans_sys_id]['translators']):
            intersec_comm_trans.append((idhal, trans_sys_id))
        if len(idhal2pe[idhal][trans_sys_id]['community'] + idhal2pe[idhal][trans_sys_id]['translators']) > 1:
            pe_several_both.append((idhal, trans_sys_id))
        if len(idhal2pe[idhal][trans_sys_id]['community']) > 1:
            pe_several_comm.append((idhal, trans_sys_id))
        if len(idhal2pe[idhal][trans_sys_id]['translators']) > 1:
            pe_several_trans.append((idhal, trans_sys_id))
        if len(idhal2pe[idhal][trans_sys_id]['translators']) > 0:
            pe_trans.append(trans_sys_id)
        if len(idhal2pe[idhal][trans_sys_id]['community']) > 0:
            pe_comm.append(trans_sys_id)
    if len(pe_trans) > 1:
        abs_several_trans.append((idhal, pe_trans))
    if len(pe_comm) > 1:
        abs_several_comm.append((idhal, pe_comm))

In [6]:
# number of unique abstracts postedited
print('# unique abstracts postedited = ', len(idhal2pe))
print('# unique abstracts postedited by the community = ', len(idhal2pe_comm))
print('# unique abstracts postedited by the translators = ', len(idhal2pe_trans))
print('# unique abstracts postedited by both the community and translators = ', len([idhal for idhal in idhal2pe_comm if idhal in idhal2pe_trans]), '\n')

# number of unique translations postedited (can be several translations for each abstract)
print('# unique translations postedited = ', sum([len(idhal2pe[idhal].values()) for idhal in idhal2pe]))
print('# unique translations postedited by the community = ', sum([len(idhal2pe_comm[idhal].values()) for idhal in idhal2pe_comm]))
print('# unique translations postedited by the translators = ', sum([len(idhal2pe_trans[idhal].values()) for idhal in idhal2pe_trans]), '\n')

# abstracts/translations w/ multiple postedits
print('# articles w/ several community translations postedited = ', len(abs_several_comm))
print('# articles w/ several translator translations postedited = ', len(abs_several_trans))
print('# articles w/ several translations postedited (either community or translator) = ', len(abs_several_both), '\n')

print('# translations postedited by both the community and translators = ', len(intersec_comm_trans))
print('# translations w/ several community postedits = ', len(pe_several_comm))
print('# translations w/ several translator postedits = ', len(pe_several_trans))
print('# translations w/ several postedits (either community or translator) = ', len(pe_several_both))

# unique abstracts postedited =  268
# unique abstracts postedited by the community =  73
# unique abstracts postedited by the translators =  240
# unique abstracts postedited by both the community and translators =  45 

# unique translations postedited =  305
# unique translations postedited by the community =  91
# unique translations postedited by the translators =  241 

# articles w/ several community translations postedited =  14
# articles w/ several translator translations postedited =  1
# articles w/ several translations postedited (either community or translator) =  33 

# translations postedited by both the community and translators =  27
# translations w/ several community postedits =  4
# translations w/ several translator postedits =  1
# translations w/ several postedits (either community or translator) =  30


In [7]:
# number of posteditions per translation system
for trans_sys_id in transsys2num:
    print(trans_sys_id, transsys2num[trans_sys_id])

3 124
6 106
4 107


In [8]:
users

[{'id': '1',
  'token': '33f00d5e4920579db7635af19fffc105577516d7',
  'native_langs': 'anglais',
  'other_langs': 'français (bilingue)',
  'experience': '3-10',
  'written_en': '1',
  'written_fr': '0',
  'mt_tools': '1',
  'utility_mt_hal': 'utile',
  'free_text': '',
  'accepted_conditions': '1',
  'expert': '1',
  'active': '1',
  'delete_history': 'NULL'},
 {'id': '2',
  'token': '37e35be62fd9ddfd390b46ae3b42b64d8ba6d79f',
  'native_langs': 'français',
  'other_langs': 'anglais (courant), allemand (moyen)',
  'experience': '<3',
  'written_en': '0',
  'written_fr': '0',
  'mt_tools': '0',
  'utility_mt_hal': 'utile',
  'free_text': '',
  'accepted_conditions': '1',
  'expert': '1',
  'active': '1',
  'delete_history': 'NULL'},
 {'id': '3',
  'token': '303c16920ce0170b8c234372884f681286f8ecbd',
  'native_langs': 'Roumain',
  'other_langs': 'Anglais (courant); Français (courant) ',
  'experience': '<3',
  'written_en': '0',
  'written_fr': '0',
  'mt_tools': '0',
  'utility_mt_hal': 

In [9]:
translators = {'num': 0, 'exp_nlp': {}, 'lang_mat': []}
community = {'num': 0, 'exp_nlp': {}, 'lang_mat': []}

for user in users:
    # has user done some postediting?
    if user["id"] in trans_users:
        translators['num'] += 1
        if user['experience'] not in translators['exp_nlp']:
            translators['exp_nlp'][user['experience']] = 0
        translators['exp_nlp'][user['experience']] += 1
        translators['lang_mat'].append(user['native_langs'].lower())
    elif user["id"] in comm_users:
        community['num'] += 1
        if user['experience'] not in community['exp_nlp']:
            community['exp_nlp'][user['experience']] = 0
        community['exp_nlp'][user['experience']] += 1
        community['lang_mat'].append(user['native_langs'].lower())
        
print(translators)
print(community)
print(community['lang_mat'].count("français"))
    

{'num': 4, 'exp_nlp': {'<3': 4}, 'lang_mat': ['français', 'roumain', 'français', 'français']}
{'num': 16, 'exp_nlp': {'3-10': 7, '10+': 4, '<3': 5}, 'lang_mat': ['anglais', 'français', 'chinois', 'français', 'français', 'français', 'français', 'français', 'français', 'français', 'français', 'français', 'kabyle', 'français', 'français', 'français']}
13
