# Usage

input : csv file with rows of [X, Y1, Y2, .. Yn] where
  X = erroneous text
  Yi = corrected text

output: csv file with header [src, rank1, rank2, .. rankn] where each row holds X followed by its Yi ordered from rank 1 (best) to rank n (worst)

In [None]:
!pip install bert-score

In [2]:
import csv
from bert_score import BERTScorer
import editdistance

In [3]:
def get_bert(src, target):
  """Return the negated BERTScore F1 between src and target.

  The score is negated so that, when used as an ascending sort key, the
  sentence with the higher F1 is ranked first.

  Args:
    src: the source (erroneous) sentence, passed as the candidate.
    target: the corrected sentence, passed as the reference.

  Returns:
    The negated F1 score as a plain Python float.
  """
  # Building a BERTScorer is costly, and this function is called once per
  # candidate while sorting, so create the scorer once and reuse it.
  scorer = getattr(get_bert, '_scorer', None)
  if scorer is None:
    scorer = BERTScorer(model_type='bert-base-uncased', lang='en')
    get_bert._scorer = scorer

  _, _, F1 = scorer.score([src], [target])

  # A single pair was scored, so F1 holds exactly one value; return it as a
  # float rather than a one-element tensor.
  return -F1.item()

In [4]:
# use Levenshtein distance
def get_levenshtein(src, target):
  """Return the edit distance between src and target.

  Used as an ascending sort key, so the target with the lowest edit
  distance from src is ranked first.
  """
  distance = editdistance.eval(src, target)
  return distance

# TODO: implement other notions of edit distance? E.g. tokenize and compare tokens, or count a contiguous run of edited indices as a single edit.

In [5]:
# Scoring functions that can be used to rank the corrections; the one at
# index 1 (Levenshtein distance) is the one selected for this run.
scoring_options = [get_bert, get_levenshtein]
scoring_fn = scoring_options[1]

# CSV that is read, and CSV that the ranked rows are written to.
input_csv = 'sample_translation.csv'
output_csv = 'ranked_translations.csv'

In [19]:
def rank_dataset(input_file, output_file, scoring_fn):
  """Rank the corrections in every row of a CSV and write the result.

  Each input row is [X, Y1, Y2, .. Yn]. The Yi of a row are sorted in
  ascending order of scoring_fn(X, Yi), so the lowest score becomes rank1.
  Ties keep their original left-to-right order.

  Args:
    input_file: path of the header-less CSV to read.
    output_file: path of the CSV to write. Its first line is the header
      ['src', 'rank1', .. 'rankn'], followed by one ranked row per input row.
    scoring_fn: callable taking (src, candidate) and returning a sortable
      score; lower scores rank higher.
  """
  # newline='' leaves line-ending handling to the csv module, so quoted
  # fields that contain newlines are parsed correctly.
  with open(input_file, 'r', newline='') as in_file:
    # Blank lines are returned as empty lists; they have no source text to
    # rank against, so drop them instead of failing on row[0].
    rows = [row for row in csv.reader(in_file) if row]

  # Size the header from the widest row so that every column is labelled
  # even when rows carry different numbers of corrections. An empty input
  # produces a header containing only 'src'.
  y_values = max((len(row) for row in rows), default=1) - 1

  ranked_rows = []
  for src, *translations in rows:
    ranked = sorted(translations, key=lambda y: scoring_fn(src, y))
    ranked_rows.append([src] + ranked)

  with open(output_file, 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    # write header
    header = ['src'] + [f'rank{i}' for i in range(1, y_values + 1)]
    writer.writerow(header)
    writer.writerows(ranked_rows)


In [20]:
rank_dataset(input_csv, output_csv, scoring_fn)