In [None]:
# @title Install Necessary Dependencies
!pip install evaluate sacrebleu==1.5.1

In [21]:
# @title Import All Necessary Dependencies
import evaluate
import sacrebleu
import os
import json
import pandas as pd

In [22]:
# @title Filtering and Removing Empty Translations
# Number of entries removed for each file is outputted in the notebook
removed_entries_count = dict()
name_indexed_files = dict()
file_translation_types = dict() #For accessing the translations from dictionaries in the future immidiately
DIR_PATH = '/content/reasoning_data'
all_files = os.listdir(DIR_PATH)
translation_types = [
    'teacher-Synthesized-CoT-translation',
    'teacher-CoT-translation',
    'direct_translation',
    'self-CoT-translation'
]
for file_name in all_files:
  all_file_lines = []
  removed = 0
  with open(DIR_PATH + '/' + file_name, 'r') as f:
    translation_type = None
    for line in f:
      curr_dict = json.loads(line.strip())
      if not translation_type:
        for t_type in translation_types:
          if t_type in curr_dict:
            translation_type = t_type
            break
      if curr_dict[translation_type] == '':
        removed += 1
      else:
        all_file_lines.append(curr_dict)
    file_translation_types[file_name] = translation_type
  name_indexed_files[file_name] = all_file_lines
  removed_entries_count[file_name] = removed
print('Empty entries per File:')
for file_key in removed_entries_count:
  print(f'{file_key} : {removed_entries_count[file_key]} removed entries')


Empty entries per File:
student_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B__Self-CoT__NA_fr-en.jsonl : 25 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Direct__NA_en-es.jsonl : 4 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Teacher-CoT__Qwen_Qwen3-32B_fr-en.jsonl : 27 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Teacher-CoT__Qwen_Qwen3-32B_fr-en.jsonl : 26 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B__Direct__NA_es-en.jsonl : 24 removed entries
student_Qwen_Qwen3-8B__Teacher-CoT__Qwen_Qwen3-32B_es-en.jsonl : 5 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Qwen-7B__Direct__NA_fr-en.jsonl : 6 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Direct__NA_fr-en.jsonl : 4 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-Llama-8B__Direct__NA_es-en.jsonl : 1 removed entries
student_Qwen_Qwen3-8B__Self-CoT__NA_es-en.jsonl : 50 removed entries
student_deepseek-ai_DeepSeek-R1-Distill-

In [23]:
# @title Calculating and Aggregating BLEU Scores
# Aggregating sum and count of BLEU scores to calculate average for final reports
# Results for all model language pairs are stored in BLEU_scores.csv
# Pandas Dataframe for result is also outputted in the notebook
# Any 0's in the final CSV indicate that there were no available values to calculate that score (they were likely filtered for having empty translations)
bleu = evaluate.load('sacrebleu')
combined_performances = dict()
for file_name in name_indexed_files:
  for json_dict in name_indexed_files[file_name]:
    translation_type = file_translation_types[file_name]
    model, lp = json_dict['model'], json_dict['lp']
    predictions, references = [json_dict[translation_type]], [[json_dict['reference'], json_dict['reference2']]]
    BLEU_score = bleu.compute(predictions=predictions, references=references)['score']
    if (lp, model) not in combined_performances:
      combined_performances[(lp, model)] = {i:(0, 0) for i in translation_types if i != 'teacher-Synthesized-CoT-translation'}
    if translation_type == 'teacher-Synthesized-CoT-translation':
      prev_entry = combined_performances[(lp, model)]['teacher-CoT-translation']
      combined_performances[(lp, model)]['teacher-CoT-translation'] = (prev_entry[0] + BLEU_score, prev_entry[1] + 1)
    else:
      prev_entry = combined_performances[(lp, model)][translation_type]
      combined_performances[(lp, model)][translation_type] = (prev_entry[0] + BLEU_score, prev_entry[1] + 1)

# Creating pandas dataframe to neatly display results
df_template = {
    'Language Pair': [],
    'Model': [],
    'Average BLEU Teacher-CoT-Translation' : [],
    'Average BLEU Direct Translation' : [],
    'Average BLEU Self-CoT-Translation' : []
}
template_mappings = {
    'teacher-CoT-translation' : 'Average BLEU Teacher-CoT-Translation',
    'direct_translation' : 'Average BLEU Direct Translation',
    'self-CoT-translation' : 'Average BLEU Self-CoT-Translation'
}
for lp, model in combined_performances:
  df_template['Language Pair'].append(lp)
  df_template['Model'].append(model)
  perf_agg = combined_performances[(lp, model)]
  for display_type in perf_agg:
    total, count = perf_agg[display_type]
    df_template[template_mappings[display_type]].append(total/count if count != 0 else 0)
df = pd.DataFrame(df_template)
df.to_csv('BLEU_scores.csv', index=False)
df

Unnamed: 0,Language Pair,Model,Average BLEU Teacher-CoT-Translation,Average BLEU Direct Translation,Average BLEU Self-CoT-Translation
0,fr-en,deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,35.472234,25.224905,20.119276
1,en-es,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,3.681246,2.888469,2.436113
2,fr-en,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,31.248174,6.843925,7.740628
3,fr-en,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30.325791,29.365837,32.992425
4,es-en,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30.843813,3.412208,9.944891
5,es-en,Qwen/Qwen3-8B,36.396418,35.370538,0.0
6,es-en,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,40.377218,28.663043,31.549781
7,es-en,deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,36.406743,23.514389,22.747862
8,en-es,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,5.245474,4.183732,4.765791
9,en-es,deepseek-ai/DeepSeek-R1-Distill-Qwen-7B,3.259162,3.954317,5.424789


Based on the BLEU scores for each methodology (across all language pairs and models), the best method is the Teacher Chain-of-Thought Translation method. For each language pair and model, the Teacher-CoT-Translation method performed best in all but one instance (with an average BLEU score of 24.285). This makes sense from a logical perspective as well, since in Teacher-CoT-Translation, the smaller model learns from the larger, more accurate model, and isn't self-reliant. This method struggled (similar to every other method) when translating from English into Spanish. The accompanying CSV file (BLEU_scores.csv) reports how each method performed (on average) across each language pair and model, so individual performances can also be seen.