In [1]:
!pip install matplotlib pandas seaborn



In [2]:
import os
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob

In [3]:
def parse_log_file(file_path):
    """Parse a single log file and extract metrics for each language."""
    with open(file_path, 'r') as f:
        content = f.read()
    
    # Extract model name from filename
    model_name = os.path.basename(file_path).split('.')[0]
    
    # Extract metrics for each language
    pattern = r"Metrics for the language (\w+): STA ([\d.]+) SIM ([\d.]+) CHRF ([\d.]+) J ([\d.]+) XCOMET ([\d.]+)"
    matches = re.findall(pattern, content)
    
    # Extract average metrics
    avg_j_pattern = r"Average all J: ([\d.]+)"
    avg_j = float(re.search(avg_j_pattern, content).group(1)) if re.search(avg_j_pattern, content) else None

    avg_pj_pattern = r"Average_p J: ([\d.]+)"
    avg_pj = float(re.search(avg_pj_pattern, content).group(1)) if re.search(avg_pj_pattern, content) else None    

    avg_npj_pattern = r"Average_np J: ([\d.]+)"
    avg_npj = float(re.search(avg_npj_pattern, content).group(1)) if re.search(avg_npj_pattern, content) else None    
    
    avg_xcomet_pattern = r"Average XCOMET: ([\d.]+)"
    avg_xcomet = float(re.search(avg_xcomet_pattern, content).group(1)) if re.search(avg_xcomet_pattern, content) else None
    
    results = []
    for match in matches:
        lang, sta, sim, chrf, j, xcomet = match
        results.append({
            'model': model_name,
            'language': lang,
            'STA': float(sta),
            'SIM': float(sim),
            'CHRF': float(chrf),
            'J': float(j),
            'XCOMET': float(xcomet)
        })
    
    # Add average metrics as a separate entry
    if avg_j is not None and avg_xcomet is not None:
        results.append({
            'model': model_name,
            'language': 'average',
            'STA': None,
            'SIM': None,
            'CHRF': None,
            'J': avg_j,
            'Average_p J': avg_pj,
            'Average_np J': avg_npj,
            'XCOMET': avg_xcomet
        })
    
    return results

In [4]:
file_path = "/home/alexander/Desktop/submit_results/en_comms_2000.txt"
results = parse_log_file(file_path)
results

[{'model': 'en_comms_2000',
  'language': 'am',
  'STA': 0.802,
  'SIM': 0.366,
  'CHRF': 0.17,
  'J': 0.134,
  'XCOMET': 0.483},
 {'model': 'en_comms_2000',
  'language': 'ar',
  'STA': 0.893,
  'SIM': 0.461,
  'CHRF': 0.107,
  'J': 0.236,
  'XCOMET': 0.556},
 {'model': 'en_comms_2000',
  'language': 'de',
  'STA': 0.805,
  'SIM': 0.662,
  'CHRF': 0.329,
  'J': 0.352,
  'XCOMET': 0.679},
 {'model': 'en_comms_2000',
  'language': 'en',
  'STA': 0.921,
  'SIM': 0.819,
  'CHRF': 0.689,
  'J': 0.654,
  'XCOMET': 0.847},
 {'model': 'en_comms_2000',
  'language': 'es',
  'STA': 0.82,
  'SIM': 0.656,
  'CHRF': 0.299,
  'J': 0.376,
  'XCOMET': 0.689},
 {'model': 'en_comms_2000',
  'language': 'fr',
  'STA': 0.822,
  'SIM': 0.688,
  'CHRF': 0.341,
  'J': 0.391,
  'XCOMET': 0.693},
 {'model': 'en_comms_2000',
  'language': 'he',
  'STA': 0.821,
  'SIM': 0.45,
  'CHRF': 0.099,
  'J': 0.192,
  'XCOMET': 0.522},
 {'model': 'en_comms_2000',
  'language': 'hi',
  'STA': 0.771,
  'SIM': 0.599,
  'CHR

In [5]:
def analyze_experiment_results(log_dir):
    """Analyze all experiment log files in the given directory."""
    log_files = glob(os.path.join(log_dir, "*.txt"))
    
    if not log_files:
        print(f"No log files found in {log_dir}")
        return None
    
    all_results = []
    for file_path in log_files:
        all_results.extend(parse_log_file(file_path))
    
    # Convert to DataFrame
    df = pd.DataFrame(all_results)
    return df

In [6]:
results_folder = "/home/alexander/Desktop/submit_results"
all_results = analyze_experiment_results(results_folder)
all_results

Unnamed: 0,model,language,STA,SIM,CHRF,J,XCOMET,Average_p J,Average_np J
0,en_comms_1000,am,0.913,0.216,0.042,0.079,0.403,,
1,en_comms_1000,ar,0.944,0.376,0.024,0.200,0.523,,
2,en_comms_1000,de,0.888,0.597,0.230,0.345,0.634,,
3,en_comms_1000,en,0.921,0.819,0.679,0.655,0.849,,
4,en_comms_1000,es,0.865,0.607,0.221,0.348,0.637,,
...,...,...,...,...,...,...,...,...,...
491,gemma_4b_2992_all_data_cleaned_final_lora,ru,0.917,0.837,0.673,0.692,0.888,,
492,gemma_4b_2992_all_data_cleaned_final_lora,tt,0.642,0.841,0.724,0.456,0.828,,
493,gemma_4b_2992_all_data_cleaned_final_lora,uk,0.915,0.857,0.700,0.700,0.883,,
494,gemma_4b_2992_all_data_cleaned_final_lora,zh,0.741,0.798,0.455,0.489,0.824,,


In [7]:
def find_top_models(df, metrics=None):
    """Find top models for each language and metric."""
    if metrics is None:
        metrics = ['STA', 'SIM', 'CHRF', 'J', 'XCOMET']
    
    top_models = {}
    
    # For each language and metric, find the top model
    for language in df['language'].unique():
        top_models[language] = {}
        lang_df = df[df['language'] == language]
        
        for metric in metrics:
            if metric in lang_df.columns and not lang_df[metric].isna().all():
                top_model = lang_df.loc[lang_df[metric].idxmax()]
                top_models[language][metric] = {
                    'model': top_model['model'],
                    'value': top_model[metric]
                }
    
    return top_models

In [8]:
find_top_models(all_results)

{'am': {'STA': {'model': 'en_comms_1000', 'value': np.float64(0.913)},
  'SIM': {'model': 'gemma_4b_lora_paradetox', 'value': np.float64(0.81)},
  'CHRF': {'model': 'baseline_delete_dev', 'value': np.float64(0.487)},
  'J': {'model': 'gemma_4b_lora_paradetox', 'value': np.float64(0.461)},
  'XCOMET': {'model': 'gemma_4b_lora_paradetox', 'value': np.float64(0.76)}},
 'ar': {'STA': {'model': 'en_comms_1000', 'value': np.float64(0.944)},
  'SIM': {'model': 'gemma_4b_lora_paradetox', 'value': np.float64(0.926)},
  'CHRF': {'model': 'sorted_qwen2_tr_vanilla_paradetox_en_prompt_450_it__lr_2e-6',
   'value': np.float64(0.786)},
  'J': {'model': 'filter-88', 'value': np.float64(0.668)},
  'XCOMET': {'model': 'gemma_4b_lora_paradetox', 'value': np.float64(0.89)}},
 'de': {'STA': {'model': 'sorted_qwen2_tr_vanilla_paradetox_multi_prompt_338_it__lr_2e-6',
   'value': np.float64(0.891)},
  'SIM': {'model': 'gemma_4b_lora_paradetox', 'value': np.float64(0.946)},
  'CHRF': {'model': 'sorted_qwen2_tr

In [9]:
def plot_metrics_by_language(df, metric, output_dir=None):
    """Create a bar plot for a specific metric across languages and models."""
    plt.figure(figsize=(14, 8))
    
    # Filter out rows where the metric is NaN
    filtered_df = df.dropna(subset=[metric])
    
    # Create the plot
    ax = sns.barplot(x='language', y=metric, hue='model', data=filtered_df)
    
    plt.title(f'{metric} Scores by Language and Model')
    plt.xlabel('Language')
    plt.ylabel(f'{metric} Score')
    plt.xticks(rotation=45)
    plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(os.path.join(output_dir, f'{metric}_by_language.png'))
    else:
        plt.show()
    
    plt.close()


def plot_average_metrics(df, output_dir=None):
    """Create a bar plot for average metrics across models."""
    avg_df = df[df['language'] == 'average']
    
    if avg_df.empty:
        print("No average metrics found in the data")
        return
    
    # Melt the dataframe to get metrics in a single column
    melted_df = pd.melt(
        avg_df, 
        id_vars=['model'], 
        value_vars=['J', 'XCOMET'],
        var_name='metric', 
        value_name='score'
    )
    
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x='model', y='score', hue='metric', data=melted_df)
    
    plt.title('Average Metrics by Model')
    plt.xlabel('Model')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    plt.legend(title='Metric')
    plt.tight_layout()
    
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(os.path.join(output_dir, 'average_metrics.png'))
    else:
        plt.show()
    
    plt.close()


def plot_heatmap(df, metric, output_dir=None):
    """Create a heatmap of a specific metric for all models and languages."""
    # Pivot the dataframe to get models as rows and languages as columns
    pivot_df = df.pivot_table(index='model', columns='language', values=metric)
    
    plt.figure(figsize=(16, 10))
    ax = sns.heatmap(pivot_df, annot=True, fmt=".3f", cmap="YlGnBu", linewidths=.5)
    
    plt.title(f'{metric} Scores Heatmap')
    plt.tight_layout()
    
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(os.path.join(output_dir, f'{metric}_heatmap.png'))
    else:
        plt.show()
    
    plt.close()

In [10]:
log_dir = "/home/alexander/Desktop/submit_results"
output_dir = "."

# Analyze results
results_df = analyze_experiment_results(log_dir)

# Find top models for each language and metric
top_models = find_top_models(results_df)

# Print top models for each language and metric
print("Top Models by Language and Metric:")
for language, metrics in top_models.items():
    print(f"\nLanguage: {language}")
    for metric, model_info in metrics.items():
        print(f"  {metric}: {model_info['model']} ({model_info['value']:.3f})")

# Plot metrics by language
for metric in ['STA', 'SIM', 'CHRF', 'J', 'XCOMET']:
    plot_metrics_by_language(results_df, metric, output_dir)

# Plot average metrics
plot_average_metrics(results_df, output_dir)

# Create heatmaps for J and XCOMET metrics
plot_heatmap(results_df, 'J', output_dir)
plot_heatmap(results_df, 'XCOMET', output_dir)

Top Models by Language and Metric:

Language: am
  STA: en_comms_1000 (0.913)
  SIM: gemma_4b_lora_paradetox (0.810)
  CHRF: baseline_delete_dev (0.487)
  J: gemma_4b_lora_paradetox (0.461)
  XCOMET: gemma_4b_lora_paradetox (0.760)

Language: ar
  STA: en_comms_1000 (0.944)
  SIM: gemma_4b_lora_paradetox (0.926)
  CHRF: sorted_qwen2_tr_vanilla_paradetox_en_prompt_450_it__lr_2e-6 (0.786)
  J: filter-88 (0.668)
  XCOMET: gemma_4b_lora_paradetox (0.890)

Language: de
  STA: sorted_qwen2_tr_vanilla_paradetox_multi_prompt_338_it__lr_2e-6 (0.891)
  SIM: gemma_4b_lora_paradetox (0.946)
  CHRF: sorted_qwen2_tr_vanilla_paradetox_multi_prompt_450_it__lr_2e-6 (0.819)
  J: sorted_qwen2_tr_vanilla_paradetox_multi_prompt_338_it__lr_2e-6 (0.754)
  XCOMET: gemma_4b_lora_paradetox (0.959)

Language: en
  STA: sorted_qwen2_tr_vanilla_paradetox_multi_prompt_338_it__lr_2e-6 (0.930)
  SIM: gemma_4b_lora_paradetox (0.892)
  CHRF: sorted_qwen2_tr_vanilla_paradetox_multi_prompt_450_it__lr_2e-6 (0.730)
  J: ge

In [11]:
log_dir = "/home/alexander/Desktop/submit_results"
tsv_dir = "/home/alexander/Desktop/submit_results"

In [12]:
# Step 1: Parse log files to get best model per language by J
results = []
log_files = glob(os.path.join(log_dir, "*.txt"))
for log_path in log_files:
    model_name = os.path.splitext(os.path.basename(log_path))[0]
    with open(log_path, encoding='utf-8') as f:
        content = f.read()
    for m in re.findall(r"Metrics for the language (\w+):.*?J ([\d.]+)", content):
        lang, j = m
        results.append({'model': model_name, 'language': lang, 'J': float(j)})

df = pd.DataFrame(results)
if df.empty:
    raise Exception("No results found in log files.")

best_models = df.loc[df.groupby('language')['J'].idxmax()].reset_index(drop=True)

# Step 2: Create a mapping of language to best model
lang_to_best_model = dict(zip(best_models['language'], best_models['model']))

# Step 3: Find a reference .tsv file to determine the correct order
# We'll use the first .tsv file as a reference for order
tsv_files = glob(os.path.join(tsv_dir, "*.tsv"))
if not tsv_files:
    raise Exception("No .tsv files found.")

# Load the reference file to get the order
reference_file = tsv_files[0]
reference_df = pd.read_csv(reference_file, sep='\t')

# Create an order mapping: language -> list of rows in order
language_order = {}
for i, row in reference_df.iterrows():
    lang = row['lang']
    if lang not in language_order:
        language_order[lang] = []
    language_order[lang].append(i)

# Step 4: Load all .tsv files into a dictionary keyed by model name
model_tsvs = {}
for tsv_path in tsv_files:
    model_name = os.path.splitext(os.path.basename(tsv_path))[0]
    model_tsvs[model_name] = pd.read_csv(tsv_path, sep='\t')

# Step 5: Create the final submission by selecting rows from the best model for each language
# while preserving the original order
final_rows = []

# Process each language in the order it appears in the reference file
processed_langs = set()
for i, row in reference_df.iterrows():
    lang = row['lang']
    
    # Skip if we've already processed this language
    if lang in processed_langs:
        continue
    
    # Get the best model for this language
    best_model = lang_to_best_model.get(lang)
    if not best_model:
        print(f"Warning: No best model found for language {lang}")
        continue
    
    # Get the .tsv for the best model
    if best_model not in model_tsvs:
        print(f"Warning: No .tsv file for model {best_model}")
        continue
    
    # Get all rows for this language from the best model
    lang_df = model_tsvs[best_model][model_tsvs[best_model]['lang'] == lang]
    if lang_df.empty:
        print(f"Warning: No rows for language {lang} in model {best_model}'s .tsv")
        continue
    
    # Add to final rows
    final_rows.append(lang_df)
    processed_langs.add(lang)

# Step 6: Concatenate and save the submission
if final_rows:
    submission_df = pd.concat(final_rows, ignore_index=True)
    submission_df.to_csv("best_possible_submit.tsv", sep='\t', index=False)
    print(f"Submission file 'best_possible_submit.tsv' created with {len(submission_df)} rows.")
    
    # Print statistics about languages in the submission
    lang_counts = submission_df['lang'].value_counts()
    print("\nLanguage distribution in submission:")
    for lang, count in lang_counts.items():
        print(f"{lang}: {count} rows")
else:
    print("No data to submit.")


Submission file 'best_possible_submit.tsv' created with 9000 rows.

Language distribution in submission:
uk: 600 rows
hi: 600 rows
zh: 600 rows
ar: 600 rows
de: 600 rows
en: 600 rows
ru: 600 rows
am: 600 rows
es: 600 rows
it: 600 rows
fr: 600 rows
he: 600 rows
hin: 600 rows
tt: 600 rows
ja: 600 rows
