In [14]:
import os
import numpy as np

In [31]:
def get_word_stats(lines):
    '''
    Count the whitespace-separated words in a list of lines.

    Parameters:
        lines: a sized sequence of strings (one string per line).

    Returns:
        A tuple (total_words, avg_words_per_line), where a "word" is any
        token produced by str.split(). Blank lines contribute zero words but
        still count as lines in the average. For an empty sequence the result
        is (0, 0) instead of a ZeroDivisionError.
    '''
    total_lines = len(lines)
    # No lines means there is no meaningful average; report zeros.
    if total_lines == 0:
        return 0, 0
    total_words = sum(len(line.split()) for line in lines)
    return total_words, total_words / total_lines

def get_word_stats_tsv(lines, column=1):
    '''
    Count the whitespace-separated words in one column of tab-separated lines.

    Parameters:
        lines: a sized sequence of strings, each a tab-separated record.
        column: zero-based index of the tab-separated field whose words are
            counted. Defaults to 1 (the second field).

    Returns:
        A tuple (total_words, avg_words_per_line). Every line counts toward
        the average, including lines that lack the requested column. For an
        empty sequence the result is (0, 0) instead of a ZeroDivisionError.
    '''
    total_lines = len(lines)
    # No lines means there is no meaningful average; report zeros.
    if total_lines == 0:
        return 0, 0

    total_words = 0
    for line in lines:
        fields = line.split('\t')
        # A line without the requested column (e.g. a blank trailing line)
        # contributes zero words rather than raising IndexError.
        if column < len(fields):
            total_words += len(fields[column].split())
    return total_words, total_words / total_lines

def _read_lines(path):
    '''Return all lines of the text file at `path`, decoded as UTF-8.'''
    # Explicit encoding: with a platform default such as cp1252, UTF-8 bytes
    # can fail to decode or turn into characters that str.split() treats as
    # whitespace, which would skew the word counts.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        return f.readlines()


def get_summary_stats(top_path):
    '''
    Given a path to the top level directory of a language pair, provide a
    dictionary with summary statistics.

    Files read under `top_path`:
    - train.es and dev.es (required; a missing file raises FileNotFoundError)
    - synthetic.tsv and extra.tsv (optional; all of their statistics are 0
      when the file does not exist)

    Keys of the returned dictionary:
    - train_lines, dev_lines, synthetic_lines, extra_lines: number of lines
    - train_words, train_avg_len: total words / average words per line in
      train.es
    - val_words, val_avg_len: the same for dev.es
    - synthetic_words, synthetic_avg_len: the same for the second
      tab-separated column of synthetic.tsv
    - extra_words, extra_avg_len: the same for the second tab-separated
      column of extra.tsv
    '''
    spanish_train_path = os.path.join(top_path, 'train.es')
    spanish_dev_path = os.path.join(top_path, 'dev.es')
    synthetic_path = os.path.join(top_path, 'synthetic.tsv')
    extra_path = os.path.join(top_path, 'extra.tsv')

    # Training and dev files are mandatory for every language pair.
    train_lines = _read_lines(spanish_train_path)
    dev_lines = _read_lines(spanish_dev_path)

    # Defaults used when the optional TSV files are absent.
    synthetic_lines, synthetic_words, synthetic_avg_len = [], 0, 0
    extra_lines, extra_words, extra_avg_len = [], 0, 0

    if os.path.exists(synthetic_path):
        synthetic_lines = _read_lines(synthetic_path)
        synthetic_words, synthetic_avg_len = get_word_stats_tsv(synthetic_lines)

    if os.path.exists(extra_path):
        extra_lines = _read_lines(extra_path)
        extra_words, extra_avg_len = get_word_stats_tsv(extra_lines)

    # Word statistics for the plain-text splits.
    train_words, train_avg_len = get_word_stats(train_lines)
    val_words, val_avg_len = get_word_stats(dev_lines)

    return {
        'train_lines': len(train_lines),
        'dev_lines': len(dev_lines),
        'synthetic_lines': len(synthetic_lines),
        'extra_lines': len(extra_lines),
        'train_words': train_words,
        'train_avg_len': train_avg_len,
        'val_words': val_words,
        'val_avg_len': val_avg_len,
        'synthetic_words': synthetic_words,
        'synthetic_avg_len': synthetic_avg_len,
        'extra_words': extra_words,
        'extra_avg_len': extra_avg_len
    }

# Gather simple summary statistics

In [32]:
# Root directory that holds one `<language>-spanish` folder per language pair.
DATA_ROOT = '../americasnlp2024/ST1_MachineTranslation/data'


def _summarize_language(label, dirname):
    '''
    Compute and print the summary statistics for one language pair.

    Parameters:
        label: display name of the language, used in the printed line.
        dirname: name of the language pair's folder under DATA_ROOT.

    Returns:
        A tuple (top_path, stats): the folder path that was summarized and
        the dictionary returned by get_summary_stats for it.
    '''
    top_path = DATA_ROOT + '/' + dirname
    stats = get_summary_stats(top_path)
    print("Spanish - " + label + ":\t", stats)
    return top_path, stats


# The per-language variable names are kept because later cells read them.

# ashaninka
ashaninka_top_path, ashaninka_stats = _summarize_language('Ashaninka', 'ashaninka-spanish')

# aymara (the printed label previously misspelled this as "Amayra")
aymara_top_path, aymara_stats = _summarize_language('Aymara', 'aymara-spanish')

# bribri
bribri_top_path, bribri_stats = _summarize_language('Bribri', 'bribri-spanish')

# chatino
chatino_top_path, chatino_stats = _summarize_language('Chatino', 'chatino-spanish')

# guarani
guarani_top_path, guarani_stats = _summarize_language('Guarani', 'guarani-spanish')

# hñähñu
hnahu_top_path, hnahu_stats = _summarize_language('Hñähñu', 'hñähñu-spanish')

# nahuatl
nahuatl_top_path, nahuatl_stats = _summarize_language('Nahuatl', 'nahuatl-spanish')

# quechua
quechua_top_path, quechua_stats = _summarize_language('Quechua', 'quechua-spanish')

# raramuri
raramuri_top_path, raramuri_stats = _summarize_language('Raramuri', 'raramuri-spanish')

# shipibo_konibo
shipibo_konibo_top_path, shipibo_konibo_stats = _summarize_language('Shipibo Konibo', 'shipibo_konibo-spanish')

# wixarika
wixarika_top_path, wixarika_stats = _summarize_language('Wixarika', 'wixarika-spanish')

Spanish - Ashaninka:	 {'train_lines': 3883, 'dev_lines': 883, 'synthetic_lines': 13195, 'extra_lines': 0, 'train_words': 48752, 'train_avg_len': 12.555240793201133, 'val_words': 9605, 'val_avg_len': 10.877689694224236, 'synthetic_words': 179032, 'synthetic_avg_len': 13.568169761273209, 'extra_words': 0, 'extra_avg_len': 0}
Spanish - Amayra:	 {'train_lines': 6531, 'dev_lines': 996, 'synthetic_lines': 16750, 'extra_lines': 24331, 'train_words': 128154, 'train_avg_len': 19.622416169039962, 'val_words': 11129, 'val_avg_len': 11.173694779116467, 'synthetic_words': 233832, 'synthetic_avg_len': 13.960119402985075, 'extra_words': 521865, 'extra_avg_len': 21.4485635608894}
Spanish - Bribri:	 {'train_lines': 7508, 'dev_lines': 996, 'synthetic_lines': 0, 'extra_lines': 0, 'train_words': 46820, 'train_avg_len': 6.236014917421417, 'val_words': 11129, 'val_avg_len': 11.173694779116467, 'synthetic_words': 0, 'synthetic_avg_len': 0, 'extra_words': 0, 'extra_avg_len': 0}
Spanish - Chatino:	 {'train_lin

In [33]:
# Create a table with the summary statistics for all language pairs and
# write it out as a TSV file.

# (column header, key in the stats dictionary) for every column that follows
# the language-pair label, in output order.
stat_columns = [
    ('Train Lines', 'train_lines'),
    ('Dev Lines', 'dev_lines'),
    ('Synthetic Lines', 'synthetic_lines'),
    ('Extra Lines', 'extra_lines'),
    ('Train Words', 'train_words'),
    ('Train Avg Len', 'train_avg_len'),
    ('Val Words', 'val_words'),
    ('Val Avg Len', 'val_avg_len'),
    ('Synthetic Words', 'synthetic_words'),
    ('Synthetic Avg Len', 'synthetic_avg_len'),
    ('Extra Words', 'extra_words'),
    ('Extra Avg Len', 'extra_avg_len'),
]

# (row label, stats dictionary) for every language pair, in output order.
language_rows = [
    ('Spanish - Ashaninka', ashaninka_stats),
    ('Spanish - Aymara', aymara_stats),
    ('Spanish - Bribri', bribri_stats),
    ('Spanish - Chatino', chatino_stats),
    ('Spanish - Guarani', guarani_stats),
    ('Spanish - Hñähñu', hnahu_stats),
    ('Spanish - Nahuatl', nahuatl_stats),
    ('Spanish - Quechua', quechua_stats),
    ('Spanish - Raramuri', raramuri_stats),
    ('Spanish - Shipibo Konibo', shipibo_konibo_stats),
    ('Spanish - Wixarika', wixarika_stats),
]

# First row is the header; each following row is one language pair.
table = [['Language Pair'] + [header for header, key in stat_columns]]
for row_label, row_stats in language_rows:
    table.append([row_label] + [row_stats[key] for header, key in stat_columns])

# Write table to file. The encoding is explicit because the labels contain
# non-ASCII characters (e.g. "Hñähñu") and the platform default may not be UTF-8.
with open('summary_stats.tsv', 'w', encoding='utf-8') as f:
    for row in table:
        f.write('\t'.join(str(x) for x in row) + '\n')