In [1]:
import pandas as pd
import os
import json
import pickle

# Quantify the number of words that are OOV (without repetitions, just single words)

In [2]:
def print_oov_ratio_dataset(MODEL, DATASET, MASKING='_OriginalPLL'):
    print(f'{MODEL} | {DATASET}')
    """
    This function prints and returns the ratio of OOV words relative to the number of words
    in the dataset (WITHOUT duplicates, i.e., disregarding the fact that words occur several times in a dataset)
    """
    # Load likelihood scores
    if DATASET != 'Brown':
        likelihoods_df = pd.read_csv(f'results/unigram_likelihoods/{MODEL}/{DATASET}{MASKING}.csv')
        likelihoods_df = likelihoods_df.rename(columns={
            "word" : "Word"
        })
    else:
        likelihood_frames = []
        for chunk_ind in range(10):
            # Load likelihood scores
            likelihoods = pd.read_csv(f'results/unigram_likelihoods/{MODEL}/{DATASET}{MASKING}_chunk={chunk_ind}.csv')
            likelihood_frames.append(likelihoods)
        likelihoods_df = pd.concat(likelihood_frames)
        likelihoods_df = likelihoods_df.rename(columns={
            "word" : "Word"
        })
    
    single_token_df = likelihoods_df.loc[likelihoods_df['nr. of tokens'] == 1]
    in_vocab_ratio = len(single_token_df.index)/len(likelihoods_df.index)
    print(f'Number of words in model vocab: {len(single_token_df.index)}')
    print(f'Number of words: {len(likelihoods_df.index)}')
    print(f'OOV ratio: {100 - round(in_vocab_ratio*100,2)}%\n')
    return single_token_df

In [3]:
MODEL='bert-base-cased'
print_oov_ratio_dataset(MODEL=MODEL, DATASET='EventsAdapt')
print_oov_ratio_dataset(MODEL=MODEL, DATASET='LibriSpeech')
print_oov_ratio_dataset(MODEL=MODEL, DATASET='Brown')

bert-base-cased | EventsAdapt
Number of words in model vocab: 727
Number of words: 1204
OOV ratio: 39.62%

bert-base-cased | LibriSpeech
Number of words in model vocab: 5170
Number of words: 8138
OOV ratio: 36.47%

bert-base-cased | Brown
Number of words in model vocab: 10533
Number of words: 23470
OOV ratio: 55.12%



Unnamed: 0,Word,nr. of tokens,tokens,word score
0,a,1,a,-6.896328
4,abandon,1,abandon,-13.637449
5,abandoned,1,abandoned,-12.547158
6,abandoning,1,abandoning,-15.524755
7,abbey,1,abbey,-14.247076
...,...,...,...,...
2338,zipper,1,zipper,-13.062677
2339,zombies,1,zombies,-13.507646
2340,zone,1,zone,-13.064455
2342,zoo,1,zoo,-13.722513


# Quantify the number of words that are OOV (with repetitions, i.e. accounting for multiple occurrences of words across sentences)

In [4]:
def load_sentences(DATASET):
    if DATASET == "LibriSpeech":
        def _apply_tokenizer_opts(sent: str) -> str: # from Salazar et al. (2021)
            sent += '.'
            sent = sent.capitalize()
            return sent
        sentences = []
        with open('data/librispeech/data/test-clean.am.json') as json_file:
            corpus = json.load(json_file)
            for sent_idx, value in corpus.items():
                ref_stimulus = _apply_tokenizer_opts(value["ref"]) # Only scoring ref sentences
                sentences.append(ref_stimulus)

    elif DATASET == "Brown":
        with open('data/brown/brown_stimuli.pkl', 'rb') as f:
            df = pickle.load(f)
        sentences = df["sentence"].values

    else:
        df = pd.read_csv(os.path.abspath("data/eventsAdapt/clean_EventsAdapt_SentenceSet.csv"))
        sentences = df["Sentence"].values
    return sentences

In [5]:
def extract_words(sentences):
    words = [sent.lower().rstrip(".").split() for sent in sentences]
    words = [item for sublist in words for item in sublist]
    words = [word for word in words if word.isalpha()] #keep only letter strings
    return words

In [6]:
def print_oov_ratio_across_full_dataset(MODEL, DATASET, MASKING='_OriginalPLL'):
    """
    This function prints and returns the ratio of OOV words relative to the full number of words
    in the dataset (with duplicates)
    """
    print(f'{MODEL} | {DATASET}')
    # Load file containing tokenized words
    if DATASET != 'Brown':
        likelihoods_df = pd.read_csv(f'results/unigram_likelihoods/{MODEL}/{DATASET}{MASKING}.csv')
    else:
        likelihood_frames = []
        for chunk_ind in range(10):
            # Load likelihood scores
            likelihoods = pd.read_csv(f'results/unigram_likelihoods/{MODEL}/{DATASET}{MASKING}_chunk={chunk_ind}.csv')
            likelihood_frames.append(likelihoods)
        likelihoods_df = pd.concat(likelihood_frames)
    
    # Load sentences & words
    sentences = load_sentences(DATASET)
    words = extract_words(sentences)
    
    from collections import Counter
    word_occurrence_counter = Counter(words)

    single_token_words = []
    subtract_cnt = 0
    for w, cnt in word_occurrence_counter.items():
        try:
            nr_word_tokens = likelihoods_df.loc[likelihoods_df['word'] == w]['nr. of tokens'].iloc[0]
        except:
            print(f'Word #{w}# not found in dataframe!')
            subtract_cnt += 1 * cnt
        if nr_word_tokens == 1:
            single_token_words.extend([w]*cnt)
            
    print(f'Nr. of unmatched words: {subtract_cnt}')
    in_vocab_ratio = len(single_token_words)/(len(words) - subtract_cnt)
    
    print(f'Number of words in model vocab: {len(single_token_words)}')
    print(f'Number of words: {len(words)}')
    print(f'OOV word ratio: {100 - round(in_vocab_ratio*100,2)}%\n')

# BERT

In [7]:
MODEL='bert-base-cased'

In [8]:
print_oov_ratio_across_full_dataset(MODEL=MODEL, DATASET='EventsAdapt')

bert-base-cased | EventsAdapt
Nr. of unmatched words: 0
Number of words in model vocab: 7632
Number of words: 9492
OOV word ratio: 19.599999999999994%



In [9]:
print_oov_ratio_across_full_dataset(MODEL=MODEL, DATASET='LibriSpeech')

bert-base-cased | LibriSpeech
Nr. of unmatched words: 0
Number of words in model vocab: 47881
Number of words: 52032
OOV word ratio: 7.980000000000004%



In [10]:
print_oov_ratio_across_full_dataset(MODEL=MODEL, DATASET='Brown')

bert-base-cased | Brown
Word #null# not found in dataframe!
Nr. of unmatched words: 5
Number of words in model vocab: 287261
Number of words: 312734
OOV word ratio: 8.14%



# RoBERTa

In [11]:
MODEL='roberta-base'
print_oov_ratio_across_full_dataset(MODEL=MODEL, DATASET='EventsAdapt')

roberta-base | EventsAdapt
Nr. of unmatched words: 0
Number of words in model vocab: 5664
Number of words: 9492
OOV word ratio: 40.33%



In [12]:
print_oov_ratio_across_full_dataset(MODEL=MODEL, DATASET='LibriSpeech')

roberta-base | LibriSpeech
Nr. of unmatched words: 0
Number of words in model vocab: 39392
Number of words: 52032
OOV word ratio: 24.290000000000006%



In [13]:
print_oov_ratio_across_full_dataset(MODEL=MODEL, DATASET='Brown')

roberta-base | Brown
Word #null# not found in dataframe!
Nr. of unmatched words: 5
Number of words in model vocab: 234219
Number of words: 312734
OOV word ratio: 25.099999999999994%



# GPT

In [14]:
MODEL='gpt2-medium'
print_oov_ratio_across_full_dataset(MODEL, DATASET='EventsAdapt', MASKING='')

gpt2-medium | EventsAdapt
Nr. of unmatched words: 0
Number of words in model vocab: 5664
Number of words: 9492
OOV word ratio: 40.33%



In [15]:
print_oov_ratio_across_full_dataset(MODEL, DATASET='LibriSpeech', MASKING='')

gpt2-medium | LibriSpeech
Nr. of unmatched words: 0
Number of words in model vocab: 39392
Number of words: 52032
OOV word ratio: 24.290000000000006%



In [16]:
print_oov_ratio_across_full_dataset(MODEL, DATASET='Brown', MASKING='')

gpt2-medium | Brown
Word #null# not found in dataframe!
Nr. of unmatched words: 5
Number of words in model vocab: 234219
Number of words: 312734
OOV word ratio: 25.099999999999994%

