# Evaluate and Optimize Abbreviation Extraction

In [1]:
import pandas as pd
import string

## Load abbreviations and their long forms from the CSV file

In [2]:
# Read the semicolon-separated abbreviation database; the two column names
# are supplied explicitly via `names`.
data = pd.read_csv(
    'abbr_db.CSV',
    sep=';',
    encoding='utf8',
    names=['abbr', 'long_forms'],
)
# Plain Python lists of both columns, used by the evaluation helpers below.
abbreviations, expansions = (
    list(data[column].values) for column in ('abbr', 'long_forms')
)

## Helper function to calculate the proportion of upper case letters

In [3]:
def portion_of_capital_letters(w):
    """Return the fraction of characters in ``w`` that are upper case.

    Args:
        w: The word (string) to inspect.

    Returns:
        The number of characters for which ``str.isupper()`` is true divided
        by ``len(w)``, as a float between 0 and 1. An empty string yields
        0.0 instead of raising ``ZeroDivisionError``.
    """
    if not w:
        # An empty word contains no capital letters; avoid dividing by zero.
        return 0.0
    return sum(1 for c in w if c.isupper()) / len(w)

## The method "extraction_rule()" is the function that has to be optimized

In [4]:
def extraction_rule(w, l_, ratio):
    """Decide whether the word ``w`` is classified as an abbreviation.

    Args:
        w: The word to test.
        l_: Maximum number of characters the word may have.
        ratio: Minimum proportion of upper-case letters the word must have.

    Returns:
        True if ``w`` has at most ``l_`` characters and its proportion of
        upper-case letters is at least ``ratio``; otherwise False.
    """
    # Too long: rejected without looking at the capitalisation.
    if len(w) > l_:
        return False
    return portion_of_capital_letters(w) >= ratio

## Determine quality parameters for given extraction rule parameters "l_" (length of word) and "ratio" (proportion of upper case letters)

In [5]:
def determine_FN(l_, ratio):
    """Count the false negatives for the given rule parameters.

    A false negative is an entry of the global ``abbreviations`` list that
    ``extraction_rule`` rejects for the parameters ``l_`` and ``ratio``.
    """
    return sum(
        1 for abbreviation in abbreviations
        if not extraction_rule(abbreviation, l_, ratio)
    )

In [6]:
def determine_FP(l_, ratio):
    """Count the false positives for the given rule parameters.

    Every entry of the global ``expansions`` list is split on whitespace;
    each resulting word that ``extraction_rule`` accepts for the parameters
    ``l_`` and ``ratio`` is counted as one false positive.
    """
    return sum(
        1
        for long_form in expansions
        for token in long_form.split()
        if extraction_rule(token, l_, ratio)
    )

## Main loop: exhaustive search over "l_" (word length) and "ratio" for the parameter pair with the highest F1 score

In [7]:
# Exhaustive grid search: try every maximum word length from 1 to 20 and every
# upper-case ratio from 0.01 to 1.00 (step 0.01), and remember the parameter
# pair that yields the highest F1 score.
max_f1 = 0
# Best result so far as [word_length, ratio, precision, recall, f1];
# stays empty if no combination reaches an F1 score above 0.
memory = []
for word_length in range(1, 21):
    for search_step in range(1, 101):
        ratio = search_step/100
        FP = determine_FP(word_length, ratio)
        FN = determine_FN(word_length, ratio)
        # Every abbreviation that is not a false negative is a true positive.
        TP = len(abbreviations) - FN
        # Guard the denominators: a parameter pair that accepts no word at all
        # (TP + FP == 0), an empty abbreviation list (TP + FN == 0) or a zero
        # precision and recall would otherwise raise ZeroDivisionError. Such
        # undefined scores are treated as 0.0 so the search simply moves on.
        precision = TP/(TP + FP) if TP + FP else 0.0
        recall = TP/(TP + FN) if TP + FN else 0.0
        if recall + precision:
            f1 = (2*recall*precision)/(recall + precision)
        else:
            f1 = 0.0
        # Strict comparison: on ties the first combination found is kept.
        if f1 > max_f1:
            max_f1 = f1
            memory = [word_length, ratio, precision, recall, f1]
print(memory)

[13, 0.29, 0.9222160044767768, 0.9227323628219485, 0.922474111390988]


## Check how many false negatives would still be produced 

In [None]:
# Print every abbreviation that the rule with l_=13 and ratio=0.29 rejects,
# followed by the total number of such false negatives.
FN = 0
for abb in abbreviations:
    if extraction_rule(abb, 13, 0.29):
        continue
    print(abb)
    FN += 1
print(FN)