# Evaluate and Optimize Abbreviation Extraction

In [1]:
import pandas as pd
import string
from spylls.hunspell import Dictionary
spellchecker = Dictionary.from_files('en_US')
from string import punctuation
import re

## Load List $L$ from File

In [2]:
# Load the semicolon-separated abbreviation database into two named columns:
# the short form ('abbr') and its expansion(s) ('long_forms').
data = pd.read_csv(
    'abbr_db.CSV',
    sep=';',
    encoding='utf8',
    names=['abbr', 'long_forms'],
)
abbreviations = [*data['abbr'].values]
expansions = [*data['long_forms'].values]

# F1-Optimization for Abbreviations with Upper Case Letters

## Helper function to calculate the proportion of upper case letters

In [3]:
def portion_of_capital_letters(w):
    """Return the fraction of characters in ``w`` that are upper case.

    Args:
        w: The string to inspect.

    Returns:
        The number of upper-case characters divided by ``len(w)``, or
        ``0.0`` when ``w`` is empty.
    """
    if not w:
        # An empty string has no capital letters; avoid dividing by zero.
        return 0.0
    return sum(1 for c in w if c.isupper()) / len(w)

## The method "extraction_rule()" is the function that has to be optimized

In [4]:
def extraction_rule(w, l_, ratio):
    """Decide whether ``w`` is flagged as an abbreviation.

    A word is flagged when it has at most ``l_`` characters and its share of
    upper-case letters is at least ``ratio``.
    """
    short_enough = len(w) <= l_
    return short_enough and portion_of_capital_letters(w) >= ratio

## Determine False Positive and False Negative counts for given extraction rule parameters "l_" (maximum word length) and "ratio" (minimum proportion of upper case letters)

In [5]:
def determine_FN(l_, ratio):
    """Count the entries of ``abbreviations`` that the extraction rule rejects.

    Every rejected abbreviation is a false negative for the parameter pair
    (``l_``, ``ratio``).
    """
    return sum(
        1 for abb in abbreviations if not extraction_rule(abb, l_, ratio)
    )

In [6]:
def determine_FP(l_, ratio):
    """Count the expansion words that the extraction rule accepts.

    Each entry of ``expansions`` is split on whitespace; every resulting word
    that satisfies the rule for (``l_``, ``ratio``) is a false positive.
    """
    return sum(
        1
        for exp in expansions
        for word in exp.split()
        if extraction_rule(word, l_, ratio)
    )

## The main function to conduct exhaustive search over word lengths $1,\dots,20$ and ratios in $(0,1]$ in $1/100$ steps

In [7]:
# Exhaustive grid search over the two extraction-rule parameters:
# word length 1..20 and upper-case ratio 0.01..1.00. The combination with the
# highest F1 score is kept in `memory`.
max_f1 = 0
memory = []
for word_length in range(1, 21):
    for search_step in range(1, 101):
        ratio = search_step / 100
        FP = determine_FP(word_length, ratio)
        FN = determine_FN(word_length, ratio)
        TP = len(abbreviations) - FN
        # Strict parameter settings can reject everything, so every
        # denominator may be zero; score such settings as 0 instead of
        # raising ZeroDivisionError.
        precision = TP / (TP + FP) if TP + FP else 0
        recall = TP / (TP + FN) if TP + FN else 0
        if precision + recall:
            f1 = (2 * recall * precision) / (recall + precision)
        else:
            f1 = 0
        if f1 > max_f1:
            max_f1 = f1
            memory = [word_length, ratio, precision, recall, f1]
if not memory:
    # No setting scored above zero; fail with a clear message rather than an
    # IndexError on the lookups below.
    raise ValueError("no parameter combination achieved a positive F1 score")
result = {"Word length" : [memory[0]],
          "Ratio" : [memory[1]],
          "Precision" : [memory[2]],
          "Recall" : [memory[3]],
          "F1" : [memory[4]]}

In [8]:
# Render the best parameter combination as a one-row table.
pd.DataFrame(data=result)

Unnamed: 0,Word length,Ratio,Precision,Recall,F1
0,13,0.29,0.922216,0.922732,0.922474


# F1-Optimization for Abbreviations with only Lower Case Letters

## Generate lower case abbreviations and terms

In [9]:
# Lower-cased, de-duplicated abbreviations; multi-token entries are dropped.
abbreviations_lower = {
    abb.lower() for abb in abbreviations if len(abb.split()) == 1
}
# Every expansion lower-cased and split into its whitespace-separated tokens.
expansions_lower = [exp.lower().split() for exp in expansions]

# All distinct tokens that occur in any expansion.
expansion_tokens_lower = {
    tok for token_list in expansions_lower for tok in token_list
}

## Approach to check if a word is a lower-case-abbreviation

In [15]:
def check_if_word_is_lower_letter_abbv(word, max_wl):
    """Heuristically decide whether ``word`` is a lower-case abbreviation.

    The checks are applied in order:

    1. An empty string is never an abbreviation.
    2. Any single character other than ``"a"`` is an abbreviation.
    3. A word is an abbreviation if ``spellchecker.lookup`` rejects it, it is
       all lower case, and it has fewer than ``max_wl`` characters.
    4. Otherwise a non-word character sitting between two word characters is
       replaced by a space, and if that yields several tokens, the word is an
       abbreviation when any token is one (checked recursively).

    Args:
        word: The token to classify.
        max_wl: Exclusive upper bound on the length accepted by check 3.

    Returns:
        ``True`` if the word is classified as an abbreviation, else ``False``.
    """
    if not word:
        return False
    if len(word) == 1 and word != "a":
        return True
    if (not spellchecker.lookup(word)) and word.islower() and len(word) < max_wl:
        return True
    # Raw strings keep \w and \W from being read as (invalid) string escapes.
    parts = re.sub(r"(\w)(\W)(\w)", r"\1 \3", word).split()
    if len(parts) >= 2:
        # A generator lets any() stop at the first token that matches.
        return any(check_if_word_is_lower_letter_abbv(tok, max_wl) for tok in parts)
    return False

In [16]:
# Spot check on a token that ends in a period.
check_if_word_is_lower_letter_abbv(word="temp.", max_wl=7)

True

## Determine False Positive and False Negative counts for given extraction rule parameter "max_wl" (exclusive upper bound on word length)

In [17]:
def count_FNs(max_wl):
    """Count the lower-cased abbreviations that the lower-case check rejects.

    Each rejected member of ``abbreviations_lower`` is a false negative for
    the given ``max_wl``.
    """
    return sum(
        1
        for word in abbreviations_lower
        if not check_if_word_is_lower_letter_abbv(word, max_wl)
    )

def count_FPs(max_wl):
    """Count the expansion tokens that the lower-case check accepts.

    Each member of ``expansion_tokens_lower`` is split on whitespace, and
    every piece classified as an abbreviation is a false positive for the
    given ``max_wl``.
    """
    return sum(
        1
        for word in expansion_tokens_lower
        for tok in word.split()
        if check_if_word_is_lower_letter_abbv(tok, max_wl)
    )

## The main function to conduct exhaustive search on $[1,20]$ for the $max\_wl$ parameter

In [18]:
# Exhaustive search over max_wl = 1..20 (inclusive); the value with the
# highest F1 score is kept in `memory`.
max_f1 = 0
memory = []
for max_wl in range(1, 21):
    FP = count_FPs(max_wl)
    FN = count_FNs(max_wl)
    TP = len(abbreviations_lower) - FN
    # Any denominator can be zero for extreme settings; score those as 0
    # instead of raising ZeroDivisionError.
    precision = TP / (TP + FP) if TP + FP else 0
    recall = TP / (TP + FN) if TP + FN else 0
    if precision + recall:
        f1 = (2 * recall * precision) / (recall + precision)
    else:
        f1 = 0
    if f1 > max_f1:
        max_f1 = f1
        memory = [max_wl, precision, recall, f1]
if not memory:
    # No setting scored above zero; fail with a clear message rather than an
    # IndexError on the lookups below.
    raise ValueError("no max_wl value achieved a positive F1 score")
result = {"Max_WL" : [memory[0]],
          "Precision": [memory[1]],
          "Recall" : [memory[2]],
          "F1" : [memory[3]]}

In [19]:
# Render the best max_wl setting as a one-row table.
pd.DataFrame(data=result)

Unnamed: 0,Max_WL,Precision,Recall,F1
0,7,0.91796,0.837492,0.875882


# Evaluation of overall performance on $L$

In [20]:
def overall_detection_approach(word):
    """Flag ``word`` as an abbreviation if either detection rule accepts it.

    The upper-case rule is applied with length 13 and ratio 0.29, and the
    lower-case check with a maximum word length of 7.
    """
    return bool(
        extraction_rule(word, 13, 0.29)
        or check_if_word_is_lower_letter_abbv(word, 7)
    )


In [21]:
def count_FNs():
    """Count the entries of ``abbreviations`` missed by the combined detector.

    NOTE: this zero-argument definition replaces the earlier
    ``count_FNs(max_wl)`` once it has been executed.
    """
    return sum(
        1 for word in abbreviations if not overall_detection_approach(word)
    )

def count_FPs():
    """Count the entries of ``expansions`` accepted by the combined detector.

    NOTE: this zero-argument definition replaces the earlier
    ``count_FPs(max_wl)`` once it has been executed.
    """
    # NOTE(review): each expansion is passed as a whole string rather than
    # token by token as in determine_FP -- confirm this is intended.
    return sum(1 for word in expansions if overall_detection_approach(word))

In [22]:
# Score the combined detector on the full list: precision, recall and F1.
FP = count_FPs()
FN = count_FNs()
TP = len(abbreviations) - FN
# Each denominator can be zero (e.g. nothing detected); report 0 in that case
# instead of raising ZeroDivisionError.
precision = TP / (TP + FP) if TP + FP else 0
recall = TP / (TP + FN) if TP + FN else 0
if precision + recall:
    f1 = (2 * recall * precision) / (recall + precision)
else:
    f1 = 0
result = {"Precision": [precision],
          "Recall" : [recall],
          "F1" : [f1]}

In [23]:
# Render the overall precision, recall and F1 as a one-row table.
pd.DataFrame(data=result)

Unnamed: 0,Precision,Recall,F1
0,0.95496,0.93785,0.946328
