# Comparison of the 3 different AEP-Detection Types

In [1]:
import pandas as pd
import string
import jellyfish
import random
import ILLOD
import Syntactic_Classifiers

In [4]:
# Read the abbreviation database: ';'-separated, two columns, no header row.
data = pd.read_csv(
    'abbr_db.CSV',
    sep=';',
    names=['abbr', 'long_forms'],
    encoding='utf8',
)
# Parallel lists: expansions[k] is the long form stored in the same row as abbreviations[k].
abbreviations, expansions = (
    list(data[column].values) for column in ('abbr', 'long_forms')
)

## Semantic Classifier based on fastText

In [5]:
import pandas as pd
import fasttext
import fasttext.util
from scipy import spatial
# If the fastText word vectors are not available locally yet, download them
# by uncommenting the next line.
# fasttext.util.download_model('en', if_exists='ignore')
# Load the fastText model from "cc.en.300.bin"; `ft` is the model object the
# similarity functions in this file query for word vectors.
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Decide whether `a` and `t` are similar according to fastText.

    The fastText vectors of both strings are fetched from the module-level
    model `ft`, and their cosine similarity (1 - cosine distance) is compared
    against `threshold`.

    Returns:
        True if the similarity is greater than or equal to `threshold`,
        otherwise False.
    """
    vec_a, vec_t = (ft.get_word_vector(term) for term in (a, t))
    similarity = 1 - spatial.distance.cosine(vec_a, vec_t)
    return bool(similarity >= threshold)

    
def fast_text_sim(a, t):
    """Return the cosine similarity of the fastText vectors of `a` and `t`.

    The vectors are fetched from the module-level model `ft`; the result is
    1 minus the cosine distance between them.
    """
    cosine_distance = spatial.distance.cosine(
        ft.get_word_vector(a),
        ft.get_word_vector(t),
    )
    return 1 - cosine_distance



## Cardinality of S

In [6]:
# construction and cardinality of S:
# S collects every (abbreviation, expansion) combination in which the
# expansion comes from a row whose abbreviation differs from the chosen one.
S = {
    (abb, exp)
    for abb in abbreviations
    for j, exp in enumerate(expansions)
    if abb != abbreviations[j]
}
print(len(S))

2710125


# Evaluation of the 3 different AEP-Detection Types (Section 5.4)

In [7]:
def find_and_count_false_negatives(algo, threshold):
    """Count the given pairs that `algo` fails to detect.

    Every pair (abbreviations[k], expansions[k]) from the module-level lists
    is passed to `algo(abbreviation, expansion, threshold)`; each falsy
    result counts as one false negative.

    Returns:
        A tuple (count, summary) where `summary` is a human-readable string
        reporting the count against the number of given pairs.
    """
    missed = sum(
        1
        for idx, abbreviation in enumerate(abbreviations)
        if not algo(abbreviation, expansions[idx], threshold)
    )
    summary = (
        str(missed)
        + " FALSE NEGATIVES. Pairs that could not be detected out of "
        + str(len(abbreviations))
        + " given pairs"
    )
    return missed, summary

In [8]:
def find_and_count_false_positives(algo, threshold, alpha):
    """Count how often `algo` accepts randomly drawn false pairs.

    Two row indices are drawn at random from the module-level
    `abbreviations` / `expansions` lists. The combination
    (abbreviations[rd1], expansions[rd2]) is kept as a false example only if
    the abbreviations of the two rows differ. Every kept example for which
    `algo(abbreviation, expansion, threshold)` is truthy counts as one false
    positive.

    Args:
        algo: Callable invoked as algo(abbreviation, expansion, threshold).
        threshold: Passed through unchanged to `algo`.
        alpha: Size factor of the sample relative to len(abbreviations).

    Returns:
        A tuple (FP, summary) where `summary` is a human-readable string
        reporting FP against the number of created false examples.

    Raises:
        ValueError: If no two rows carry different abbreviations (this
            includes an empty list); the sampling loop below could never
            terminate in that case.
    """
    # Guard against an endless sampling loop: at least one index pair must
    # pass the `!=` filter used below. Short-circuits on the first such pair.
    if not any(a != b for a in abbreviations for b in abbreviations):
        raise ValueError(
            "cannot create false examples: no two rows have different abbreviations"
        )

    n_pairs = len(abbreviations)
    test_set = []
    # NOTE: because of `<=`, sampling stops once the sample holds more than
    # alpha * n_pairs examples (one more than alpha * n_pairs for integer alpha).
    while len(test_set) <= alpha * n_pairs:
        rd1 = random.randint(0, n_pairs - 1)
        rd2 = random.randint(0, n_pairs - 1)
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abb, exp in test_set if algo(abb, exp, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " + str(len(test_set)) + " created false examples"

In [9]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Search the threshold that maximises F1 for `algo`.

    For every threshold in `search_space_for_F1_optimization` the false
    negatives are counted on the given pairs and the false positives on a
    freshly sampled set of false examples (sized by `alpha`). Precision,
    recall and F1 are derived from those counts.

    Args:
        alpha: Size factor forwarded to find_and_count_false_positives.
        algo: Callable invoked as algo(abbreviation, expansion, threshold).
        search_space_for_F1_optimization: Iterable of thresholds to try.

    Returns:
        [threshold, precision, recall, f1] for the first threshold that
        reaches the highest F1, or an empty list if no threshold achieved
        an F1 greater than 0.
    """
    max_f1 = 0
    best_values = []
    for th_ in search_space_for_F1_optimization:
        FN, _ = find_and_count_false_negatives(algo, th_)
        FP, _ = find_and_count_false_positives(algo, th_, alpha)
        TP = len(abbreviations) - FN

        # Without any true positive, precision and recall are both 0 and F1
        # would be 0/0. Checking TP (rather than TP + FP) also covers the
        # case TP == 0 with FP > 0, which previously raised ZeroDivisionError.
        if TP == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)

        # Remember the best F1 value seen so far (strictly greater, so the
        # earliest threshold wins on ties).
        if f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [10]:
def evaluate_algorithm(algorithm, F1_optimization_search_space, alphas=(8, 16, 24, 48, 72)):
    """Run the F1 threshold search for several sample-size factors.

    Args:
        algorithm: Callable invoked as
            algorithm(abbreviation, expansion, threshold).
        F1_optimization_search_space: Iterable of thresholds handed to
            determine_quality_parameters for every alpha.
        alphas: Sample-size factors to evaluate. Defaults to the values
            that were previously hard-coded.

    Returns:
        Dict mapping each alpha to the list returned by
        determine_quality_parameters for that alpha.
    """
    return {
        alpha: determine_quality_parameters(alpha, algorithm, F1_optimization_search_space)
        for alpha in alphas
    }

In [22]:
# Threshold search space: 0.00, 0.01, ..., 0.99
step_list = [hundredth / 100 for hundredth in range(100)]

In [23]:
# FT (FASTTEXT)
# Rows of the displayed table are the metrics, columns are the alpha values.
pd.DataFrame.from_dict(
    evaluate_algorithm(fast_text_similarity, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,0.13,0.13,0.16,0.16,0.18
precision,0.229376,0.127232,0.102396,0.053063,0.038424
recall,0.382979,0.382979,0.253639,0.253639,0.197648
F1,0.286913,0.191008,0.145894,0.087765,0.06434


In [25]:
# LD (LEVENSHTEIN-DISTANCE)
# Rows of the displayed table are the metrics, columns are the alpha values.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Syntactic_Classifiers.levensthein_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,0.55,0.54,0.52,0.7,0.68
precision,0.958763,0.911169,0.87406,0.968412,0.958848
recall,0.781075,0.781075,0.781075,0.652296,0.652296
F1,0.860845,0.841121,0.824956,0.779525,0.776408


In [25]:
# JWS (JARO-WINKLER-SIMILARITY)
# Rows of the displayed table are the metrics, columns are the alpha values.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Syntactic_Classifiers.jaro_winkler_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,0.73,0.79,0.79,0.79,0.84
precision,0.889404,0.948169,0.903821,0.835566,0.899926
recall,0.860022,0.768197,0.768197,0.768197,0.68477
F1,0.874466,0.848747,0.830508,0.800467,0.777742


In [26]:
# DC (DICE-COEFFICIENT)
# Rows of the displayed table are the metrics, columns are the alpha values.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Syntactic_Classifiers.dice_coefficient_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,0.75,0.79,0.77,0.82,0.85
precision,0.921896,0.871383,0.822708,0.870246,0.810848
recall,0.773236,0.758679,0.758679,0.653415,0.652856
F1,0.841048,0.811134,0.789397,0.746402,0.723325


In [27]:
# ILLOD
# Only a single threshold value (-1) is searched here.
# Rows of the displayed table are the metrics, columns are the alpha values.
pd.DataFrame.from_dict(
    evaluate_algorithm(ILLOD.illod, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.987871,0.974282,0.963905,0.922946,0.888222
recall,0.912094,0.912094,0.912094,0.912094,0.912094
F1,0.948472,0.942163,0.937284,0.917488,0.9
