# Comparison of the 3 different types of AEP (abbreviation-expansion pair) detection

In [1]:
import pandas as pd
import string
import jellyfish
import random
import ILLOD
import Function_Pool
import ODIL

In [2]:
# Load the abbreviation database. Column names are supplied explicitly, so the
# first row of the semicolon-separated file is read as data, not as a header.
data = pd.read_csv(
    "abbr_db.CSV",
    sep=";",
    encoding="utf8",
    names=["abbr", "long_forms"],
)
# Parallel lists: expansions[i] is the long form stored in the same row as
# abbreviations[i].
abbreviations, expansions = (
    list(data[column].values) for column in ("abbr", "long_forms")
)

## Semantic Classifier based on fastText

In [3]:
import pandas as pd
import fasttext
import fasttext.util
from scipy import spatial
# One-time setup: uncomment the next line to fetch the pre-trained English
# fastText model if "cc.en.300.bin" is not yet available in the working directory.
# fasttext.util.download_model('en', if_exists='ignore')
# Module-level model handle used by fast_text_similarity() and fast_text_sim().
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Decide whether two texts are similar according to their fastText vectors.

    Both texts are embedded with ``ft.get_sentence_vector`` and compared by
    cosine similarity (``1 - cosine distance``).

    Args:
        a: First text; the evaluation cells of this file pass the abbreviation.
        t: Second text; the evaluation cells of this file pass the expansion.
        threshold: Minimum cosine similarity for the pair to count as a match.

    Returns:
        ``True`` if the cosine similarity is at least ``threshold``, otherwise
        ``False``. A text whose sentence vector is all zeros never matches.
    """
    a_v = ft.get_sentence_vector(a)
    t_v = ft.get_sentence_vector(t)
    # The cosine distance divides by the product of the vector norms. For an
    # all-zero vector that is 0/0: scipy emits a RuntimeWarning and yields NaN,
    # and NaN never satisfies ">=". Return that same answer without the warning.
    if not a_v.any() or not t_v.any():
        return False
    return bool(1 - spatial.distance.cosine(a_v, t_v) >= threshold)

    
def fast_text_sim(a, t):
    """Return the cosine similarity of the fastText sentence vectors of two texts.

    Args:
        a: First text.
        t: Second text.

    Returns:
        ``1 - d`` where ``d`` is scipy's cosine distance between the two
        sentence vectors obtained from the module-level model ``ft``.
    """
    vec_a, vec_t = (ft.get_sentence_vector(text) for text in (a, t))
    distance = spatial.distance.cosine(vec_a, vec_t)
    return 1 - distance



## Cardinality of S

In [4]:
# Construction of S: every abbreviation is combined with every expansion whose
# own abbreviation (the one stored in the same row) is a different string.
S = {
    (abb, exp)
    for abb in abbreviations
    for own_abb, exp in zip(abbreviations, expansions)
    if abb != own_abb
}
# Cardinality of S.
print(len(S))

2710125


# Evaluation of the 3 different AEP-Detection Types (Section 5.4)

In [5]:
def find_and_count_false_negatives(algo, threshold):
    """Count the given abbreviation/expansion pairs that ``algo`` does not detect.

    Every entry of the module-level ``abbreviations`` list is paired with the
    entry at the same position in ``expansions``; these are the pairs that
    should be detected.

    Args:
        algo: Callable invoked as ``algo(abbreviation, expansion, threshold)``;
            a falsy result counts as a miss.
        threshold: Passed through to ``algo`` unchanged.

    Returns:
        A tuple ``(count, message)``: the number of missed pairs and a
        human-readable summary of it.
    """
    missed = sum(
        1
        for abb, exp in zip(abbreviations, expansions)
        if not algo(abb, exp, threshold)
    )
    summary = (
        str(missed)
        + " FALSE NEGATIVES. Pairs that could not be detected out of "
        + str(len(abbreviations))
        + " given pairs"
    )
    return missed, summary

In [6]:
def find_and_count_false_positives(algo, threshold, alpha):
    """Count how many randomly drawn mismatched pairs ``algo`` wrongly accepts.

    A test set of ``alpha * len(abbreviations)`` false examples is sampled with
    replacement (duplicates are possible): an abbreviation is combined with the
    expansion of a randomly chosen row whose abbreviation is a different string.

    Args:
        algo: Callable invoked as ``algo(abbreviation, expansion, threshold)``;
            a truthy result counts as a (false) detection.
        threshold: Passed through to ``algo`` unchanged.
        alpha: Size of the test set relative to the number of given pairs.

    Returns:
        A tuple ``(count, message)``: the number of false positives and a
        human-readable summary of it.

    Raises:
        ValueError: If false examples are requested but no two abbreviations
            differ, so that no mismatched pair can be drawn.
    """
    pair_count = len(abbreviations)
    target_size = alpha * pair_count

    # Without two differing abbreviations the sampling loop below could never
    # accept a pair and would spin forever; fail loudly instead.
    if target_size > 0 and not any(abb != abbreviations[0] for abb in abbreviations):
        raise ValueError(
            "cannot sample mismatched pairs: all abbreviations are identical"
        )

    test_set = []
    # Strict "<" so that exactly alpha * len(abbreviations) examples are drawn
    # ("<=" produced one example too many).
    while len(test_set) < target_size:
        rd1 = random.randrange(pair_count)
        rd2 = random.randrange(pair_count)
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abb, exp in test_set if algo(abb, exp, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " + str(len(test_set)) + " created false examples"

In [7]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Find the threshold in the search space that maximises F1 for ``algo``.

    For every candidate threshold the false negatives are counted on the given
    pairs and the false positives on a freshly sampled set of false examples;
    precision, recall and F1 are derived from these counts.

    Args:
        alpha: Relative size of the false-example set, forwarded to
            ``find_and_count_false_positives``.
        algo: Callable invoked as ``algo(abbreviation, expansion, threshold)``.
        search_space_for_F1_optimization: Iterable of candidate thresholds.

    Returns:
        ``[threshold, precision, recall, f1]`` for the first threshold that
        reaches the highest F1, or an empty list if no threshold achieves an
        F1 greater than zero (including an empty search space).
    """
    max_f1 = 0
    best_values = []
    for th_ in search_space_for_F1_optimization:
        # Keep this call order: the false-positive count consumes random numbers.
        FN = find_and_count_false_negatives(algo, th_)[0]
        FP = find_and_count_false_positives(algo, th_, alpha)[0]
        TP = len(abbreviations) - FN

        # Without a single true positive, precision and recall are both zero
        # and F1 would be 0/0. Checking TP (rather than TP + FP) also covers
        # the case TP == 0 with FP > 0, which used to raise ZeroDivisionError.
        if TP == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)

        # Memorise the best F1 value seen so far; ties keep the earlier threshold.
        if f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [8]:
def evaluate_algorithm(algorithm, F1_optimization_search_space, alphas=(4, 8, 11, 14, 28, 42)):
    """Run the F1 optimisation of ``algorithm`` once per alpha value.

    Args:
        algorithm: Callable invoked as
            ``algorithm(abbreviation, expansion, threshold)``.
        F1_optimization_search_space: Iterable of candidate thresholds.
        alphas: Relative sizes of the false-example set to evaluate. Defaults
            to the values previously hard-coded in this function.

    Returns:
        A dict mapping each alpha to the list returned by
        ``determine_quality_parameters`` for it
        (``[threshold, precision, recall, f1]``).
    """
    eval_data = {}
    for alpha in alphas:
        # Progress output: one line per alpha.
        print("alpha = " + str(alpha))
        eval_data[alpha] = determine_quality_parameters(alpha, algorithm, F1_optimization_search_space)
    return eval_data

In [9]:
# Candidate thresholds 0.00, 0.01, ..., 0.99 (1.0 itself is not included).
step_list = [hundredths / 100 for hundredths in range(100)]

In [10]:
# FT (FASTTEXT)
# For each alpha, search step_list for the cosine-similarity threshold with the
# best F1. from_dict(orient="index") yields one row per alpha; .T transposes the
# frame so that each alpha becomes a column.
pd.DataFrame.from_dict(evaluate_algorithm (fast_text_similarity, step_list), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4


  dist = 1.0 - uv / np.sqrt(uu * vv)


alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,0.25,0.27,0.29,0.3,0.33,0.33
precision,0.697446,0.612015,0.604096,0.601003,0.539514,0.438813
recall,0.596305,0.547592,0.495521,0.469765,0.397536,0.397536
F1,0.642922,0.578014,0.544448,0.527341,0.457769,0.417156


In [11]:
# LD (LEVENSHTEIN-DISTANCE)
# Same evaluation with the Levenshtein-based matcher from Function_Pool (the
# "levensthein" spelling is the function's actual name). Thresholds are searched
# over step_list; the result is transposed so that each alpha is a column.
# NOTE(review): judging by its name, the matcher compares the abbreviation with
# a reduced form of the expansion - confirm in Function_Pool.
pd.DataFrame.from_dict(evaluate_algorithm (Function_Pool.levensthein_similarity_on_reduction_of_expansion, step_list), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,0.37,0.52,0.52,0.53,0.56,0.64
precision,0.915796,0.949626,0.940027,0.931864,0.853121,0.808486
recall,0.882979,0.781075,0.781075,0.781075,0.780515,0.768197
F1,0.899088,0.857143,0.853211,0.849832,0.815205,0.787827


In [12]:
# JWS (JARO-WINKLER-SIMILARITY)
# Same evaluation with the Jaro-Winkler-based matcher from Function_Pool.
# Thresholds are searched over step_list; the result is transposed so that each
# alpha is a column.
pd.DataFrame.from_dict(evaluate_algorithm (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, step_list), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,0.77,0.76,0.76,0.79,0.84,0.84
precision,0.953213,0.900527,0.882454,0.88756,0.908422,0.872272
recall,0.855543,0.861702,0.861702,0.830907,0.760918,0.760918
F1,0.901741,0.880687,0.871955,0.8583,0.828154,0.812799


In [13]:
# DC (DICE-COEFFICIENT)
# Same evaluation with the Dice-coefficient-based matcher from Function_Pool.
# Thresholds are searched over step_list; the result is transposed so that each
# alpha is a column.
pd.DataFrame.from_dict(evaluate_algorithm (Function_Pool.dice_coefficient_on_reduction_of_expansion , step_list), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,0.63,0.76,0.7,0.69,0.79,0.84
precision,0.878531,0.944948,0.894703,0.868339,0.785052,0.885345
recall,0.870661,0.759239,0.775476,0.775476,0.758679,0.652856
F1,0.874578,0.841975,0.830834,0.819284,0.77164,0.751531


In [14]:
# ILLOD
# The search space holds the single value -1, so no threshold optimisation
# takes place; -1 is simply handed to ILLOD.illod as its third argument.
# NOTE(review): presumably ILLOD.illod ignores this value - confirm in ILLOD.
pd.DataFrame.from_dict(evaluate_algorithm (ILLOD.illod, [-1]), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.992092,0.987886,0.974895,0.98253,0.959976,0.947705
recall,0.913214,0.913214,0.913214,0.913214,0.913214,0.913214
F1,0.95102,0.949084,0.943047,0.946605,0.936011,0.93014


In [15]:
# ILLOD+(VariantA)
# Single-value search space: -1 is handed to Function_Pool.illod_plus as its
# third argument and no threshold optimisation takes place.
# NOTE(review): presumably -1 selects Variant A - confirm in Function_Pool.
pd.DataFrame.from_dict(evaluate_algorithm (Function_Pool.illod_plus, [-1]), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.995706,0.993268,0.990842,0.989031,0.968955,0.960924
recall,0.908735,0.908735,0.908735,0.908735,0.908735,0.908735
F1,0.950234,0.949123,0.948014,0.947184,0.937879,0.934101


In [16]:
# ILLOD+(VariantB)
# Single-value search space: -2 is handed to Function_Pool.illod_plus as its
# third argument and no threshold optimisation takes place.
# NOTE(review): presumably -2 selects Variant B - confirm in Function_Pool.
pd.DataFrame.from_dict(evaluate_algorithm (Function_Pool.illod_plus, [-2]), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
precision,0.998068,0.994227,0.991049,0.991049,0.979772,0.966334
recall,0.867861,0.867861,0.867861,0.867861,0.867861,0.867861
F1,0.928422,0.926756,0.925373,0.925373,0.920428,0.914454


In [17]:
# ODIL
# Single-value search space: -2 is handed to ODIL.illod_plus as its third
# argument and no threshold optimisation takes place.
# NOTE(review): the evaluated function is named illod_plus but comes from the
# ODIL module; the meaning of -2 is defined there - confirm in ODIL.
pd.DataFrame.from_dict(evaluate_algorithm (ODIL.illod_plus, [-2]), orient="index", columns=["threshold", "precision", "recall", "F1"]).T

alpha = 4
alpha = 8
alpha = 11
alpha = 14
alpha = 28
alpha = 42


Unnamed: 0,4,8,11,14,28,42
threshold,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
precision,0.997022,0.992294,0.991119,0.984127,0.974956,0.962622
recall,0.93729,0.93729,0.93729,0.93729,0.93729,0.93729
F1,0.966234,0.964008,0.963453,0.960138,0.955752,0.949787
