# Comparison of the 3 different AEP-Detection Types

In [16]:
import pandas as pd
import string
import jellyfish
import random
import ILLOD
import Function_Pool
import ODIL

In [17]:
# Load the abbreviation database. The file is semicolon-separated and the two
# column names ("abbr", "long_forms") are supplied here rather than read from it.
data = pd.read_csv(
    "abbr_db.CSV",
    sep=";",
    names=["abbr", "long_forms"],
    encoding="utf8",
)
# Two parallel lists taken from the same rows: abbreviations[i] and
# expansions[i] come from row i of the file.
abbreviations, expansions = (
    list(data[column].values) for column in ("abbr", "long_forms")
)

## Semantic Classifier based on fastText

In [18]:
import pandas as pd
import fasttext
import fasttext.util
from scipy import spatial
# Optional one-time setup (left disabled): uncomment to fetch the English
# model via fasttext.util before loading it below.
# fasttext.util.download_model('en', if_exists='ignore')
# Load the fastText model file from the working directory; `ft` is the
# module-level model used by fast_text_similarity and fast_text_sim.
# NOTE(review): the file name suggests the English 300-dimensional vectors — confirm.
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Decide whether two strings are similar according to fastText.

    Both strings are embedded with ``ft.get_sentence_vector`` and compared
    by cosine similarity (``1 - cosine distance``).

    Args:
        a: First string (the abbreviation in the evaluation below).
        t: Second string (the expansion in the evaluation below).
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        ``True`` if the cosine similarity is greater than or equal to
        ``threshold``, otherwise ``False``.
    """
    # NOTE(review): an undefined similarity (NaN, e.g. presumably for an
    # all-zero sentence vector) compares as False against any threshold.
    similarity = 1 - spatial.distance.cosine(
        ft.get_sentence_vector(a),
        ft.get_sentence_vector(t),
    )
    return bool(similarity >= threshold)

    
def fast_text_sim(a, t):
    """Return the cosine similarity of the fastText sentence vectors of two strings.

    Args:
        a: First string.
        t: Second string.

    Returns:
        ``1 - cosine distance`` between ``ft.get_sentence_vector(a)`` and
        ``ft.get_sentence_vector(t)``.
    """
    cosine_distance = spatial.distance.cosine(
        ft.get_sentence_vector(a),
        ft.get_sentence_vector(t),
    )
    return 1 - cosine_distance



## Cardinality of S

In [19]:
# Build S and print its cardinality.
# S holds every (abbreviation, expansion) pair in which the abbreviation
# differs from the abbreviation stored alongside that expansion, i.e. the
# pairs that are not listed as belonging together. Being a set, S keeps each
# distinct pair only once.
S = {
    (abb, exp)
    for abb in abbreviations
    for own_abb, exp in zip(abbreviations, expansions)
    if abb != own_abb
}
print(len(S))

2710125


# Evaluation of the 3 different AEP-Detection Types (Section 5.4)

In [20]:
def find_and_count_false_negatives(algo, threshold):
    """Count the listed (abbreviation, expansion) pairs that ``algo`` rejects.

    Every pair ``(abbreviations[i], expansions[i])`` is a known positive, so
    each rejection by ``algo`` is a false negative.

    Args:
        algo: Callable ``algo(abbreviation, expansion, threshold)`` whose
            truthy result means "this pair belongs together".
        threshold: Passed unchanged as the third argument of ``algo``.

    Returns:
        A tuple ``(FN, message)``: the number of rejected pairs and a
        human-readable summary of that count.
    """
    missed = sum(
        1
        for abb, exp in zip(abbreviations, expansions)
        if not algo(abb, exp, threshold)
    )
    summary = (
        str(missed)
        + " FALSE NEGATIVES. Pairs that could not be detected out of "
        + str(len(abbreviations))
        + " given pairs"
    )
    return missed, summary

In [21]:
def find_and_count_false_positives(algo, threshold, alpha):
    """Count how many randomly built wrong pairs ``algo`` accepts.

    A test set of wrong pairs is drawn by rejection sampling: two random
    positions are picked and the abbreviation at the first is combined with
    the expansion at the second, provided the abbreviations at the two
    positions differ. The same wrong pair may be drawn more than once.
    Because the sample is random (and unseeded here), repeated calls can
    return different counts.

    Args:
        algo: Callable ``algo(abbreviation, expansion, threshold)`` whose
            truthy result means "this pair belongs together".
        threshold: Passed unchanged as the third argument of ``algo``.
        alpha: Size of the test set relative to the number of listed pairs;
            ``alpha * len(abbreviations)`` wrong pairs are drawn (rounded up
            when the product is not an integer).

    Returns:
        A tuple ``(FP, message)``: the number of wrong pairs accepted by
        ``algo`` and a human-readable summary of that count.

    Raises:
        ValueError: If wrong pairs are requested but no two abbreviations
            differ, so that no wrong pair can ever be sampled.
    """
    n_pairs = len(abbreviations)
    n_wanted = alpha * n_pairs

    # The sampling loop only accepts positions with differing abbreviations;
    # without at least one differing entry it would spin forever.
    if n_wanted > 0 and not any(abb != abbreviations[0] for abb in abbreviations):
        raise ValueError(
            "cannot sample false examples: all abbreviations are identical"
        )

    test_set = []
    # Strict "<" so that exactly n_wanted examples are drawn ("<=" produced
    # one example more than alpha * len(abbreviations)).
    while len(test_set) < n_wanted:
        rd1 = random.randint(0, n_pairs - 1)
        rd2 = random.randint(0, n_pairs - 1)
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abb, exp in test_set if algo(abb, exp, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " + str(len(test_set)) + " created false examples"

In [22]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Find the threshold in the search space that gives ``algo`` its best F1.

    For every candidate threshold, false negatives are counted with
    ``find_and_count_false_negatives`` and false positives with
    ``find_and_count_false_positives`` (which draws a new random sample on
    each call). True positives are the listed pairs that were not missed.

    Args:
        alpha: Forwarded to ``find_and_count_false_positives``, where it
            scales the number of sampled wrong pairs.
        algo: Callable ``algo(abbreviation, expansion, threshold)`` whose
            truthy result means "this pair belongs together".
        search_space_for_F1_optimization: Iterable of thresholds to try.

    Returns:
        ``[threshold, precision, recall, f1]`` for the first threshold that
        reaches the highest F1, or an empty list if no threshold achieves an
        F1 greater than zero.
    """
    max_f1 = 0
    best_values = []
    for th_ in search_space_for_F1_optimization:
        FN, _ = find_and_count_false_negatives(algo, th_)
        FP, _ = find_and_count_false_positives(algo, th_, alpha)
        TP = len(abbreviations) - FN

        # Without a single true positive, precision, recall and F1 are all
        # zero. Guarding on TP alone (rather than TP + FP) also covers the
        # case TP == 0 with FP > 0, where precision + recall == 0 would make
        # the F1 formula divide by zero, and the case of no listed pairs.
        if TP == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)

        # Remember the best F1 seen so far; ties keep the earlier threshold.
        if f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [23]:
def evaluate_algorithm(algorithm, F1_optimization_search_space, alphas=(4, 8, 11, 14, 28, 42)):
    """Evaluate ``algorithm`` for several ratios of wrong to listed pairs.

    Args:
        algorithm: Callable ``algorithm(abbreviation, expansion, threshold)``
            whose truthy result means "this pair belongs together".
        F1_optimization_search_space: Iterable of thresholds to try for each
            alpha.
        alphas: Values of alpha to evaluate; defaults to the six values used
            throughout this notebook.

    Returns:
        A dict mapping each alpha to the result of
        ``determine_quality_parameters`` for that alpha (a list
        ``[threshold, precision, recall, f1]``, or an empty list).
    """
    # The thresholds are walked once per alpha, so materialise them up front;
    # a one-shot iterator would otherwise be empty from the second alpha on.
    thresholds = list(F1_optimization_search_space)
    return {
        alpha: determine_quality_parameters(alpha, algorithm, thresholds)
        for alpha in alphas
    }

In [24]:
# Threshold search space for the F1 optimisation: 0.00, 0.01, ..., 0.99.
step_list = [h / 100 for h in range(100)]

In [25]:
# FT (fastText): try every threshold in step_list with fast_text_similarity.
# After transposing, each column is one alpha and the rows give the best-F1
# threshold together with its precision, recall and F1.
pd.DataFrame.from_dict(
    evaluate_algorithm(fast_text_similarity, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

  dist = 1.0 - uv / np.sqrt(uu * vv)


Unnamed: 0,4,8,11,14,28,42
threshold,0.25,0.28,0.28,0.31,0.33,0.33
precision,0.694263,0.639973,0.591398,0.631994,0.527097,0.439356
recall,0.596305,0.523516,0.523516,0.449048,0.397536,0.397536
F1,0.641566,0.575916,0.555391,0.525041,0.45324,0.417402


In [26]:
# LD (Levenshtein distance): try every threshold in step_list with
# Function_Pool.levensthein_similarity_on_reduction_of_expansion.
# After transposing, each column is one alpha and the rows give the best-F1
# threshold together with its precision, recall and F1.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Function_Pool.levensthein_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,0.4,0.57,0.51,0.56,0.54,0.51
precision,0.910983,0.954795,0.9375,0.926862,0.851129,0.800344
recall,0.882419,0.780515,0.781075,0.780515,0.781075,0.781075
F1,0.896473,0.858903,0.852169,0.847416,0.814599,0.790592


In [27]:
# JWS (Jaro-Winkler similarity): try every threshold in step_list with
# Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion.
# After transposing, each column is one alpha and the rows give the best-F1
# threshold together with its precision, recall and F1.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,0.76,0.77,0.78,0.78,0.84,0.84
precision,0.948829,0.903607,0.908039,0.886972,0.915768,0.867262
recall,0.861702,0.855543,0.834826,0.834826,0.760918,0.760918
F1,0.903169,0.878919,0.869895,0.86011,0.831193,0.810617


In [28]:
# DC (Dice coefficient): try every threshold in step_list with
# Function_Pool.dice_coefficient_on_reduction_of_expansion.
# After transposing, each column is one alpha and the rows give the best-F1
# threshold together with its precision, recall and F1.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Function_Pool.dice_coefficient_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,0.61,0.72,0.72,0.74,0.8,0.81
precision,0.882653,0.923333,0.898767,0.875713,0.79379,0.89016
recall,0.871781,0.775476,0.775476,0.773236,0.758679,0.653415
F1,0.877183,0.84297,0.832582,0.821291,0.775837,0.753633


In [29]:
# ILLOD: evaluated with a search space holding the single value -1, so no
# threshold sweep takes place and the "threshold" row is always -1.
# NOTE(review): how ILLOD.illod interprets its third argument is not visible
# here — presumably a placeholder; confirm in ILLOD.
pd.DataFrame.from_dict(
    evaluate_algorithm(ILLOD.illod, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.992092,0.987886,0.984309,0.971412,0.955477,0.94332
recall,0.913214,0.913214,0.913214,0.913214,0.913214,0.913214
F1,0.95102,0.949084,0.94743,0.941414,0.933868,0.928023


In [30]:
# ILLOD+ (Variant A): Function_Pool.illod_plus evaluated with the single
# "threshold" value -1, so no threshold sweep takes place.
# NOTE(review): -1 appears to select Variant A inside illod_plus (Variant B
# is run with -2) — confirm in Function_Pool.
pd.DataFrame.from_dict(
    evaluate_algorithm(Function_Pool.illod_plus, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.997639,0.995289,0.989461,0.983702,0.977444,0.966819
recall,0.946249,0.946249,0.946249,0.946249,0.946249,0.946249
F1,0.971264,0.970149,0.967373,0.964612,0.961593,0.956423


In [31]:
# ILLOD+ (Variant B): Function_Pool.illod_plus evaluated with the single
# "threshold" value -2, so no threshold sweep takes place.
# NOTE(review): -2 appears to select Variant B inside illod_plus (Variant A
# is run with -1) — confirm in Function_Pool.
pd.DataFrame.from_dict(
    evaluate_algorithm(Function_Pool.illod_plus, [-2]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
precision,0.996303,0.99569,0.992634,0.991416,0.977039,0.964797
recall,0.905375,0.905375,0.905375,0.905375,0.905375,0.905375
F1,0.948665,0.948387,0.946999,0.946444,0.939843,0.934142


In [32]:
# ODIL: the illod_plus function of the ODIL module, evaluated with the single
# "threshold" value -2, so no threshold sweep takes place.
# NOTE(review): the meaning of -2 for ODIL.illod_plus is not visible here —
# confirm in ODIL.
pd.DataFrame.from_dict(
    evaluate_algorithm(ODIL.illod_plus, [-2]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
precision,0.996429,0.988777,0.990533,0.985866,0.965955,0.95439
recall,0.93729,0.93729,0.93729,0.93729,0.93729,0.93729
F1,0.965955,0.962346,0.963176,0.960964,0.951407,0.945763
