# Comparison of the 3 different AEP-Detection Types

In [2]:
import pandas as pd
import string
import jellyfish
import random
import ILLOD
import Function_Pool

In [3]:
# Load the abbreviation database. The file is semicolon-separated; the two
# column names are supplied here rather than read from the file.
data = pd.read_csv(
    'abbr_db.CSV',
    sep=';',
    names=['abbr', 'long_forms'],
    encoding='utf8',
)
# Parallel lists: entry k of both lists comes from row k of the file.
abbreviations = [*data['abbr'].values]
expansions = [*data['long_forms'].values]

## Semantic Classifier based on fastText

In [4]:
import pandas as pd
import fasttext
import fasttext.util
from scipy import spatial
# Optional one-time setup: uncomment the next line to fetch the English model
# through fasttext.util (if_exists='ignore' presumably skips the download when
# the model file is already present -- confirm against the fastText docs).
# fasttext.util.download_model('en', if_exists='ignore')
# Load the model from a relative path; the module-level name `ft` is read by
# fast_text_similarity and fast_text_sim.
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Decide whether two strings are similar enough according to fastText.

    Both strings are embedded with ``ft.get_sentence_vector`` and compared by
    cosine similarity, computed as 1 minus SciPy's cosine distance.

    Args:
        a: First string to embed.
        t: Second string to embed.
        threshold: Minimum cosine similarity that counts as a match.

    Returns:
        True if the similarity is greater than or equal to ``threshold``,
        False otherwise.
    """
    vec_a = ft.get_sentence_vector(a)
    vec_t = ft.get_sentence_vector(t)
    similarity = 1 - spatial.distance.cosine(vec_a, vec_t)
    # bool() keeps the return type a plain Python bool.
    return bool(similarity >= threshold)

    
def fast_text_sim(a, t):
    """Return the fastText cosine similarity of two strings.

    Args:
        a: First string to embed with ``ft.get_sentence_vector``.
        t: Second string to embed with ``ft.get_sentence_vector``.

    Returns:
        1 minus the SciPy cosine distance between the two sentence vectors.
    """
    embeddings = [ft.get_sentence_vector(text) for text in (a, t)]
    return 1 - spatial.distance.cosine(*embeddings)



## Cardinality of S

In [5]:
# Construction and cardinality of S: every (abbreviation, expansion)
# combination in which the expansion comes from a row whose abbreviation
# differs from the chosen abbreviation. Using a set removes duplicate pairs.
S = {
    (short_form, long_form)
    for short_form in abbreviations
    for row, long_form in enumerate(expansions)
    if short_form != abbreviations[row]
}
print(len(S))

2710125


# Evaluation of the 3 different AEP-Detection Types (Section 5.4)

In [6]:
def find_and_count_false_negatives(algo, threshold):
    """Count the given pairs that ``algo`` does not accept.

    Each entry of the module-level ``abbreviations`` list is paired with the
    entry of ``expansions`` at the same index, and the pair is handed to
    ``algo``.

    Args:
        algo: Callable invoked as ``algo(abbreviation, expansion, threshold)``;
            a falsy result is counted as a false negative.
        threshold: Passed through unchanged as the third argument of ``algo``.

    Returns:
        A tuple ``(count, message)``: the number of rejected pairs and a
        human-readable summary string.
    """
    total_pairs = len(abbreviations)
    # Each rejected pair contributes True (== 1) to the sum.
    missed = sum(
        not algo(short_form, expansions[position], threshold)
        for position, short_form in enumerate(abbreviations)
    )
    summary = "".join([
        str(missed),
        " FALSE NEGATIVES. Pairs that could not be detected out of ",
        str(total_pairs),
        " given pairs",
    ])
    return missed, summary

In [7]:
def find_and_count_false_positives(algo, threshold, alpha):
    """Count how often ``algo`` accepts randomly drawn non-matching pairs.

    False examples are built by rejection sampling: two row indices are drawn
    at random, and the abbreviation of the first row is combined with the
    expansion of the second row whenever the two rows' abbreviations differ.

    Args:
        algo: Callable invoked as ``algo(abbreviation, expansion, threshold)``;
            a truthy result on a false example is counted as a false positive.
        threshold: Passed through unchanged as the third argument of ``algo``.
        alpha: Ratio of false examples to given pairs; the test set holds
            ``alpha * len(abbreviations)`` examples (rounded up when the
            product is fractional).

    Returns:
        A tuple ``(count, message)``: the number of accepted false examples
        and a human-readable summary string.

    Raises:
        ValueError: If false examples are requested but fewer than two
            distinct abbreviations exist, so no false example can be built.
    """
    pair_count = len(abbreviations)
    target_size = alpha * pair_count

    # The sampling loop below can only make progress when two rows with
    # different abbreviations exist; fail loudly instead of spinning forever.
    if target_size > 0 and len(set(abbreviations)) < 2:
        raise ValueError(
            "need at least two distinct abbreviations to create false examples"
        )

    test_set = []
    # Strict '<' stops at exactly alpha * len(abbreviations) examples;
    # '<=' would draw one example too many for integer alpha.
    while len(test_set) < target_size:
        rd1 = random.randrange(pair_count)
        rd2 = random.randrange(pair_count)
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abb, exp in test_set if algo(abb, exp, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " + str(len(test_set)) + " created false examples"

In [8]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Search for the threshold that gives ``algo`` its highest F1 score.

    For every candidate threshold, false negatives are counted on the given
    pairs and false positives on a freshly sampled set of false examples;
    precision, recall and F1 are derived from those counts.

    Args:
        alpha: Forwarded to ``find_and_count_false_positives`` to size the
            set of false examples.
        algo: Callable invoked as ``algo(abbreviation, expansion, threshold)``.
        search_space_for_F1_optimization: Iterable of candidate thresholds.

    Returns:
        ``[threshold, precision, recall, f1]`` for the best threshold (the
        earliest one wins on ties), or an empty list if no threshold reached
        an F1 above 0.
    """
    max_f1 = 0
    best_values = []
    for th_ in search_space_for_F1_optimization:
        # Keep this call order: the false-positive step consumes random numbers.
        FN, _ = find_and_count_false_negatives(algo, th_)
        FP, _ = find_and_count_false_positives(algo, th_, alpha)
        TP = len(abbreviations) - FN

        # Without a single true positive, precision and recall are both 0 and
        # the harmonic mean would be 0/0. Guarding on TP (rather than on
        # FP + TP) also covers TP == 0 with FP > 0, which would otherwise
        # raise ZeroDivisionError in the F1 formula.
        if TP == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)

        # Remember the best F1 value seen so far.
        if f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [9]:
def evaluate_algorithm(algorithm, F1_optimization_search_space, alphas=(4, 8, 11, 14, 28, 42)):
    """Run the F1 threshold search for several false-example ratios.

    Args:
        algorithm: Callable invoked as
            ``algorithm(abbreviation, expansion, threshold)``.
        F1_optimization_search_space: Iterable of candidate thresholds,
            forwarded to ``determine_quality_parameters``.
        alphas: Ratios of false examples to given pairs to evaluate; the
            default reproduces the values previously hard-coded here.

    Returns:
        A dict mapping each alpha (in the given order) to the result of
        ``determine_quality_parameters`` for that alpha.
    """
    return {
        alpha: determine_quality_parameters(alpha, algorithm, F1_optimization_search_space)
        for alpha in alphas
    }

In [10]:
# Candidate thresholds for the F1 search: 0.00, 0.01, ..., 0.99.
step_list = [hundredth / 100 for hundredth in range(100)]

In [11]:
# FT (FASTTEXT): search step_list for the best cosine-similarity threshold and
# display threshold/precision/recall/F1 as rows, with one column per alpha.
pd.DataFrame.from_dict(
    evaluate_algorithm(fast_text_similarity, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

  dist = 1.0 - uv / np.sqrt(uu * vv)


Unnamed: 0,4,8,11,14,28,42
threshold,0.25,0.28,0.31,0.31,0.32,0.32
precision,0.686003,0.646581,0.674897,0.622155,0.48534,0.39613
recall,0.609183,0.534714,0.459127,0.459127,0.43561,0.43561
F1,0.645314,0.585351,0.546485,0.528351,0.459132,0.414933


In [12]:
# LD (LEVENSHTEIN-DISTANCE): search step_list for the best threshold and
# display threshold/precision/recall/F1 as rows, with one column per alpha.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Function_Pool.levensthein_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,0.36,0.54,0.56,0.55,0.57,0.65
precision,0.906322,0.960744,0.941256,0.92568,0.868536,0.811834
recall,0.882979,0.781075,0.780515,0.781075,0.780515,0.768197
F1,0.894498,0.861643,0.853382,0.847252,0.822176,0.789413


In [13]:
# JWS (JARO-WINKLER-SIMILARITY): search step_list for the best threshold and
# display threshold/precision/recall/F1 as rows, with one column per alpha.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,0.76,0.78,0.79,0.78,0.84,0.84
precision,0.945332,0.931875,0.91492,0.881727,0.913306,0.867262
recall,0.861702,0.834826,0.830907,0.834826,0.760918,0.760918
F1,0.901582,0.880685,0.870892,0.857636,0.830177,0.810617


In [14]:
# DC (DICE-COEFFICIENT): search step_list for the best threshold and
# display threshold/precision/recall/F1 as rows, with one column per alpha.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        Function_Pool.dice_coefficient_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,0.61,0.75,0.7,0.75,0.77,0.83
precision,0.894314,0.929966,0.898767,0.871293,0.791935,0.900463
recall,0.871781,0.773236,0.775476,0.773236,0.758679,0.653415
F1,0.882903,0.84439,0.832582,0.819341,0.77495,0.7573


In [15]:
# ILLOD: no threshold search here -- the search space is the single value -1,
# which is handed to ILLOD.illod as its third argument (how illod interprets
# it is not visible in this notebook).
pd.DataFrame.from_dict(
    evaluate_algorithm(ILLOD.illod, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.992696,0.987288,0.986094,0.983122,0.950466,0.929875
recall,0.913214,0.913214,0.913214,0.913214,0.913214,0.913214
F1,0.951298,0.948807,0.948256,0.94688,0.931468,0.921469


In [16]:
# ILLOD+(VariantA): the only "threshold" passed is -1, which presumably
# selects variant A inside Function_Pool.illod_plus -- confirm there.
pd.DataFrame.from_dict(
    evaluate_algorithm(Function_Pool.illod_plus, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.995875,0.994703,0.990621,0.984848,0.971823,0.970149
recall,0.946249,0.946249,0.946249,0.946249,0.946249,0.946249
F1,0.970428,0.969871,0.967927,0.965163,0.958865,0.95805


In [17]:
# ILLOD+(VariantB): the only "threshold" passed is -2, which presumably
# selects variant B inside Function_Pool.illod_plus -- confirm there.
pd.DataFrame.from_dict(
    evaluate_algorithm(Function_Pool.illod_plus, [-2]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,4,8,11,14,28,42
threshold,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
precision,0.997532,0.994465,0.990809,0.992025,0.97586,0.972339
recall,0.905375,0.905375,0.905375,0.905375,0.905375,0.905375
F1,0.949222,0.947831,0.946167,0.946721,0.939297,0.937663
