# Experiments with Hybrid Approaches
The syntactic classifiers presented in the paper are used in a modified form in this experiment. Before the similarity calculation, this run checks whether the abbreviation and the possible expansion have the same initial letter. If not, "False" is returned as the classification result.

In [None]:
import pandas as pd
import string
import pandas as pd
import jellyfish
import random
import ILLOD

# Loading of Abbreviation-Expansion List

In [4]:
# Read the semicolon-separated abbreviation/expansion pairs. Column names are
# supplied explicitly via `names`, so every row of the file is treated as data.
data = pd.read_csv(
    'abbr_db.CSV',
    names=['abbr', 'long_forms'],
    sep=';',
    encoding='utf8',
)

# Parallel lists: expansions[i] is the long form belonging to abbreviations[i].
abbreviations = list(data['abbr'].values)
expansions = list(data['long_forms'].values)

# Helper Functions

In [5]:
def dice_coefficient(a, b):
    """Return the Dice coefficient 2*nt / (na + nb) of two strings.

    The comparison is case-insensitive and works on the sets of distinct
    characters of each string (single characters, not character bigrams):
    ``na`` and ``nb`` are the numbers of distinct characters in ``a`` and
    ``b``, and ``nt`` is the number of characters they share.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in [0.0, 1.0]. If both strings are empty there is nothing to
        compare and 0.0 is returned instead of dividing by zero.
    """
    a_chars = set(a.lower())
    b_chars = set(b.lower())
    total = len(a_chars) + len(b_chars)
    if total == 0:
        # Two empty strings: the formula would be 0/0.
        return 0.0
    overlap = len(a_chars & b_chars)
    return overlap * 2.0 / total

def clean_string(s):
    """Return ``s`` lower-cased with every ``string.punctuation`` character removed."""
    # Deleting through a translation table leaves a punctuation-free string
    # untouched, so no separate "contains punctuation" check is required.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return s.lower().translate(drop_punctuation)

def stop_words_handling(term):
    """Remove stop words from a whitespace-separated term.

    The term is split on whitespace, the stop words "for", "and", "of", "in",
    "via" and "be" are dropped, and the remaining words are re-joined with
    single spaces.

    As first-character matching is important, a stop word that is the first
    word of the term is kept. NOTE: in that case the same word is also kept at
    every later position, because it is taken out of the stop-word set as a
    whole.

    Args:
        term: The (already cleaned, lower-cased) term.

    Returns:
        The term without stop words; an empty string if ``term`` is empty or
        consists only of whitespace.
    """
    words = term.split()
    if not words:
        # Nothing to sanitize; indexing words[0] below would raise IndexError.
        return ""

    stop_words = {"for", "and", "of", "in", "via", "be"}
    # Protect the leading word so the initial letter of the term is preserved.
    stop_words.discard(words[0])

    return " ".join(w for w in words if w not in stop_words)

def clean_string_pair_and_reduce_expansion(abb, term):
    """Normalize an abbreviation and reduce its candidate expansion to initials.

    Both inputs are lower-cased and passed through ``clean_string``. The
    expansion is additionally passed through ``stop_words_handling`` and then
    collapsed to the first letter of each remaining whitespace-separated token.

    Returns:
        A tuple ``(cleaned_abbreviation, initials_of_expansion)``.
    """
    cleaned_abbreviation = clean_string(abb.lower())
    reduced_expansion = stop_words_handling(clean_string(term.lower()))
    # Tokens produced by split() are never empty, so token[0] is always valid.
    initials = ''.join(token[0] for token in reduced_expansion.split())
    return cleaned_abbreviation, initials

# Classifiers based on hybrid approaches (compare initial letters first before calculating syntactic similarity with LD, JWS or DC).
## Algorithm 2 in different variants:

In [7]:
def levensthein_distance_on_reduction_of_expansion(a, term, threshold):
    """Classify ``(a, term)`` as abbreviation/expansion pair using Levenshtein distance.

    The abbreviation is cleaned and the expansion is reduced to the initial
    letters of its tokens. The pair is rejected immediately unless both
    results start with the same character; otherwise it is accepted when
    their Levenshtein distance is at most ``threshold``.

    Args:
        a: Abbreviation candidate.
        term: Expansion candidate.
        threshold: Maximum allowed edit distance (inclusive).

    Returns:
        True if the pair is classified as matching, False otherwise.
    """
    a_, t_ = clean_string_pair_and_reduce_expansion(a, term)
    # Either side can be empty after cleaning (e.g. punctuation-only input);
    # such a pair cannot match, and indexing [0] would raise IndexError.
    if not a_ or not t_ or a_[0] != t_[0]:
        return False
    return jellyfish.levenshtein_distance(a_, t_) <= threshold

In [8]:
def jaro_winkler_similarity_on_reduction_of_expansion(a, term, threshold):
    """Classify ``(a, term)`` as abbreviation/expansion pair using Jaro-Winkler similarity.

    The abbreviation is cleaned and the expansion is reduced to the initial
    letters of its tokens. The pair is rejected immediately unless both
    results start with the same character; otherwise it is accepted when
    their Jaro-Winkler similarity is at least ``threshold``.

    Args:
        a: Abbreviation candidate.
        term: Expansion candidate.
        threshold: Minimum required similarity (inclusive).

    Returns:
        True if the pair is classified as matching, False otherwise.
    """
    a_, t_ = clean_string_pair_and_reduce_expansion(a, term)
    # Either side can be empty after cleaning (e.g. punctuation-only input);
    # such a pair cannot match, and indexing [0] would raise IndexError.
    if not a_ or not t_ or a_[0] != t_[0]:
        return False
    return jellyfish.jaro_winkler_similarity(a_, t_) >= threshold

In [9]:
def dice_coefficient_on_reduction_of_expansion(a, term, threshold):
    """Classify ``(a, term)`` as abbreviation/expansion pair using the Dice coefficient.

    The abbreviation is cleaned and the expansion is reduced to the initial
    letters of its tokens. The pair is rejected immediately unless both
    results start with the same character; otherwise it is accepted when
    ``dice_coefficient`` of the two is at least ``threshold``.

    Args:
        a: Abbreviation candidate.
        term: Expansion candidate.
        threshold: Minimum required Dice coefficient (inclusive).

    Returns:
        True if the pair is classified as matching, False otherwise.
    """
    a_, t_ = clean_string_pair_and_reduce_expansion(a, term)
    # Either side can be empty after cleaning (e.g. punctuation-only input);
    # such a pair cannot match, and indexing [0] would raise IndexError.
    if not a_ or not t_ or a_[0] != t_[0]:
        return False
    return dice_coefficient(a_, t_) >= threshold

# Evaluation of the 3 different AEP-Detection Types (Section 4.4)

In [10]:
def find_and_count_false_negatives(algo, threshold):
    """Count the known abbreviation/expansion pairs that ``algo`` fails to detect.

    Every pair ``(abbreviations[i], expansions[i])`` from the module-level
    lists is passed to ``algo`` together with ``threshold``; each pair for
    which ``algo`` returns a falsy value counts as a false negative.

    Returns:
        A tuple ``(count, message)`` with the number of false negatives and a
        human-readable summary string.
    """
    missed = sum(
        1
        for idx, abbreviation in enumerate(abbreviations)
        if not algo(abbreviation, expansions[idx], threshold)
    )
    summary = (
        f"{missed} FALSE NEGATIVES. Pairs that could not be detected out of "
        f"{len(abbreviations)} given pairs"
    )
    return missed, summary

In [11]:
def find_and_count_false_positives(algo, threshold, alpha):
    """Count how many randomly generated wrong pairs ``algo`` accepts.

    Negative examples are built by randomly combining an abbreviation with the
    expansion stored for a *different* abbreviation from the module-level
    lists. Sampling continues until ``alpha * len(abbreviations)`` examples
    exist (previously one extra example was created because of a ``<=``
    comparison). Each example accepted by ``algo`` counts as a false positive.

    Args:
        algo: Classifier called as ``algo(abbreviation, expansion, threshold)``.
        threshold: Threshold forwarded to ``algo``.
        alpha: Size of the negative set relative to the number of known pairs.

    Returns:
        A tuple ``(count, message)`` with the number of false positives and a
        human-readable summary string.

    Raises:
        ValueError: If negative examples are requested but all abbreviations
            are equal, so that no wrong pair could ever be drawn.
    """
    pair_count = len(abbreviations)
    target_size = alpha * pair_count

    # Without two differing abbreviations the sampling loop would never end.
    if target_size > 0 and not any(abb != abbreviations[0] for abb in abbreviations):
        raise ValueError(
            "at least two different abbreviations are required to create false examples"
        )

    test_set = []
    while len(test_set) < target_size:
        rd1 = random.randint(0, pair_count - 1)
        rd2 = random.randint(0, pair_count - 1)
        # Only combine an abbreviation with the expansion of another abbreviation.
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abb, expansion in test_set if algo(abb, expansion, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " + str(len(test_set)) + " created false examples"

In [12]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Find the threshold with the best F1 score for ``algo``.

    For every threshold in the search space, false negatives are counted on
    the known pairs and false positives on a freshly sampled negative set of
    relative size ``alpha``; precision, recall and F1 are derived from them.

    Args:
        alpha: Relative size of the negative example set.
        algo: Classifier called as ``algo(abbreviation, expansion, threshold)``.
        search_space_for_F1_optimization: Iterable of thresholds to try.

    Returns:
        ``[threshold, precision, recall, f1]`` for the first threshold that
        reaches the highest F1 score, or an empty list if no threshold
        achieves an F1 score above 0.
    """
    max_f1 = 0
    best_values = []
    for th_ in search_space_for_F1_optimization:
        FN, _ = find_and_count_false_negatives(algo, th_)
        FP, _ = find_and_count_false_positives(algo, th_, alpha)
        TP = len(abbreviations) - FN

        # A classifier without any true positive is not useful. Checking TP
        # (rather than FP + TP) also covers TP == 0 with FP > 0, where
        # precision and recall are both 0 and the F1 formula would divide by zero.
        if TP == 0:
            precision = 0
            recall = 0
            f1 = 0
        else:
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)

        # Memorise the best F1 value in the loop so far.
        if f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [13]:
def evaluate_algorithm (algorithm, F1_optimization_search_space):
    """Evaluate ``algorithm`` for each of the alpha values 8, 16, 24, 48 and 72.

    Returns:
        A dict mapping each alpha to the result of
        ``determine_quality_parameters`` for that alpha.
    """
    alphas = (8, 16, 24, 48, 72)
    return {
        alpha: determine_quality_parameters(alpha, algorithm, F1_optimization_search_space)
        for alpha in alphas
    }

In [14]:
# Candidate similarity thresholds 0.00, 0.01, ..., 0.99.
step_list = [h / 100 for h in range(100)]

In [17]:
# LD (LEVENSHTEIN_DISTANCE): edit-distance thresholds 0, 1, 2 and 3 are searched;
# the table shows the best-F1 result per alpha (one column per alpha).
pd.DataFrame.from_dict(
    evaluate_algorithm(levensthein_distance_on_reduction_of_expansion, list(range(4))),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,1.0,1.0,1.0,1.0,1.0
precision,0.959677,0.907819,0.859206,0.762821,0.678385
recall,0.799552,0.799552,0.799552,0.799552,0.799552
F1,0.872327,0.850253,0.828306,0.780755,0.734002


In [18]:
# JWS (JARO-WINKLER-SIMILARITY): thresholds from step_list are searched;
# the table shows the best-F1 result per alpha (one column per alpha).
pd.DataFrame.from_dict(
    evaluate_algorithm(jaro_winkler_similarity_on_reduction_of_expansion, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,0.67,0.73,0.79,0.78,0.79
precision,0.904094,0.860124,0.921159,0.855444,0.800351
recall,0.865622,0.853863,0.765398,0.765398,0.765398
F1,0.884439,0.856982,0.836086,0.80792,0.782484


In [19]:
# DC (DICE-COEFFICIENT): thresholds from step_list are searched;
# the table shows the best-F1 result per alpha (one column per alpha).
pd.DataFrame.from_dict(
    evaluate_algorithm(dice_coefficient_on_reduction_of_expansion, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,0.41,0.6,0.69,0.68,0.68
precision,0.863335,0.882723,0.936185,0.872785,0.81937
recall,0.947928,0.863942,0.772116,0.772116,0.772116
F1,0.903656,0.873231,0.846272,0.81937,0.795042


In [20]:
# ILLOD: evaluated with the single threshold value -1
# (presumably a placeholder that ILLOD.illod ignores -- confirm against the ILLOD module).
pd.DataFrame.from_dict(
    evaluate_algorithm(ILLOD.illod, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
).T

Unnamed: 0,8,16,24,48,72
threshold,-1.0,-1.0,-1.0,-1.0,-1.0
precision,0.987273,0.980144,0.951519,0.920339,0.895547
recall,0.912094,0.912094,0.912094,0.912094,0.912094
F1,0.948196,0.944896,0.931389,0.916198,0.903745
