In [24]:
import pandas as pd
import string
import pandas as pd
import jellyfish
import random

# Abbreviation-Expansion List
Before going into the details of the ILLOD tool, we first give some insight into our evaluation data for AEP-Detection.

In [25]:
# Load the evaluation data from a ';'-separated file; the two column names are
# supplied explicitly through `names=`.
data = pd.read_csv('abbr_db.CSV', names=['abbr', 'long_forms'], sep=';', encoding='utf8')
# Parallel lists taken from the two columns: expansions[i] is the long form
# stored in the same row as abbreviations[i].
abbreviations = list(data['abbr'].values)
expansions = list(data['long_forms'].values)
# Uncomment to print every pair together with its row index:
#for i, abb in enumerate(abbreviations):
#    print(str(i) + ": " + abb + "| " + expansions[i])

# Introducing Helper Functions
These helper functions provide the building blocks for the syntactic and semantic similarity measures and for ILLOD. We need our own method to calculate the Dice coefficient between two given strings, since the jellyfish package does not provide this functionality.

In [26]:
def dice_coefficient(a, b, n=1):
    """Return the Dice coefficient 2*nt / (na + nb) of two strings.

    Both strings are lower-cased and turned into the *set* of their distinct
    character n-grams; ``na`` and ``nb`` are the sizes of those sets and ``nt``
    is the size of their intersection.

    Parameters
    ----------
    a, b : str
        Strings to compare.
    n : int, optional
        Length of the n-grams. The default ``1`` compares sets of single
        characters; pass ``2`` for the bigram variant.

    Returns
    -------
    float
        Value between 0.0 and 1.0. ``0.0`` is returned when neither string
        yields any n-gram (for example two empty strings), instead of raising
        ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    def distinct_ngrams(text):
        lowered = text.lower()
        return {lowered[i:i + n] for i in range(len(lowered) - n + 1)}

    a_grams = distinct_ngrams(a)
    b_grams = distinct_ngrams(b)
    total = len(a_grams) + len(b_grams)
    if total == 0:
        # Nothing to compare, so there is no evidence of similarity.
        return 0.0
    return len(a_grams & b_grams) * 2.0 / total

### Method to remove punctuation marks from a given string

In [27]:
def clean_string(s):
    """Return ``s`` in lower case with all ``string.punctuation`` characters removed."""
    # Deleting through a translation table leaves punctuation-free input
    # untouched, so no separate "contains punctuation" check is required.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return s.lower().translate(punctuation_remover)

### Method to remove stop words from a given term

In [28]:
def stop_words_handling(term):
    """Remove stop words from a whitespace-separated term.

    The stop words are "for", "and", "of", "in", "via" and "be". Because
    first-character matching matters for the later checks, a stop word that is
    the first word of the term is not treated as a stop word at all: every
    occurrence of that word is kept, not only the leading one.

    Parameters
    ----------
    term : str
        Term whose tokens are compared literally against the stop words
        (no case folding happens here).

    Returns
    -------
    str
        Remaining tokens joined by single spaces. An empty or whitespace-only
        term yields ``""`` instead of raising ``IndexError``.
    """
    tokens = term.split()
    if not tokens:
        return ""

    stop_words = {"for", "and", "of", "in", "via", "be"}
    # As first character matching is important, the leading word is never a stop word.
    stop_words.discard(tokens[0])

    return " ".join(token for token in tokens if token not in stop_words)

### Method to calculate and return $(a^{c}, potAbb(t^{c}))$  for a given pair $(a,t)$

In [29]:
def clean_string_pair_and_reduce_expansion(abb, term):
    """Clean an (abbreviation, term) pair and reduce the term to its initials.

    Both inputs are lower-cased and passed through ``clean_string``. The term
    additionally goes through ``stop_words_handling``; the first character of
    each remaining token is then concatenated.

    Returns a tuple ``(cleaned abbreviation, initial letters of the reduced term)``.
    """
    cleaned_abbreviation = clean_string(abb.lower())
    remaining_words = stop_words_handling(clean_string(term.lower())).split()
    potential_abbreviation = ''.join(word[0] for word in remaining_words)
    return cleaned_abbreviation, potential_abbreviation

# Classifiers based on hybrid approaches:
## Compare initial letters first before calculating syntactic similarity with  LD, JWS or DC.
## Algorithm 2 in different variants

In [30]:
def levensthein_distance_on_reduction_of_expansion(a, term, threshold):
    """Classify (a, term) via Levenshtein distance on the reduced expansion.

    The pair is cleaned with ``clean_string_pair_and_reduce_expansion``. It is
    accepted only if both resulting strings start with the same character and
    their Levenshtein distance is at most ``threshold``.

    Parameters
    ----------
    a : str
        Abbreviation candidate.
    term : str
        Expansion candidate.
    threshold : int or float
        Largest edit distance that still counts as a match.

    Returns
    -------
    bool
        ``True`` if the pair is accepted. ``False`` is also returned when either
        cleaned string is empty, since there is then no first character to compare.
    """
    a_, t_ = clean_string_pair_and_reduce_expansion(a, term)
    if not a_ or not t_:
        return False
    if a_[0] != t_[0]:
        return False
    return jellyfish.levenshtein_distance(a_, t_) <= threshold

In [31]:
def jaro_winkler_similarity_on_reduction_of_expansion(a, term, threshold):
    """Classify (a, term) via Jaro-Winkler similarity on the reduced expansion.

    The pair is cleaned with ``clean_string_pair_and_reduce_expansion``. It is
    accepted only if both resulting strings start with the same character and
    their Jaro-Winkler similarity is at least ``threshold``.

    Parameters
    ----------
    a : str
        Abbreviation candidate.
    term : str
        Expansion candidate.
    threshold : float
        Smallest similarity that still counts as a match.

    Returns
    -------
    bool
        ``True`` if the pair is accepted. ``False`` is also returned when either
        cleaned string is empty, since there is then no first character to compare.
    """
    a_, t_ = clean_string_pair_and_reduce_expansion(a, term)
    if not a_ or not t_:
        return False
    if a_[0] != t_[0]:
        return False
    return jellyfish.jaro_winkler_similarity(a_, t_) >= threshold

In [32]:
def dice_coefficient_on_reduction_of_expansion(a, term, threshold):
    """Classify (a, term) via the Dice coefficient on the reduced expansion.

    The pair is cleaned with ``clean_string_pair_and_reduce_expansion``. It is
    accepted only if both resulting strings start with the same character and
    ``dice_coefficient`` of the two is at least ``threshold``.

    Parameters
    ----------
    a : str
        Abbreviation candidate.
    term : str
        Expansion candidate.
    threshold : float
        Smallest coefficient that still counts as a match.

    Returns
    -------
    bool
        ``True`` if the pair is accepted. ``False`` is also returned when either
        cleaned string is empty, since there is then no first character to compare.
    """
    a_, t_ = clean_string_pair_and_reduce_expansion(a, term)
    if not a_ or not t_:
        return False
    if a_[0] != t_[0]:
        return False
    return dice_coefficient(a_, t_) >= threshold

# ILLOD with its Methods (Section 4.3)

In [33]:
def check_initial_letters(a, t):
    """Check whether the initial letters of the tokens of ``t`` spell ``a``.

    The first characters of the whitespace-separated tokens of ``t`` are
    concatenated and compared with ``a``, once as they are and once upper-cased.
    ``a`` itself is never case-folded.

    Returns
    -------
    bool
        ``True`` on a match, otherwise ``False`` (always an explicit boolean,
        never ``None``).
    """
    initial_letters_of_tokens_of_t = ''.join(token[0] for token in t.split())
    return a in (initial_letters_of_tokens_of_t, initial_letters_of_tokens_of_t.upper())

def check_length_consistency(a, t):
    """Return True when ``t`` has at most as many tokens as ``a`` has characters."""
    token_count = len(t.split())
    return token_count <= len(a)

def check_order(a, t):
    """Check whether the characters of ``a`` occur in ``t`` in the same order.

    Both strings are lower-cased and scanned from right to left. Each
    abbreviation character is matched to the first still-unused term character
    found in that direction.

    Returns
    -------
    tuple
        ``(True, positions)`` when every character of ``a`` was matched, where
        ``positions`` holds the matched indices into ``t`` in ascending order;
        otherwise ``(False, [])``.
    """
    reversed_abbreviation = a.lower()[::-1]
    reversed_term = t.lower()[::-1]
    term_length = len(t)

    cursor = 0  # start of the not yet consumed part of the reversed term
    matched_positions = []
    for wanted in reversed_abbreviation:
        hit = reversed_term.find(wanted, cursor)
        if hit == -1:
            # One unmatched character is enough to reject the pair.
            return False, []
        cursor = hit + 1
        # Convert the offset in the reversed term back to an index into t.
        matched_positions.append(term_length - cursor)

    return True, matched_positions[::-1]

def check_distribution_of_matching_characters(pos_of_chars_list, t):
    """Check how the matched character positions are spread over the words of ``t``.

    ``t`` is cut at single spaces into words. For every word the matched
    positions falling inside it are collected: a position counts as ``0`` when
    its character equals the word's first character, otherwise as its offset
    inside the word. A word without any matched position gets ``[-1]``.

    The check fails if any word after the first one has exactly one entry and
    that entry is not ``0``. Terms with at most one word always pass.

    Parameters
    ----------
    pos_of_chars_list : list of int
        Indices into ``t`` of the matched characters.
    t : str
        The term.

    Returns
    -------
    bool
    """
    term_length = len(t)

    # Index ranges of the words, splitting on the space character only.
    word_spans = []
    start = 0
    while start < term_length:
        end = start
        while end < term_length and t[end] != " ":
            end += 1
        word_spans.append(list(range(start, end)))
        start = end + 1

    words = t.split()

    hits_per_word = []
    for word_index, span in enumerate(word_spans):
        hits = []
        for pos in pos_of_chars_list:
            if pos not in span:
                continue
            if words[word_index][0] == t[pos]:
                hits.append(0)
            else:
                hits.append(span.index(pos))
        hits_per_word.append(hits if hits else [-1])

    if len(hits_per_word) <= 1:
        return True
    return not any(len(hits) == 1 and 0 not in hits for hits in hits_per_word[1:])



def illod(abbv, term, threshold=None):
    """Decide whether ``abbv`` and ``term`` form an abbreviation-expansion pair.

    The pair is rejected at once unless both strings start with the same
    letter (case-insensitive). Otherwise it is accepted as soon as one of the
    following holds:

    (a) the initial letters of the tokens of ``term`` match ``abbv``;
    (b) after cleaning both strings and removing stop words, the initial
        letters of the remaining tokens equal the cleaned abbreviation;
    (c)-(e) the length, order and distribution checks all succeed on the
        cleaned abbreviation and the cleaned, stop-word-free term.

    Parameters
    ----------
    abbv : str
        Abbreviation candidate.
    term : str
        Expansion candidate.
    threshold : optional
        Not used. It is accepted so that this function can be called with the
        same three arguments as the threshold-based classifiers.

    Returns
    -------
    bool
        ``True`` if the pair is accepted. Empty inputs, and inputs that are
        empty once punctuation is removed, yield ``False`` instead of raising
        ``IndexError``.
    """
    # Empty strings cannot be indexed below and never form a pair.
    if not abbv or not term:
        return False

    ################################## in case first letter differs ##########################
    if abbv[0].lower() != term[0].lower():
        return False

    ###################################### Step (a) ##########################################
    # check whether initial letters of tokens in t match with the letters in abbreviation
    if check_initial_letters(abbv, term):
        return True

    ###################################### Step (b) ########################################
    # clean abbreviation and term from special characters and stopwords
    sanitized_abbv = clean_string(abbv)
    sanitized_term = clean_string(term)
    # If nothing is left after cleaning, none of the remaining steps can
    # succeed, and the reduction below would have no first word to look at.
    if not sanitized_abbv or not sanitized_term.split():
        return False

    a_, t_ = clean_string_pair_and_reduce_expansion(abbv, term)
    if a_ == t_:
        return True

    sanitized_term_without_stopswords = stop_words_handling(sanitized_term)

    ###################################### Step (c), (d), (e) ###############################
    # Sequential call of the methods that check and compare lengths, order and distribution
    # of characters; each later check only runs if the earlier ones passed.
    if not check_length_consistency(sanitized_abbv, sanitized_term_without_stopswords):
        return False

    order, pos_of_chars_list = check_order(sanitized_abbv, sanitized_term_without_stopswords)
    if not order:
        return False

    return check_distribution_of_matching_characters(pos_of_chars_list, sanitized_term_without_stopswords)


# Evaluation of the 3 different AEP-Detection Types (Section 4.4)

In [34]:
def find_and_count_false_negatives(algo, threshold):
    """Count the known pairs that ``algo`` fails to detect.

    ``algo(abbreviation, expansion, threshold)`` is called for every pair taken
    from the module-level ``abbreviations`` / ``expansions`` lists; each falsy
    result counts as one false negative.

    Returns a tuple ``(count, human-readable summary string)``.
    """
    missed = sum(
        1
        for idx, abbreviation in enumerate(abbreviations)
        if not algo(abbreviation, expansions[idx], threshold)
    )
    summary = "{} FALSE NEGATIVES. Pairs that could not be detected out of {} given pairs".format(
        missed, len(abbreviations)
    )
    return missed, summary

In [35]:
def find_and_count_false_positives(algo, threshold, alpha, rng=None):
    """Count how many randomly built non-pairs ``algo`` wrongly accepts.

    False examples are created by combining the abbreviation of one randomly
    chosen row with the expansion of another randomly chosen row from the
    module-level ``abbreviations`` / ``expansions`` lists. A combination is
    only used when the two rows carry different abbreviations.

    Parameters
    ----------
    algo : callable
        Classifier called as ``algo(abbreviation, expansion, threshold)``.
    threshold
        Passed through to ``algo`` unchanged.
    alpha : int or float
        Size of the false-example set as a multiple of ``len(abbreviations)``.
    rng : optional
        Object providing ``randint(a, b)``, e.g. ``random.Random(seed)``, to
        make the sampling reproducible. Defaults to the ``random`` module.

    Returns
    -------
    tuple
        ``(count, human-readable summary string)``.

    Raises
    ------
    ValueError
        If false examples are requested but fewer than two distinct
        abbreviations exist, in which case no false example can be built.
    """
    if rng is None:
        rng = random

    pair_count = len(abbreviations)
    target_size = alpha * pair_count

    # Without two distinct abbreviations the sampling loop could never finish.
    if target_size > 0 and len(set(abbreviations)) < 2:
        raise ValueError("at least two distinct abbreviations are required to create false examples")

    test_set = []
    # Strict '<' so that exactly alpha * len(abbreviations) examples are drawn
    # when that product is a whole number.
    while len(test_set) < target_size:
        rd1 = rng.randint(0, pair_count - 1)
        rd2 = rng.randint(0, pair_count - 1)
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abbreviation, expansion in test_set if algo(abbreviation, expansion, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " + str(len(test_set)) + " created false examples"

In [36]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Find the threshold with the best F1 score for ``algo``.

    For every threshold in the search space the false negatives are counted on
    the known pairs and the false positives on a freshly sampled set of false
    examples (sized by ``alpha``). Precision, recall and F1 are derived from
    these counts.

    Parameters
    ----------
    alpha : int or float
        Passed to ``find_and_count_false_positives``.
    algo : callable
        Classifier called as ``algo(abbreviation, expansion, threshold)``.
    search_space_for_F1_optimization : iterable
        Thresholds to try.

    Returns
    -------
    list
        ``[threshold, precision, recall, f1]`` of the best threshold. On ties
        the earliest threshold wins. The list is empty only when the search
        space itself is empty.
    """
    best_values = []
    max_f1 = None
    for th_ in search_space_for_F1_optimization:
        FN = find_and_count_false_negatives(algo, th_)[0]
        FP = find_and_count_false_positives(algo, th_, alpha)[0]
        TP = len(abbreviations) - FN

        # Every ratio falls back to 0 when its denominator is 0. This covers a
        # classifier that accepts nothing as well as one that only produces
        # false positives (precision + recall == 0).
        precision = TP / (TP + FP) if TP + FP > 0 else 0.0
        recall = TP / (TP + FN) if TP + FN > 0 else 0.0
        if precision + recall > 0:
            f1 = (2 * precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        # memorise the best F1 value in the loop so far; the first threshold is
        # always recorded so that a result row exists even if every F1 is 0.
        if max_f1 is None or f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [37]:
def evaluate_algorithm (algorithm, F1_optimization_search_space, alphas=(8, 16, 24, 48, 72)):
    """Run the F1 optimisation of ``algorithm`` for several values of alpha.

    Parameters
    ----------
    algorithm : callable
        Classifier called as ``algorithm(abbreviation, expansion, threshold)``.
    F1_optimization_search_space : iterable
        Thresholds to try for each alpha.
    alphas : iterable, optional
        Alpha values to evaluate; defaults to the five values used in this
        notebook.

    Returns
    -------
    dict
        Maps each alpha to the result of ``determine_quality_parameters``.
    """
    return {
        alpha: determine_quality_parameters(alpha, algorithm, F1_optimization_search_space)
        for alpha in alphas
    }

In [38]:
# Candidate thresholds 0.00, 0.01, ..., 0.99 for the similarity-based classifiers.
step_list = [hundredth / 100 for hundredth in range(100)]

In [40]:
# LD (Levenshtein distance): the candidate thresholds are the edit distances 0 to 3.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        levensthein_distance_on_reduction_of_expansion,
        list(range(0, 4)),
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,1,0.952,0.799552,0.869142
16,1,0.913043,0.799552,0.852537
24,1,0.870732,0.799552,0.833625
48,1,0.775244,0.799552,0.787211
72,1,0.679353,0.799552,0.734568


In [41]:
# JWS (Jaro-Winkler similarity): thresholds are taken from step_list.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        jaro_winkler_similarity_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,0.76,0.919277,0.854423,0.885665
16,0.77,0.862265,0.848264,0.855207
24,0.8,0.910703,0.776596,0.83832
48,0.84,0.873312,0.760358,0.81293
72,0.84,0.806413,0.760358,0.782709


In [42]:
# DC (Dice coefficient): thresholds are taken from step_list.
pd.DataFrame.from_dict(
    evaluate_algorithm(
        dice_coefficient_on_reduction_of_expansion,
        step_list,
    ),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,0.44,0.85794,0.946809,0.900186
16,0.63,0.884483,0.861702,0.872944
24,0.67,0.938095,0.772116,0.847052
48,0.72,0.887959,0.772116,0.825996
72,0.8,0.830873,0.756439,0.791911


In [43]:
# ILLOD: the classifier does not use its threshold argument, so a single
# placeholder value (-1) serves as the whole search space.
pd.DataFrame.from_dict(
    evaluate_algorithm(illod, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,-1,0.987871,0.912094,0.948472
16,-1,0.9737,0.912094,0.941891
24,-1,0.958235,0.912094,0.934596
48,-1,0.922424,0.912094,0.91723
72,-1,0.885326,0.912094,0.898511
