In [42]:
import pandas as pd
import string
import pandas as pd
import jellyfish
import random
import fasttext

In [43]:
import fasttext.util
from scipy import spatial
# One-time setup (presumably fetches the model file loaded below - confirm):
# uncomment the next line if "cc.en.300.bin" is not available locally.
# fasttext.util.download_model('en', if_exists='ignore')
# Load the fastText model once; `ft` is used by fast_text_similarity and fast_text_sim.
ft = fasttext.load_model("cc.en.300.bin")



# Abbreviation-Expansion List
Before we go into the details of the ILLOD tool, we first give some insights into our evaluation data for AEP detection.

In [44]:
# Load the abbreviation/expansion pairs: one pair per row, ';'-separated,
# no header row in the file.
# keep_default_na=False keeps entries such as "NA", "NULL" or "NaN" as literal
# strings. With the pandas default they would be parsed as missing values
# (float NaN), which breaks the string handling in the rest of the notebook.
data = pd.read_csv(
    'abbr_db.CSV',
    names=['abbr', 'long_forms'],
    sep=';',
    encoding='utf8',
    keep_default_na=False,
)
abbreviations = list(data['abbr'].values)
expansions = list(data['long_forms'].values)

# Print every pair together with its row index.
for i, (abb, expansion) in enumerate(zip(abbreviations, expansions)):
    print(str(i) + ": " + abb + "| " + expansion)

0: AA| Anti-alias
1: AAC| Advanced Audio Coding
2: AAM| autmoatic acoustic management
3: AAS| Auto Area Segmentation
4: AAS| as a service
5: ABC| Atanasoff-Berry Computer
6: ABIOS| Advanced BIOS
7: ABP| AdBlock Plus
8: AC| alternating current
9: ACCT| account
10: ACE| access control entry
11: Ack| acknowledgment
12: ACL| access control list
13: ACM| Association for Computing Machinery
14: ACPI| Advanced Configuration and Power Interface
15: ACR| annual compliance report
16: ACR| actual cell rate
17: ACR| attenuation crosstalk ratio
18: ACR| absolute cell reference
19: ACS| access control system
20: AD| Active Directory
21: ADB| Apple Desktop Bus
22: ADB| Android Debug Bridge
23: ADC| analog-to-digital
24: ADO| ActiveX Data Object
25: ADPCM| adaptive delta pulse code modulation
26: ADSI| analog display service interface
27: ADSI| Active Directory Service Interface
28: ADSL| asymmetric digital subscriber line
29: ADSM| ADSTAR Distributed Storage Management
30: AE| automatic exposure
31: 

# Introducing Helper Functions
These helper functions provide the building blocks for the syntactic and semantic similarity measures and for ILLOD. We need our own method to calculate the Dice coefficient between two given strings, since the jellyfish package doesn't provide this functionality.

In [45]:
def dice_coefficient(a, b):
    """Return the Dice coefficient 2*nt / (na + nb) of two strings.

    The comparison is case-insensitive and works on the sets of distinct
    characters of ``a`` and ``b`` (not on bigrams): ``nt`` is the number of
    distinct characters both strings share, ``na`` and ``nb`` are the numbers
    of distinct characters in each string.

    Returns:
        A float between 0.0 and 1.0. If both strings are empty there is
        nothing to compare and 0.0 is returned instead of dividing by zero.
    """
    a_chars = set(a.lower())
    b_chars = set(b.lower())
    total = len(a_chars) + len(b_chars)
    if total == 0:
        # Both inputs are empty: avoid a ZeroDivisionError.
        return 0.0
    overlap = len(a_chars & b_chars)
    return overlap * 2.0 / total

### Method to remove punctuation marks from a given string

In [46]:
def clean_string(s):
    """Return ``s`` in lower case with all ``string.punctuation`` characters removed."""
    # Translating with a delete-table is a no-op when no punctuation is present,
    # so no separate "does it contain punctuation" check is needed.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return s.lower().translate(punctuation_remover)

### Method to remove stop words from a given term

In [47]:
def stop_words_handling(term):
    """Remove stop words from a whitespace-separated term.

    The stop words are "for", "and", "of", "in", "via" and "be"; matching is
    case-sensitive. If the first token of the term is itself a stop word, that
    word is kept everywhere in the term.

    Args:
        term: The term to sanitize.

    Returns:
        The remaining tokens joined by single spaces. An empty or
        whitespace-only term yields "" (instead of raising IndexError).
    """
    tokens = term.split()
    if not tokens:
        # Nothing to sanitize; also avoids indexing tokens[0] on an empty list.
        return ""

    # The leading token is exempt from removal, even when it is a stop word.
    removable = {"for", "and", "of", "in", "via", "be"} - {tokens[0]}
    return " ".join(token for token in tokens if token not in removable)

### Method to calculate and return $(a^{c}, potAbb(t^{c}))$  for a given pair $(a,t)$

In [48]:
def clean_string_pair_and_reduce_expansion(abb, term):
    """Return the cleaned abbreviation and the potential abbreviation of the cleaned term.

    Both inputs are lower-cased and passed through ``clean_string``. The term
    is additionally passed through ``stop_words_handling`` and then reduced to
    the concatenated first characters of its remaining tokens.
    """
    cleaned_abbreviation = clean_string(abb.lower())
    reduced_term = stop_words_handling(clean_string(term.lower()))
    potential_abbreviation = ''.join(token[0] for token in reduced_term.split())
    return cleaned_abbreviation, potential_abbreviation

# Classifiers based on semantic similarity (FastText)
## Algorithm 1

In [49]:
def fast_text_similarity(a, t, threshold):
    """Return True when the cosine similarity of the fastText vectors of ``a`` and ``t`` is at least ``threshold``."""
    vector_a = ft.get_word_vector(a)
    vector_t = ft.get_word_vector(t)
    cosine_similarity = 1 - spatial.distance.cosine(vector_a, vector_t)
    # bool() keeps the return value a plain Python bool.
    return bool(cosine_similarity >= threshold)

##  Cosine Similarity on Fasttext Wordvectors

In [50]:
def fast_text_sim(a, t):
    """Return the cosine similarity (1 - cosine distance) of the fastText vectors of ``a`` and ``t``."""
    vector_a, vector_t = (ft.get_word_vector(text) for text in (a, t))
    return 1 - spatial.distance.cosine(vector_a, vector_t)

# Classifiers based on syntactic similarity (LD, JWS, DC)
## Algorithm 2 in different variants

In [51]:
def levensthein_distance_on_reduction_of_expansion(a, term, threshold):
    """Classify (a, term) as a match when the Levenshtein distance is at most ``threshold``.

    The distance is measured between the cleaned abbreviation and the
    potential abbreviation returned by ``clean_string_pair_and_reduce_expansion``.
    """
    cleaned_abbreviation, potential_abbreviation = clean_string_pair_and_reduce_expansion(a, term)
    edit_distance = jellyfish.levenshtein_distance(cleaned_abbreviation, potential_abbreviation)
    return bool(edit_distance <= threshold)

In [52]:
def jaro_winkler_similarity_on_reduction_of_expansion(a, term, threshold):
    """Classify (a, term) as a match when the Jaro-Winkler similarity is at least ``threshold``.

    The similarity is measured between the cleaned abbreviation and the
    potential abbreviation returned by ``clean_string_pair_and_reduce_expansion``.
    """
    cleaned_abbreviation, potential_abbreviation = clean_string_pair_and_reduce_expansion(a, term)
    similarity = jellyfish.jaro_winkler_similarity(cleaned_abbreviation, potential_abbreviation)
    return bool(similarity >= threshold)

In [53]:
def dice_coefficient_on_reduction_of_expansion(a, term, threshold):
    """Classify (a, term) as a match when the Dice coefficient is at least ``threshold``.

    The coefficient is measured between the cleaned abbreviation and the
    potential abbreviation returned by ``clean_string_pair_and_reduce_expansion``.
    """
    cleaned_abbreviation, potential_abbreviation = clean_string_pair_and_reduce_expansion(a, term)
    similarity = dice_coefficient(cleaned_abbreviation, potential_abbreviation)
    return bool(similarity >= threshold)

# Replicability: 
## Similarities for Table 1 (Section 4.2)

In [54]:
# Ten example (abbreviation, expansion) pairs; the loop in the following cell
# prints the similarity of each pair under every measure.
random_AEP_pairs = [
    ("LED monitor", "light-emitting diode"),
    ("Int", "integer"),
    ("PS/2", "Personal System/2"),
    ("IANA", "Internet Assigned Numbers Authority"),
    ("SMM", "System Management Mode"),
    ("U/L", "upload"),
    ("IAP", "Internet access provider"),
    ("CLNS", "connectionless network service"),
    ("MMC", "MultiMediaCard"),
    ("I/O", "input/output")
]

In [55]:
# Labels and the functions they stand for, in matching order.
measures = ["LD", "DLD", "JS", "JWS", "DC", "FT"]
measure_functions = [
    jellyfish.levenshtein_distance,
    jellyfish.damerau_levenshtein_distance,
    jellyfish.jaro_similarity,
    jellyfish.jaro_winkler_similarity,
    dice_coefficient,
    fast_text_sim,
]
# These two return an edit distance; it is turned into a similarity by
# normalizing with the length of the longer string and subtracting from 1.
edit_distance_functions = [jellyfish.levenshtein_distance, jellyfish.damerau_levenshtein_distance]

for abbreviation, expansion in random_AEP_pairs:
    print("+++++++++++++++++++++++++++++++++++++++++++++++")
    print("Similarities for (" + abbreviation + ", " + expansion + ")")
    for label, measure in zip(measures, measure_functions):
        score = measure(abbreviation, expansion)
        if measure in edit_distance_functions:
            score = 1 - (score/max(len(abbreviation), len(expansion)))
        print(label +": " + str(score))

+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (LED monitor, light-emitting diode)
LD: 0.15000000000000002
DLD: 0.15000000000000002
JS: 0.4348484848484849
JWS: 0.4348484848484849
DC: 0.8181818181818182
FT: 0.2985321581363678
+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (Int, integer)
LD: 0.2857142857142857
DLD: 0.2857142857142857
JS: 0.6507936507936508
JWS: 0.6507936507936508
DC: 0.6666666666666666
FT: 0.20010310411453247
+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (PS/2, Personal System/2)
LD: 0.23529411764705888
DLD: 0.23529411764705888
JS: 0.4362745098039216
JWS: 0.4362745098039216
DC: 0.4444444444444444
FT: 0.18993335962295532
+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (IANA, Internet Assigned Numbers Authority)
LD: 0.11428571428571432
DLD: 0.11428571428571432
JS: 0.611904761904762
JWS: 0.611904761904762
DC: 0.3157894736842105
FT: 0.09341581165790558
+++++++++++++++++++++++++++++++++++++++++++++++
Simi

## Methods to calculate values for Table 2 (Section 4.2)

In [56]:
def calculate_average_similarity(abbreviation_list, terms_list):
    """Return the average LD, JWS and DC similarity over index-aligned pairs.

    ``abbreviation_list[i]`` is compared with ``terms_list[i]``. The
    Levenshtein distance is converted to a similarity by dividing it by the
    length of the longer string and subtracting the result from 1.

    Returns:
        A list ``[avg_LD, avg_JWS, avg_DC]``.
    """
    def levenshtein_similarity(abb, term):
        return 1 - (jellyfish.levenshtein_distance(abb, term)/max(len(abb), len(term)))

    averages = []
    for score in (levenshtein_similarity, jellyfish.jaro_winkler_similarity, dice_coefficient):
        running_total = 0
        # Plain left-to-right accumulation, one pair at a time.
        for position, abb in enumerate(abbreviation_list):
            running_total = running_total + score(abb, terms_list[position])
        averages.append(running_total/len(abbreviation_list))
    return averages

In [57]:
# Average similarity on the raw pairs (a,t) for the measures LD, JWS, DC
# (LD is turned into a normalized similarity inside calculate_average_similarity):

print(calculate_average_similarity(abbreviations, expansions))

[0.09244587264953216, 0.3104687280041735, 0.41868180511878067]


In [58]:
# Average similarity on the pre-processed pairs (a^{c},t^{c}) for the measures LD, JWS, DC.
# Pre-processing order: stop words are removed first, then clean_string is applied.

abbreviations_removed_sw = list(map(stop_words_handling, abbreviations))
terms_removed_sw = list(map(stop_words_handling, expansions))
abbreviations_cleaned = list(map(clean_string, abbreviations_removed_sw))
terms_cleaned = list(map(clean_string, terms_removed_sw))


print(calculate_average_similarity(abbreviations_cleaned, terms_cleaned))

[0.18251453867995351, 0.644198269507842, 0.42211256293446175]


In [59]:
# Average similarity on pairs (a,â) for the measures LD, JWS, DC, where â is
# built from the first character of every token of the unprocessed expansion:

potential_abbreviations = ["".join(token[0] for token in term.split()) for term in expansions]
print(calculate_average_similarity(abbreviations, potential_abbreviations))

[0.3613525520945073, 0.42489010048036335, 0.8611033252550622]


In [60]:
# Average similarity on pairs (a^{c},â^{c}) for the measures LD, JWS, DC, where
# â^{c} is built from the first character of every token of the cleaned expansion:

potential_abbreviations_of_cleaned_terms = ["".join(token[0] for token in term.split()) for term in terms_cleaned]
print(calculate_average_similarity(abbreviations_cleaned, potential_abbreviations_of_cleaned_terms))

[0.7969252528477795, 0.9022384139498926, 0.8646839240764218]


#### Average length of abbreviations after pre-processing

In [61]:
# Mean number of characters per abbreviation after pre-processing
tmp_len = sum(len(cleaned_abbreviation) for cleaned_abbreviation in abbreviations_cleaned)
print(tmp_len/len(abbreviations_cleaned))

3.5498320268756998


#### cardinality of S:

In [62]:
# Construction and cardinality of S: every (abbreviation, expansion) combination
# in which the expansion is listed under a different abbreviation.
S = {
    (abbreviation, expansion)
    for abbreviation in abbreviations
    for row, expansion in enumerate(expansions)
    if abbreviation != abbreviations[row]
}
print(len(S))

2710125


# ILLOD with its Methods (Section 4.3)

In [63]:
def check_initial_letters(a, t):
    """Check whether the initial letters of the tokens of ``t`` spell ``a``.

    The concatenated first characters of the whitespace-separated tokens of
    ``t`` are compared with ``a`` both as-is and upper-cased.

    Returns:
        True on a match, otherwise False (always a bool, never None).
    """
    initial_letters = ''.join(token[0] for token in t.split())
    return initial_letters == a or initial_letters.upper() == a

In [64]:
def check_length_consistency(a, t):
    """Return True when ``t`` has at most as many whitespace-separated tokens as ``a`` has characters."""
    token_count = len(t.split())
    return token_count <= len(a)

In [65]:
def check_order(a, t):
    """Check whether the characters of ``a`` occur in ``t`` in the same order.

    Both strings are lower-cased and scanned from their ends: each character
    of the reversed abbreviation is looked up in the reversed term, starting
    right after the previous hit and taking the first occurrence found.

    Returns:
        ``(True, positions)`` when every character of ``a`` was found, where
        ``positions`` holds the index in ``t`` of each matched character, in
        abbreviation order; otherwise ``(False, [])``.
    """
    abbv_backwards = a.lower()[::-1]
    term_backwards = t.lower()[::-1]
    term_length = len(t)

    cursor = 0  # where the next search in the reversed term starts
    matched_backwards = []
    positions_backwards = []
    for wanted in abbv_backwards:
        hit = term_backwards.find(wanted, cursor)
        if hit == -1:
            # Character not found in the remaining part; the final comparison fails.
            continue
        matched_backwards.append(wanted)
        cursor = hit + 1
        # Convert the index in the reversed term back to an index in ``t``.
        positions_backwards.append(term_length - cursor)

    if "".join(matched_backwards) == abbv_backwards:
        return True, positions_backwards[::-1]
    return False, []

In [66]:
def check_distribution_of_matching_characters(pos_of_chars_list, t):
    """Check how the matched character positions are distributed over the tokens of ``t``.

    Args:
        pos_of_chars_list: Indices into ``t`` of the matched characters
            (the position list returned by ``check_order``).
        t: The term; tokens are assumed to be separated by single spaces
            (as produced by ``stop_words_handling``) - TODO confirm for other callers.

    Returns:
        True when ``t`` has at most one token, or when no token after the
        first has exactly one entry that is not 0 (see the comments below for
        how entries are built); otherwise False.
    """
    # Build, for every token, the list of character indices it covers in ``t``.
    term_intervals = []
    len_of_term = len(t)
    i = 0
    while i < len_of_term:
        sublist = []
        j = i
        # Collect indices up to (not including) the next space.
        while j < len_of_term and t[j] != " ":
            sublist.append(j)
            j = j+ 1
        # Skip the space and continue with the next token.
        i = j+1
        term_intervals.append(sublist)

    splitted_term = t.split()

    # For every token, record where the matched positions fall inside it:
    #   0      -> the matched character equals the token's first character,
    #   k > 0  -> offset of the matched position within the token,
    #   [-1]   -> no matched position lies in this token.
    containment_list = []
    for i, interval in enumerate(term_intervals):
        contanment_sublist = []
        for pos in pos_of_chars_list:
            if (pos in interval) and (splitted_term[i][0] == t[pos]):
                contanment_sublist.append(0)
            elif pos in interval:
                contanment_sublist.append(interval.index(pos))
        if len(contanment_sublist) == 0:
            contanment_sublist.append(-1)
        containment_list.append(contanment_sublist)

    result_of_distribution_check = False
    if len(containment_list) <= 1:
        # Zero or one token: nothing to check.
        result_of_distribution_check = True
    elif len (containment_list) >= 2:
        # Count the tokens after the first whose single entry is not 0,
        # i.e. tokens with no match or with one match that is not the initial letter.
        non_zero_count = 0
        for sublist in containment_list[1:]:
            if len(sublist) == 1 and 0 not in sublist:
                non_zero_count += 1
        if non_zero_count == 0:
            result_of_distribution_check = True

    return result_of_distribution_check

In [67]:
def illod(abbv, term, threshold=None):
    """Decide whether ``abbv`` and ``term`` form an abbreviation-expansion pair.

    The checks run in sequence; the first one that decides wins:
      * the first characters must agree (case-insensitive), otherwise False;
      * step (a): the initial letters of the tokens of ``term`` spell ``abbv``;
      * step (b): the cleaned abbreviation equals the initial letters of the
        cleaned, stop-word-free term;
      * steps (c), (d), (e): length consistency, character order and character
        distribution must all hold.

    Args:
        abbv: The abbreviation candidate.
        term: The expansion candidate.
        threshold: Unused. Accepted so that ``illod`` has the same call
            signature ``algo(abbreviation, term, threshold)`` as the other
            classifiers.

    Returns:
        True when the pair is classified as a match, otherwise False. Empty
        input, or a term that is empty after cleaning, yields False instead
        of raising IndexError.
    """
    # Empty strings have no first letter and cannot form a pair.
    if not abbv or not term:
        return False

    ################################## in case first letter differs ##########################
    if abbv[0].lower() != term[0].lower():
        return False

    ###################################### Step (a) ##########################################
    # check whether initial letters of tokens in t match with the letters in abbreviation
    if check_initial_letters(abbv, term):
        return True

    sanitized_abbv = clean_string(abbv)
    sanitized_term = clean_string(term)
    if not sanitized_term.split():
        # Nothing but punctuation/whitespace in the term: no tokens left to compare.
        return False

    ###################################### Step (b) ########################################
    # clean abbreviation and term from special characters and stopwords
    a_, t_ = clean_string_pair_and_reduce_expansion(abbv, term)
    if a_ == t_:
        return True

    sanitized_term_without_stopswords = stop_words_handling(sanitized_term)

    ###################################### Step (c), (d), (e) ###############################
    # Sequential call of the methods that check and compare lengths, order and
    # distribution of characters; stop at the first check that fails.
    if not check_length_consistency(sanitized_abbv, sanitized_term_without_stopswords):
        return False
    order, pos_of_chars_list = check_order(sanitized_abbv, sanitized_term_without_stopswords)
    if not order:
        return False
    return check_distribution_of_matching_characters(pos_of_chars_list, sanitized_term_without_stopswords)

# Evaluation of the 3 different AEP-Detection Types (Section 4.4)

In [68]:
def find_and_count_false_negatives(algo, threshold):
    """Count the true pairs (abbreviations[i], expansions[i]) that ``algo`` rejects.

    Args:
        algo: Classifier called as ``algo(abbreviation, expansion, threshold)``.
        threshold: Passed through to ``algo``.

    Returns:
        A tuple ``(FN, message)`` with the number of rejected pairs and a
        human-readable summary.
    """
    missed_abbreviations = [
        abb
        for i, abb in enumerate(abbreviations)
        if not algo(abb, expansions[i], threshold)
    ]
    FN = len(missed_abbreviations)
    message = str(FN) + " FALSE NEGATIVES. Pairs that could not be detected out of " + str(len(abbreviations)) + " given pairs"
    return FN, message

In [69]:
def find_and_count_false_positives(algo, threshold, alpha, seed=None):
    """Count how many randomly built false pairs ``algo`` accepts.

    False pairs combine a random abbreviation with the expansion of a random
    row whose abbreviation differs. Exactly ``alpha * len(abbreviations)``
    pairs are drawn (with replacement).

    Args:
        algo: Classifier called as ``algo(abbreviation, expansion, threshold)``.
        threshold: Passed through to ``algo``.
        alpha: Number of false pairs per entry of ``abbreviations``.
        seed: Optional seed for a private random generator, for reproducible
            sampling. With the default ``None`` the module-level ``random``
            state is used.

    Returns:
        A tuple ``(FP, message)`` with the number of accepted false pairs and
        a human-readable summary.

    Raises:
        ValueError: If false pairs are requested but fewer than two distinct
            abbreviations exist (sampling could never succeed).
    """
    rng = random.Random(seed) if seed is not None else random
    pair_count = len(abbreviations)
    target_size = alpha * pair_count

    if target_size > 0 and len(set(abbreviations)) < 2:
        # Without two distinct abbreviations the sampling loop would never end.
        raise ValueError("at least two distinct abbreviations are needed to build false examples")

    test_set = []
    while len(test_set) < target_size:
        rd1 = rng.randint(0, pair_count - 1)
        rd2 = rng.randint(0, pair_count - 1)
        if abbreviations[rd1] != abbreviations[rd2]:
            test_set.append((abbreviations[rd1], expansions[rd2]))

    FP = sum(1 for abb, expansion in test_set if algo(abb, expansion, threshold))
    return FP, str(FP) + " FALSE POSITIVE detections out of " +  str(len(test_set)) + " created false examples"

In [70]:
def determine_quality_parameters(alpha, algo, search_space_for_F1_optimization):
    """Find the threshold with the best F1 score for ``algo``.

    For every threshold in the search space, false negatives are counted on
    the true pairs and false positives on randomly built false pairs
    (``alpha`` false pairs per true pair); precision, recall and F1 follow
    from these counts.

    Args:
        alpha: Ratio of false pairs to true pairs used for the FP count.
        algo: Classifier called as ``algo(abbreviation, expansion, threshold)``.
        search_space_for_F1_optimization: Iterable of thresholds to try.

    Returns:
        ``[threshold, precision, recall, f1]`` for the first threshold that
        reaches the highest F1. If every F1 is 0, the values of the first
        threshold are returned; the list is empty only for an empty search
        space.
    """
    max_f1 = -1  # below every possible F1, so the first threshold is always recorded
    best_values = []
    for th_ in search_space_for_F1_optimization:
        FN, _ = find_and_count_false_negatives(algo, th_)
        FP, _ = find_and_count_false_positives(algo, th_, alpha)
        TP = len(abbreviations) - FN

        # Guard every ratio against an empty denominator.
        precision = TP/(TP + FP) if TP + FP > 0 else 0
        recall = TP/(TP + FN) if TP + FN > 0 else 0
        if precision + recall > 0:
            f1 = (2*precision*recall)/(precision+recall)
        else:
            # TP == 0: precision and recall are both 0, F1 is defined as 0.
            f1 = 0

        if f1 > max_f1:
            best_values = [th_, precision, recall, f1]
            max_f1 = f1
    return best_values

In [71]:
def evaluate_algorithm(algorithm, F1_optimization_search_space, alphas=(8, 16, 24, 48, 72)):
    """Evaluate ``algorithm`` for several ratios of false pairs to true pairs.

    Args:
        algorithm: Classifier called as ``algorithm(abbreviation, expansion, threshold)``.
        F1_optimization_search_space: Thresholds tried for each ratio.
        alphas: The ratios to evaluate; defaults to 8, 16, 24, 48 and 72.

    Returns:
        A dict mapping each alpha to the result of
        ``determine_quality_parameters`` for that alpha.
    """
    return {
        alpha: determine_quality_parameters(alpha, algorithm, F1_optimization_search_space)
        for alpha in alphas
    }

In [72]:
# Threshold search space: 0.00, 0.01, ..., 0.99
step_list = [hundredth / 100 for hundredth in range(100)]

In [73]:
# FT (FASTTEXT COSINE SIMILARITY)
pd.DataFrame.from_dict(
    evaluate_algorithm(fast_text_similarity, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

  dist = 1.0 - uv / np.sqrt(uu * vv)


Unnamed: 0,threshold,precision,recall,F1
8,0.12,0.217585,0.43785,0.290706
16,0.13,0.128668,0.382979,0.192622
24,0.15,0.09864,0.288354,0.146996
48,0.16,0.053063,0.253639,0.087765
72,0.41,0.080103,0.052072,0.063115


In [74]:
# LD (LEVENSHTEIN_DISTANCE), thresholds 0 to 3 edits
pd.DataFrame.from_dict(
    evaluate_algorithm(levensthein_distance_on_reduction_of_expansion, list(range(0, 4))),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,1,0.909033,0.805711,0.854259
16,1,0.831792,0.805711,0.818544
24,1,0.751436,0.805711,0.777628
48,0,0.989109,0.559351,0.714592
72,0,0.980373,0.559351,0.712299


In [75]:
# JWS (JARO-WINKLER-SIMILARITY)
pd.DataFrame.from_dict(
    evaluate_algorithm(jaro_winkler_similarity_on_reduction_of_expansion, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,0.77,0.904564,0.854423,0.878779
16,0.79,0.869795,0.830347,0.849613
24,0.84,0.923861,0.760918,0.83451
48,0.84,0.862857,0.760918,0.808688
72,0.86,0.889685,0.695409,0.780641


In [76]:
# DC (DICE-COEFFICIENT)
pd.DataFrame.from_dict(
    evaluate_algorithm(dice_coefficient_on_reduction_of_expansion, step_list),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,0.67,0.924566,0.775476,0.843484
16,0.76,0.872587,0.759239,0.811976
24,0.8,0.816265,0.758679,0.786419
48,0.82,0.871546,0.653415,0.74688
72,0.81,0.811544,0.653415,0.723945


In [77]:
# ILLOD (takes no threshold; -1 is only a placeholder for the search space)
pd.DataFrame.from_dict(
    evaluate_algorithm(illod, [-1]),
    orient="index",
    columns=["threshold", "precision", "recall", "F1"],
)

Unnamed: 0,threshold,precision,recall,F1
8,-1,0.986675,0.912094,0.94792
16,-1,0.969066,0.912094,0.939717
24,-1,0.961629,0.912094,0.936207
48,-1,0.927149,0.912094,0.91956
72,-1,0.890651,0.912094,0.901245
