In [1]:
import pandas as pd
import string
import pandas as pd
import jellyfish
import random

# Abbreviation-Expansion List
Before going into the details of the ILLOD tool, we first give some insight into our evaluation data for AEP detection.

In [2]:
# Read the semicolon-separated abbreviation list; `names` supplies the labels
# of its two columns.
data = pd.read_csv(
    'abbr_db.CSV',
    sep=';',
    names=['abbr', 'long_forms'],
    encoding='utf8',
)
# Parallel lists: expansions[i] holds the long form(s) recorded for abbreviations[i].
abbreviations, expansions = (
    list(data[column].values) for column in ('abbr', 'long_forms')
)
# Uncomment to print every abbreviation next to its expansion entry:
# for i, abb in enumerate(abbreviations):
#     print(str(i) + ": " + abb + "| " + expansions[i])

# Introducing Helper Functions
These helper functions provide the building blocks needed by the syntactic and semantic similarity measures and by ILLOD. We also need a method that calculates the Dice coefficient between two given strings, since the jellyfish package doesn't provide this functionality.

In [3]:
def dice_coefficient(a, b):
    """Return the Dice coefficient 2*nt / (na + nb) of two strings.

    Both strings are lower-cased and reduced to their sets of distinct
    characters (single characters, not character bigrams). ``na`` and ``nb``
    are the sizes of those sets and ``nt`` is the number of characters the
    two sets have in common.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float between 0.0 and 1.0. When both strings are empty the result
        is 0.0 instead of a ZeroDivisionError.
    """
    a_chars = set(a.lower())
    b_chars = set(b.lower())
    total = len(a_chars) + len(b_chars)
    if total == 0:
        # Neither string contains a character, so there is nothing to compare.
        return 0.0
    overlap = len(a_chars & b_chars)
    return overlap * 2.0 / total

### Method to remove punctuation marks from a given string

In [4]:
def clean_string(s):
    """Return ``s`` lower-cased with every ``string.punctuation`` character removed.

    Whitespace and all other characters are left untouched; a string without
    punctuation is simply lower-cased.
    """
    # Deleting through a translation table is a no-op when nothing matches,
    # so no separate "contains punctuation?" check is required.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return s.lower().translate(punctuation_remover)

### Method to remove stop words from a given term

In [5]:
def stop_words_handling(term):
    """Remove stop words from a whitespace-separated term.

    The stop words are "for", "and", "of", "in", "via" and "be"; matching is
    exact and case-sensitive. If the first token of the term is itself a stop
    word, that word is exempted from removal, and every occurrence of it in
    the term is kept, not only the leading one.

    Args:
        term: The term to sanitize.

    Returns:
        The remaining tokens joined by single spaces. An empty or
        whitespace-only term yields "" instead of raising IndexError.
    """
    tokens = term.split()
    if not tokens:
        # Nothing to inspect; indexing tokens[0] below would fail.
        return ""

    stop_words = {"for", "and", "of", "in", "via", "be"}
    # Exempt the leading token so a term that starts with a stop word keeps it.
    stop_words.discard(tokens[0])

    # One filtering pass instead of repeatedly calling list.remove per stop word.
    return " ".join(token for token in tokens if token not in stop_words)

### Method to calculate and return $(a^{c}, potAbb(t^{c}))$  for a given pair $(a,t)$

In [6]:
def clean_string_pair_and_reduce_expansion(abb, term):
    """Clean an (abbreviation, term) pair and reduce the term to its initials.

    Both inputs are lower-cased and passed through ``clean_string``. The term
    is additionally passed through ``stop_words_handling`` and then replaced
    by the concatenated first characters of its remaining tokens.

    Returns:
        A tuple ``(cleaned_abbreviation, initials_of_reduced_term)``.
    """
    cleaned_abb = clean_string(abb.lower())
    reduced_term = stop_words_handling(clean_string(term.lower()))
    initials = ''.join(token[0] for token in reduced_term.split())
    return cleaned_abb, initials

# Classifiers based on semantic similarity (FastText)
## Algorithm 1

In [7]:
import fasttext
import fasttext.util
from scipy import spatial
# Uncomment the next line to fetch the 'en' model through fasttext.util before loading
# (NOTE(review): if_exists='ignore' presumably skips the download when a copy exists - confirm).
# fasttext.util.download_model('en', if_exists='ignore')
# Module-level model handle; fast_text_similarity and fast_text_sim read it as `ft`.
# The model file is looked up relative to the current working directory.
ft = fasttext.load_model("cc.en.300.bin")



In [8]:
def fast_text_similarity(a, t, threshold):
    """Tell whether the fastText vectors of ``a`` and ``t`` are similar enough.

    The similarity is ``1 - cosine distance`` between the two word vectors
    obtained from the module-level model ``ft``.

    Returns:
        True if the similarity is greater than or equal to ``threshold``,
        otherwise False.
    """
    vec_a = ft.get_word_vector(a)
    vec_t = ft.get_word_vector(t)
    similarity = 1 - spatial.distance.cosine(vec_a, vec_t)
    # bool() keeps the return value a plain Python bool, as the if/else did.
    return bool(similarity >= threshold)

##  Cosine Similarity on Fasttext Wordvectors

In [9]:
def fast_text_sim(a, t):
    """Return ``1 - cosine distance`` between the fastText vectors of ``a`` and ``t``.

    The word vectors come from the module-level model ``ft``.
    """
    vec_a, vec_t = (ft.get_word_vector(word) for word in (a, t))
    return 1 - spatial.distance.cosine(vec_a, vec_t)

# Classifiers based on syntactic similarity (LD, JWS, DC)
## Algorithm 2 in different variants

In [10]:
def levensthein_distance_on_reduction_of_expansion(a, term, threshold):
    """Classify ``(a, term)`` by Levenshtein distance on the reduced pair.

    The pair is first transformed by ``clean_string_pair_and_reduce_expansion``
    into the cleaned abbreviation and the initials of the reduced term.

    Returns:
        True if the Levenshtein distance between those two strings is at most
        ``threshold``, otherwise False.
    """
    cleaned_abb, initials = clean_string_pair_and_reduce_expansion(a, term)
    distance = jellyfish.levenshtein_distance(cleaned_abb, initials)
    return bool(distance <= threshold)

In [11]:
def jaro_winkler_similarity_on_reduction_of_expansion(a, term, threshold):
    """Classify ``(a, term)`` by Jaro-Winkler similarity on the reduced pair.

    The pair is first transformed by ``clean_string_pair_and_reduce_expansion``
    into the cleaned abbreviation and the initials of the reduced term.

    Returns:
        True if the Jaro-Winkler similarity of those two strings is at least
        ``threshold``, otherwise False.
    """
    cleaned_abb, initials = clean_string_pair_and_reduce_expansion(a, term)
    similarity = jellyfish.jaro_winkler_similarity(cleaned_abb, initials)
    return bool(similarity >= threshold)

In [12]:
def dice_coefficient_on_reduction_of_expansion(a, term, threshold):
    """Classify ``(a, term)`` by Dice coefficient on the reduced pair.

    The pair is first transformed by ``clean_string_pair_and_reduce_expansion``
    into the cleaned abbreviation and the initials of the reduced term.

    Returns:
        True if ``dice_coefficient`` of those two strings is at least
        ``threshold``, otherwise False.
    """
    cleaned_abb, initials = clean_string_pair_and_reduce_expansion(a, term)
    score = dice_coefficient(cleaned_abb, initials)
    return bool(score >= threshold)

# Replicability: 
## Similarities for Table 1 (Section 4.2)

In [13]:
# Ten (abbreviation, expansion) pairs; the following cell prints every
# similarity measure for each of them on the raw, uncleaned strings.
random_AEP_pairs = [
    ("LED monitor", "light-emitting diode"),
    ("Int", "integer"),
    ("PS/2", "Personal System/2"),
    ("IANA", "Internet Assigned Numbers Authority"),
    ("SMM", "System Management Mode"),
    ("U/L", "upload"),
    ("IAP", "Internet access provider"),
    ("CLNS", "connectionless network service"),
    ("MMC", "MultiMediaCard"),
    ("I/O", "input/output")
]

In [14]:
measures = ["LD", "DLD", "JS", "JWS", "DC", "FT"]
# Measure functions, listed in the same order as their labels in `measures`.
measure_functions = [
    jellyfish.levenshtein_distance,
    jellyfish.damerau_levenshtein_distance,
    jellyfish.jaro_similarity,
    jellyfish.jaro_winkler_similarity,
    dice_coefficient,
    fast_text_sim,
]
# The first two return edit distances; they are rescaled below to
# 1 - distance / (length of the longer string) so they print as similarities.
edit_distance_measures = measure_functions[:2]
for aep_tuple in random_AEP_pairs:
    abbreviation, expansion = aep_tuple[0], aep_tuple[1]
    print("+++++++++++++++++++++++++++++++++++++++++++++++")
    print("Similarities for (" + abbreviation + ", " + expansion + ")")
    for label, measure in zip(measures, measure_functions):
        score = measure(abbreviation, expansion)
        if measure in edit_distance_measures:
            score = 1 - (score / max(len(abbreviation), len(expansion)))
        print(label + ": " + str(score))

+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (LED monitor, light-emitting diode)
LD: 0.15000000000000002
DLD: 0.15000000000000002
JS: 0.4348484848484849
JWS: 0.4348484848484849
DC: 0.8181818181818182
FT: 0.2985321581363678
+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (Int, integer)
LD: 0.2857142857142857
DLD: 0.2857142857142857
JS: 0.6507936507936508
JWS: 0.6507936507936508
DC: 0.6666666666666666
FT: 0.20010310411453247
+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (PS/2, Personal System/2)
LD: 0.23529411764705888
DLD: 0.23529411764705888
JS: 0.4362745098039216
JWS: 0.4362745098039216
DC: 0.4444444444444444
FT: 0.18993335962295532
+++++++++++++++++++++++++++++++++++++++++++++++
Similarities for (IANA, Internet Assigned Numbers Authority)
LD: 0.11428571428571432
DLD: 0.11428571428571432
JS: 0.611904761904762
JWS: 0.611904761904762
DC: 0.3157894736842105
FT: 0.09341581165790558
+++++++++++++++++++++++++++++++++++++++++++++++
Simi