#  Output of different similarity measures on randomly selected AEPs for Table 1.

In [32]:
import pandas as pd
import string
import pandas as pd
import jellyfish
import random

## Classifier based on semantic similarity (FastText)

In [34]:
import fasttext
import fasttext.util
from scipy import spatial
# One-time setup (left disabled): uncomment to download the English model
# when the .bin file below is not already present.
# fasttext.util.download_model('en', if_exists='ignore')
# Load the fastText model file from the current working directory. `ft` is
# the module-level model object that fast_text_sim() queries for vectors.
ft = fasttext.load_model("cc.en.300.bin")



In [42]:
# Cosine similarity on fastText word vectors

def fast_text_sim(a, t):
    """Return the absolute cosine similarity of the fastText vectors of two strings.

    Args:
        a: First string; passed unchanged to ``ft.get_word_vector``.
        t: Second string; passed unchanged to ``ft.get_word_vector``.

    Returns:
        ``abs(1 - cosine_distance)`` of the two vectors, i.e. the magnitude
        of their cosine similarity (negative similarities are folded onto
        the positive side).
    """
    vec_a, vec_t = (ft.get_word_vector(text) for text in (a, t))
    # SciPy returns the cosine *distance*; subtract from 1 for the similarity.
    cosine_similarity = 1 - spatial.distance.cosine(vec_a, vec_t)
    return abs(cosine_similarity)

## Classifier based on syntactic similarity (DiceCoefficient)

In [43]:
def dice_coefficient(a, b):
    """Return the Dice coefficient 2*nt / (na + nb) of two strings.

    The comparison is case-insensitive and is made over the sets of
    *distinct characters* of each string: ``na`` and ``nb`` are the numbers
    of distinct characters in ``a`` and ``b``, ``nt`` is the number of
    characters they share.

    NOTE(review): the Dice coefficient is more commonly computed over
    character bigrams. This function compares single characters, and the
    DC column recorded for Table 1 in this file matches this variant, so
    the computation is intentionally left as is for replicability.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in [0.0, 1.0]. Returns 0.0 when both strings are empty,
        since there is nothing to compare (previously this raised
        ``ZeroDivisionError``).
    """
    a_chars = set(a.lower())
    b_chars = set(b.lower())
    total = len(a_chars) + len(b_chars)
    if total == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 0.0
    overlap = len(a_chars & b_chars)
    return overlap * 2.0 / total

# Replicability
## Similarities for Table 1 (Section 4.2)

In [44]:
# Pairs used to compute the similarity values for Table 1. Each tuple is
# (short form, long form); the cell below scores every pair with each
# similarity measure.
random_AEP_pairs = [
    ("LED monitor", "light-emitting diode"),
    ("Int", "integer"),
    ("PS/2", "Personal System/2"),
    ("IANA", "Internet Assigned Numbers Authority"),
    ("SMM", "System Management Mode"),
    ("U/L", "upload"),
    ("IAP", "Internet access provider"),
    ("CLNS", "connectionless network service"),
    ("MMC", "MultiMediaCard"),
    ("I/O", "input/output")
]

In [45]:
# Column labels for the result table, in the same order as the measure
# functions below: Levenshtein, Jaro-Winkler, Dice coefficient, fastText.
measures = ["LD", "JWS", "DC", "FT"]
measure_functions = [
    jellyfish.levenshtein_distance,
    jellyfish.jaro_winkler_similarity,
    dice_coefficient,
    fast_text_sim,
]

# Maps "short : long" to the list of similarity scores, one per measure.
result_dict = {}
for short_form, long_form in random_AEP_pairs:
    similarities = []
    for measure in measure_functions:
        if measure is jellyfish.levenshtein_distance:
            # Levenshtein yields an edit distance; normalise by the longer
            # string and invert so it reads as a similarity like the others.
            longest = max(len(short_form), len(long_form))
            similarity = 1 - (measure(short_form, long_form) / longest)
        else:
            similarity = measure(short_form, long_form)
        similarities.append(similarity)
    result_dict[short_form + " : " + long_form] = similarities

# Final expression of the cell, so the notebook displays the table.
pd.DataFrame.from_dict(result_dict, orient="index", columns=measures)

Unnamed: 0,LD,JWS,DC,FT
LED monitor : light-emitting diode,0.15,0.434848,0.818182,0.298532
Int : integer,0.285714,0.650794,0.666667,0.200103
PS/2 : Personal System/2,0.235294,0.436275,0.444444,0.189933
IANA : Internet Assigned Numbers Authority,0.114286,0.611905,0.315789,0.093416
SMM : System Management Mode,0.136364,0.585859,0.307692,0.14236
U/L : upload,0.0,0.0,0.444444,0.02476
IAP : Internet access provider,0.041667,0.458333,0.375,0.060126
CLNS : connectionless network service,0.0,0.0,0.470588,0.07575
MMC : MultiMediaCard,0.214286,0.603175,0.333333,0.532894
I/O : input/output,0.083333,0.472222,0.6,0.147428
