# Average Syntactic Similarities on $L$

In [2]:
import string
import pandas as pd
import jellyfish

# Loading Abbreviation-Expansion List

In [3]:
# Read the two-column, semicolon-separated abbreviation database; the file has
# no header row, so the column names are supplied here.
data = pd.read_csv(
    'abbr_db.CSV',
    sep=';',
    names=['abbr', 'long_forms'],
    encoding='utf8',
)
# Plain Python lists, aligned by row: abbreviations[i] belongs to expansions[i].
abbreviations, expansions = (
    list(data[column].values) for column in ('abbr', 'long_forms')
)

## Helper methods to calculate values for Table 2 (Section 5.2)

In [14]:
def dice_coefficient(a, b):
    """Return the Dice coefficient 2*nt / (na + nb) of two strings.

    The comparison is case-insensitive and works on the sets of distinct
    characters of each string (not on character bigrams): ``na`` and ``nb``
    are the numbers of distinct characters in ``a`` and ``b`` and ``nt`` is
    the number of distinct characters they have in common.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in [0.0, 1.0]. Two empty strings yield 1.0 (they are
        identical) instead of raising ``ZeroDivisionError``.
    """
    a_chars = set(a.lower())
    b_chars = set(b.lower())
    total = len(a_chars) + len(b_chars)
    if total == 0:
        # Both inputs are empty: the ratio would be 0/0, so report the
        # identical-strings value explicitly.
        return 1.0
    return 2.0 * len(a_chars & b_chars) / total

def stop_words_handling(term):
    """Remove stop words from a whitespace-separated term.

    The stop words are "for", "and", "of", "in", "via" and "be"; matching is
    case-sensitive and on whole tokens. If the term *starts* with a stop word,
    that particular word is exempt from removal everywhere in the term; all
    other stop words are still dropped.

    Args:
        term: The string to sanitize.

    Returns:
        The remaining tokens joined by single spaces. An empty or
        whitespace-only term yields "" (previously an ``IndexError``).
    """
    tokens = term.split()
    if not tokens:
        # Nothing to inspect; avoids indexing the first token of an empty list.
        return ""

    stop_words = {"for", "and", "of", "in", "via", "be"}
    # A leading stop word is kept (and with it every later occurrence of it).
    stop_words.discard(tokens[0])

    return " ".join(token for token in tokens if token not in stop_words)

def clean_string(s):
    """Lower-case ``s`` and delete every character listed in ``string.punctuation``.

    Whitespace and all other characters are kept as they are.
    """
    # Deleting characters that are not present is a no-op, so no pre-check
    # for punctuation is needed.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return s.lower().translate(punctuation_remover)


def calculate_average_similarity(abbreviation_list, terms_list):
    """Average three string similarities over position-aligned pairs.

    Pair ``i`` is ``(abbreviation_list[i], terms_list[i])``. For every pair the
    following measures are computed and then averaged over all pairs:

    1. Levenshtein distance, converted to a similarity as
       ``1 - distance / max(len(a), len(b))``.
    2. Jaro-Winkler similarity (``jellyfish.jaro_winkler_similarity``).
    3. ``dice_coefficient``.

    Args:
        abbreviation_list: Sequence of strings.
        terms_list: Sequence of strings of the same length.

    Returns:
        A list ``[avg_levenshtein_similarity, avg_jaro_winkler, avg_dice]``.
        If there are no pairs the averages are undefined and each entry is
        ``float("nan")`` (previously a ``ZeroDivisionError``).

    Raises:
        ValueError: If the two sequences differ in length, since the pairs
            would otherwise be misaligned or silently truncated.
    """
    pair_count = len(abbreviation_list)
    if pair_count != len(terms_list):
        raise ValueError(
            "abbreviation_list and terms_list must have the same length "
            "(got {} and {})".format(pair_count, len(terms_list))
        )
    if pair_count == 0:
        return [float("nan")] * 3

    def normalized_levenshtein_similarity(a, b):
        # Scale the edit distance by the longer string so the value lies in
        # [0, 1]; two empty strings are identical and would divide by zero.
        longest = max(len(a), len(b))
        if longest == 0:
            return 1.0
        return 1 - (jellyfish.levenshtein_distance(a, b) / longest)

    similarity_measures = [
        normalized_levenshtein_similarity,
        jellyfish.jaro_winkler_similarity,
        dice_coefficient,
    ]
    return [
        sum(measure(abb, term) for abb, term in zip(abbreviation_list, terms_list)) / pair_count
        for measure in similarity_measures
    ]

## Calculate Similarities for Table 2
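Average similarity on pairs $(a,t)$ for the measures LD, JWS, DC <br>
Average similarity on pairs $(a^{c},t^{c})$ for the measures LD, JWS, DC <br>
Average similarity on pairs $(a, \hat{a})$ for the measures LD, JWS, DC <br>
Average similarity on pairs $(a^{c},\hat{a}^{c})$ for the measures LD, JWS, DC <br>
Note: LD is reported as a normalized similarity, $1 - \mathrm{LD}(x,y)/\max(|x|,|y|)$, so higher values mean more similar strings for all three measures. <br>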

In [15]:
# ---------------------------------------------------------------------------
# Inputs for the pre-processed pairs (a^{c}, t^{c}):
# stop words are removed first, then the strings are lower-cased and stripped
# of punctuation.
# ---------------------------------------------------------------------------
abbreviations_removed_sw = list(map(stop_words_handling, abbreviations))
terms_removed_sw = list(map(stop_words_handling, expansions))
abbreviations_cleaned = list(map(clean_string, abbreviations_removed_sw))
terms_cleaned = list(map(clean_string, terms_removed_sw))

# ---------------------------------------------------------------------------
# Potential abbreviations (â): the first character of every
# whitespace-separated token, concatenated - once from the raw expansions and
# once from the pre-processed ones.
# ---------------------------------------------------------------------------
potential_abbreviations = [
    ''.join(token[0] for token in term.split()) for term in expansions
]
potential_abbreviations_of_cleaned_terms = [
    ''.join(token[0] for token in term.split()) for term in terms_cleaned
]

# ---------------------------------------------------------------------------
# Averages for the measures LD, JWS, DC, one row per pair type:
# (a,t), (a^{c},t^{c}), (a,â) and (a^{c},â^{c}).
# ---------------------------------------------------------------------------
result_dict = {
    "(a,e)": calculate_average_similarity(abbreviations, expansions),
    "(preprocess(a), preprocess(e))": calculate_average_similarity(
        abbreviations_cleaned, terms_cleaned
    ),
    "(a, pot_abb(e))": calculate_average_similarity(
        abbreviations, potential_abbreviations
    ),
    "(preprocess(a), pot_abb(preprocess(e)))": calculate_average_similarity(
        abbreviations_cleaned, potential_abbreviations_of_cleaned_terms
    ),
}

pd.DataFrame.from_dict(result_dict, orient="index", columns=["LD", "JWS", "DC"])

Unnamed: 0,LD,JWS,DC
"(a,e)",0.092446,0.309613,0.418682
"(preprocess(a), preprocess(e))",0.182515,0.637023,0.422113
"(a, pot_abb(e))",0.361353,0.422471,0.861103
"(preprocess(a), pot_abb(preprocess(e)))",0.796925,0.896257,0.864684


#### Average length of abbreviations after pre-processing

In [16]:
# Mean number of characters per pre-processed abbreviation
total_length = sum(map(len, abbreviations_cleaned))
print(total_length / len(abbreviations_cleaned))

3.5498320268756998


In [9]:
# List, with a running number, every abbreviation that consists of more than
# one whitespace-separated token
multiword_abbreviations = (abb for abb in abbreviations if len(abb.split()) > 1)
for count, abb in enumerate(multiword_abbreviations, start=1):
    print("{}) {}".format(count, abb))

1) AE lock
2) A record
3) BEDO DRAM
4) CD-ROM XA
5) Cell phone
6) DDR SDRAM
7) Eb / EB
8) Gb / GB
9) Gib / GiB
10) HD DVD
11) IP Rating
12) Java EE
13) kb / kbit
14) LED monitor
15) Lo-res / low-res
16) Microsoft MVP
17) Model no.
18) MO diskette
19) MX record
20) Pg Dn
21) Pg Up
22) QR Code
23) RF shielding
24) RIPE notes
25) RLL encoding
26) ROM BIOS
27) RO terminal
28) SDHC card
29) Serial no.
30) SHV connector
31) SIM card
32) SM card
33) Telco line
34) Triple DEA
35) VL Bus
36) Web app
37) Windows PE
