In [1]:
import string
import pandas as pd
import jellyfish
import random
import math
import wordninja
import spacy
import re
from spacy_syllables import SpacySyllables
# Load the small English spaCy pipeline and add the "syllables" component
# (registered by the spacy_syllables import above) right after the tagger,
# so tokens expose the `._.syllables` extension read further down.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("syllables", after="tagger")

<spacy_syllables.SpacySyllables at 0x15523392d30>

In [2]:
# Read the abbreviation database: a ';'-separated file whose two columns are
# named here ('abbr', 'long_forms'). Because `names` is passed without
# `header`, pandas treats the first line of the file as data.
data = pd.read_csv('abbr_db.CSV', names=['abbr', 'long_forms'], sep=';', encoding='utf8')
# Parallel lists: abbr[i] is the short form that belongs to long_forms[i].
abbr = list(data['abbr'].values)
long_forms = list(data['long_forms'].values)

In [3]:
# (substring, replacement) pairs applied in order by extend_term().
# Every replacement keeps the matched text and adds an alternative spelling
# next to it (e.g. "2" in front of "to", "&" in front of "and", "one" after
# "1") so that abbreviations using the alternative character can still be
# matched letter by letter.
#
# Surrounding separators (spaces / hyphens) are preserved on both sides of
# every replacement; exact duplicate pairs are listed only once because
# applying the same replacement a second time has no effect.
extension_cands = [
    ("ex", "xex"),
    (" to ", " 2to "), (" one ", " 1one "), (" two ", " 2two "), (" three ", " 3three "),
    (" one", " 1one"), (" two", " 2two"),
    ("-to-", "-2to-"), ("-one-", "-1one-"), ("-two-", "-2two-"), ("-three-", "-3three-"),
    (" three", " 3three"), (" four", " 4four"),
    (" 1 ", " 1one "), (" 2 ", " 2two "), (" 3 ", " 3three "),
    (" and ", " &and "), (" & ", " &and "), (" + ", " +plus "),
]

In [4]:
def lower_aep_candidate_pairs (a, t):
    """Return the abbreviation ``a`` and the term ``t`` as a lower-cased 2-tuple."""
    lowered_abbreviation = a.lower()
    lowered_term = t.lower()
    return lowered_abbreviation, lowered_term

In [5]:
def check_order_rtl(a, t):
    """Check, scanning right to left, whether the letters of ``a`` occur in order in ``t``.

    Both strings are lower-cased and reversed. Every abbreviation character
    except the first one is matched against the nearest not-yet-consumed
    character of the term (seen from the right). The first abbreviation
    character is handled separately: it must equal the first character of the
    term, and that character must not already have been consumed by a later
    abbreviation character.

    Parameters
    ----------
    a : str
        Abbreviation (short form).
    t : str
        Candidate term (long form).

    Returns
    -------
    tuple
        ``(matched, positions)``. ``matched`` is True when every abbreviation
        character could be placed. ``positions`` lists, left to right, the
        indices in ``t`` of the characters that were placed; it can be a
        partial list when ``matched`` is False. An empty abbreviation yields
        ``(True, [])``; a non-empty abbreviation against an empty term yields
        ``(False, [])``.
    """
    abbv_reversed = a.lower()[::-1]
    term_reversed = t.lower()[::-1]
    len_of_term = len(t)

    # Nothing can be matched inside an empty term; return early instead of
    # failing on term_reversed[-1] below.
    if abbv_reversed and not term_reversed:
        return False, []

    search_start = 0          # index in term_reversed where the next search begins
    positions_rtl = []        # matched positions, collected right to left
    matched_rtl = []          # abbreviation characters that found a partner
    first_letter_of_term_already_reached = False
    last_abbv_index = len(abbv_reversed) - 1

    for j, char_from_abbv in enumerate(abbv_reversed):
        if j == last_abbv_index:
            # First letter of the abbreviation: only the (still unused) first
            # letter of the term is an acceptable partner.
            if char_from_abbv == term_reversed[-1] and not first_letter_of_term_already_reached:
                matched_rtl.append(char_from_abbv)
                positions_rtl.append(0)
            continue

        hit = term_reversed.find(char_from_abbv, search_start)
        if hit == -1:
            # No partner for this character; later characters are still tried
            # so the returned position list is the same as a full scan gives.
            continue

        matched_rtl.append(char_from_abbv)
        search_start = hit + 1
        positions_rtl.append(len_of_term - search_start)
        if search_start == len(term_reversed):
            # The match consumed the first letter of the term.
            first_letter_of_term_already_reached = True

    return "".join(matched_rtl) == abbv_reversed, positions_rtl[::-1]

def extract_initial_letters_of_syllables(abbv, term):
    """Return the lower-cased first letter of every syllable found in ``term``.

    Digits are stripped from ``term``, the remainder is split into words with
    ``wordninja`` and run through the spaCy pipeline ``nlp``; the syllables
    reported for each token are then reduced to their initial letters.

    Parameters
    ----------
    abbv : str
        Not used by this function; kept so existing callers keep working.
    term : str
        The long form to analyse.

    Returns
    -------
    str
        Concatenated initial letters, in token order. Empty when no syllables
        were reported for any token.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    term = re.sub(r'\d+', '', term)
    term = " ".join(wordninja.split(term))
    doc = nlp(term)

    initials = []
    for token in doc:
        # Tokens for which the syllables component set no value carry None;
        # treat them as contributing no syllables instead of failing.
        for syllable in token._.syllables or ():
            if syllable:
                initials.append(syllable[0].lower())
    return "".join(initials)

In [6]:
# Example call on a hyphenated term; the first argument is not read by the function body.
extract_initial_letters_of_syllables("ADC", "analog-to-digital")

'altdit'

In [7]:
def clf_with_acronyms(a,t):
    """Classify ``(a, t)`` using the initial letters of the words in ``t``.

    The pair is rejected unless the abbreviation and the term start with the
    same letter (case-insensitive). Otherwise the first characters of the
    whitespace-separated words of ``t`` are joined and ``check_order_rtl``
    decides whether ``a`` fits that string.

    Returns
    -------
    bool
        False for empty inputs, differing first letters, or a term without
        any words; otherwise the boolean result of ``check_order_rtl``.
    """
    # Empty strings cannot form a pair; guard before indexing a[0] / t[0].
    if not a or not t:
        return False
    if a[0].lower() != t[0].lower():
        return False

    a, t = lower_aep_candidate_pairs(a, t)
    acronym_of_term = ''.join(word[0] for word in t.split())
    if not acronym_of_term:
        # Whitespace-only term: there are no word initials to compare against.
        return False
    return check_order_rtl(a, acronym_of_term)[0]

In [8]:
def clf_with_syllables(a,t):
    """Classify ``(a, t)`` using the initial letters of the syllables in ``t``.

    The pair is rejected unless the abbreviation and the term start with the
    same letter (case-insensitive). Otherwise separator characters in ``t``
    are replaced by spaces, whitespace is collapsed, and
    ``extract_initial_letters_of_syllables`` produces the string that
    ``check_order_rtl`` compares ``a`` against.

    Returns
    -------
    bool
        False for empty inputs, differing first letters, or when no syllable
        initials could be extracted; otherwise the boolean result of
        ``check_order_rtl``.
    """
    # Empty strings cannot form a pair; guard before indexing a[0] / t[0].
    if not a or not t:
        return False
    if a[0].lower() != t[0].lower():
        return False

    a, t = lower_aep_candidate_pairs(a, t)
    # NOTE(review): inside the character class, `'-.` is a range (apostrophe
    # through period), so ( ) * and , are replaced as well as ' - . / + and
    # backslash -- confirm that this is intended.
    t_ = re.sub('[\'-./+\\\\]',  ' ', t)
    t_ = " ".join(t_.split())
    acronym_of_syllables = extract_initial_letters_of_syllables(a, t_)
    if not acronym_of_syllables:
        # Nothing to compare against (e.g. the term held only digits/separators).
        return False
    return check_order_rtl(a, acronym_of_syllables)[0]

In [9]:
def clf_on_raw_term(a,t):
    """Classify ``(a, t)`` by matching ``a`` directly against the characters of ``t``.

    The pair is rejected unless the abbreviation and the term start with the
    same letter (case-insensitive); otherwise both are lower-cased and handed
    to ``check_order_rtl``.

    Returns
    -------
    bool
        False for empty inputs or differing first letters; otherwise the
        boolean result of ``check_order_rtl``.
    """
    # Empty strings cannot form a pair; guard before indexing a[0] / t[0].
    if not a or not t:
        return False
    if a[0].lower() != t[0].lower():
        return False

    a, t = lower_aep_candidate_pairs(a, t)
    return check_order_rtl(a,t)[0]

In [10]:
def extend_term(t):
    """Apply every applicable replacement from ``extension_cands`` to ``t``.

    Whether a pair applies is decided against the untouched input ``t``; the
    replacements themselves are chained on the running result, in list order.
    """
    result = t
    for needle, replacement in extension_cands:
        if needle not in t:
            continue
        result = result.replace(needle, replacement)
    return result


def clf_on_extended_term(a,t):
    """Classify ``(a, t)`` by matching ``a`` against the extended form of ``t``.

    The pair is considered when the abbreviation and the term start with the
    same letter (case-insensitive) or when the term contains "ex". In that
    case both are lower-cased, the term is expanded with ``extend_term`` and
    ``check_order_rtl`` decides.

    Returns
    -------
    bool
        False for empty inputs or when neither entry condition holds;
        otherwise the boolean result of ``check_order_rtl``.
    """
    # Empty strings cannot form a pair; guard before indexing a[0] / t[0].
    if not a or not t:
        return False
    if a[0].lower() != t[0].lower() and "ex" not in t.lower():
        return False

    a, t = lower_aep_candidate_pairs (a, t)
    extended_t = extend_term(t)
    return check_order_rtl(a, extended_t)[0]

In [11]:
# Build the set of negative (abbreviation, long form) examples: every
# combination of a listed abbreviation with a listed long form that is NOT
# itself a row of the database.
#
# Checking only `abb != abbr[j]` is not enough: when the same long form is
# listed under two different abbreviations, each of those gold pairs would be
# added through the other row and later counted as a "false" example.
known_pairs = set(zip(abbr, long_forms))
S = set()
for abb in abbr:
    for exp in long_forms:
        if (abb, exp) not in known_pairs:
            S.add((abb, exp))
print(len(S))

# Pairs accepted by a classifier during calc_fp(). The list is only appended
# to, so it accumulates across successive calc_fp() calls.
fp_list = []

def calc_fp(algo):
    """Count how many pairs of the negative set ``S`` are accepted by ``algo``.

    ``algo`` is called as ``algo(short_form, long_form)`` for every pair in
    ``S``; each accepted pair is appended to the module-level ``fp_list``.
    Returns a summary string with the number of accepted pairs and ``len(S)``.
    """
    accepted = 0
    for pair in S:
        short_form, long_form = pair
        if not algo(short_form, long_form):
            continue
        fp_list.append(pair)
        accepted += 1
    return str(accepted) + " FALSE POSITIVE detections out of " +  str(len(S)) + " created false examples"

2710125


In [12]:
def calc_fn(algo):
    """Count how many database pairs ``(abbr[i], long_forms[i])`` are rejected by ``algo``.

    Returns a summary string with the number of rejected pairs and ``len(abbr)``.
    """
    missed = sum(
        1
        for idx, short_form in enumerate(abbr)
        if not algo(short_form, long_forms[idx])
    )
    return str(missed) + " FALSE NEGATIVES. Pairs that could not be detected out of " + str(len(abbr)) + " given pairs"

In [23]:
#################### clf_on_extended_term

In [49]:
# False positives of clf_on_extended_term over the generated negative set S.
b = calc_fp(clf_on_extended_term)
print(b)

29648 FALSE POSITIVE detections out of 2710125 created false examples


In [48]:
# False negatives of clf_on_extended_term over the database pairs.
a = calc_fn(clf_on_extended_term)
print(a)

100 FALSE NEGATIVES. Pairs that could not be detected out of 1786 given pairs


In [26]:
#################### clf_on_raw_term

In [47]:
# False positives of clf_on_raw_term over the generated negative set S.
b = calc_fp(clf_on_raw_term)
print(b)

29856 FALSE POSITIVE detections out of 2710125 created false examples


In [46]:
# False negatives of clf_on_raw_term over the database pairs.
a = calc_fn(clf_on_raw_term)
print(a)

104 FALSE NEGATIVES. Pairs that could not be detected out of 1786 given pairs


In [18]:
#################### clf_with_syllables

In [62]:
# False positives of clf_with_syllables over the generated negative set S.
b = calc_fp(clf_with_syllables)
print(b)

7164 FALSE POSITIVE detections out of 2710125 created false examples


In [61]:
# False negatives of clf_with_syllables over the database pairs.
a = calc_fn(clf_with_syllables)
print(a)

462 FALSE NEGATIVES. Pairs that could not be detected out of 1786 given pairs


In [30]:
#################### clf_with_acronyms

In [31]:
# False positives of clf_with_acronyms over the generated negative set S.
b = calc_fp(clf_with_acronyms)
print(b)

2147 FALSE POSITIVE detections out of 2710125 created false examples


In [32]:
# False negatives of clf_with_acronyms over the database pairs.
a = calc_fn(clf_with_acronyms)
print(a)

734 FALSE NEGATIVES. Pairs that could not be detected out of 1786 given pairs
