# Evaluation of the Different Approaches to AEP Detection on the Promise dataset (Section 5.5)

In [17]:
import pandas as pd
import Syntactic_Classifiers
import ILLOD
import Abbreviation_and_NC_Extraction

## Reading the content of the PROMISE requirements. 30 abbreviations have been inserted in the texts. We want try to identify them.

In [18]:
requirements_data = pd.read_csv('promise_constructed.CSV', names=['text', 'set_id'], sep=';', encoding='utf8')
# print(requirements_data)

In [19]:
# Preparation of the data with the aim of storing it in a dictionary
data_dict = {}
for id_ in set(requirements_data["set_id"]):
    sublist = requirements_data[requirements_data["set_id"] == id_]
    data_dict[id_] = [req for req in sublist["text"]]

## The following function performs a set transformation. It partitions objects of the set  𝑇  to the sets  𝑂𝑇  and  𝑇∖𝑂𝑇 according to steps (4) and (6) from section (6.2)

In [20]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    
    #compliant wit section 5.2: terms_that_contain_abbreviations = T \ OT
    terms_that_contain_abbreviations = set()
    
    for term in set_of_terms:
        for abb in set_of_abbreviations:
            if abb in term.split():
                terms_that_contain_abbreviations.add(term)
    
    ordinary_terms = set_of_terms - terms_that_contain_abbreviations
    
    return ordinary_terms, terms_that_contain_abbreviations              

## Here we generate AEP Candidates and AEP groups with the different approches/ aep_classifiers

In [21]:
def generate_aep_candidates_and_groups(aep_classifier):
    counter = 0
    overall_aep_candidates = []
    AEP_candidate_clusters ={}
    for id_ in data_dict.keys():

        ######### Step(1) + Step(3): Extract set of Abbreviations A and set of terms T############

        terms = set()
        abbv_set = set()
        # print("TUPLES FROM ReqSet: " + str(id_))
        for req in data_dict[id_]:
            terms = terms.union(Abbreviation_and_NC_Extraction.nc_detect(req))
            abbv_set = abbv_set.union(Abbreviation_and_NC_Extraction.abbv_detect(req))

        ############ step(2): Reduce extracted abbreviations set A through cmparision with #######
        ############ project resources so that only undefined abbreviations stay in A ############



        ############################ step(4): determine the sets A, OT and T\OT ##################
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(abbv_set, terms)


        # For every a∈ A generate an AEP group G^{a} of possible expansions t ∈ OT via ILLOD.####
        ###################################### step(5): ##########################################
        abbreviations_with_matching_candidates = set()
        for abv in abbv_set:
            for term in ordinary_terms:
                if aep_classifier(abv, term):
                    overall_aep_candidates.append((abv, term.lower()))
                    counter += 1
                    abbreviations_with_matching_candidates.add(abv)
                    #print(str(counter)+ ") (" + abv + ", " + term + ")")
                    if id_ in AEP_candidate_clusters.keys():
                        if abv in AEP_candidate_clusters[id_]:
                            expansion_candidates_list = AEP_candidate_clusters[id_][abv]
                            expansion_candidates_list.append(term)
                            AEP_candidate_clusters[id_][abv] = expansion_candidates_list
                        else:
                            AEP_candidate_clusters[id_][abv] = [term]
                    else:
                        AEP_candidate_clusters[id_] = {}

        # For every a∈ A extend its G^{a} with terms t∈ T\OT, if t contains a.####################
        ######################################### step(6): #######################################
        for abv in abbreviations_with_matching_candidates:
            for term in terms_that_contain_abbs:
                if abv in term.split() and abv != term:
                    if id_ in AEP_candidate_clusters.keys():
                        if abv in AEP_candidate_clusters[id_]:
                            expansion_candidates_list = AEP_candidate_clusters[id_][abv]
                            expansion_candidates_list.append(term)
                            AEP_candidate_clusters[id_][abv] = expansion_candidates_list
                        else:
                            AEP_candidate_clusters[id_][abv] = [term]
                    else:
                        AEP_candidate_clusters[id_] = {}
        #print("#####################################################")
    print("number of aep candidates: " + str(counter))

    cluster_counter = 0
    for id_ in AEP_candidate_clusters.keys():
        # print("CLUSTERS FROM ReqSet: " + str(id_) + ":")
        for key in AEP_candidate_clusters[id_]:
            cluster_counter += 1
            # print(str(cluster_counter) + ") " + "\"" + str(key)+ "\"" + " : " + str(AEP_candidate_clusters[id_][key]))
        # print("#####################################################")
    print("number of aep grous: " + str(cluster_counter))
    return overall_aep_candidates

def evaluate_aep_detection_approach(aep_classifier):
    tuple_collection_for_evaluation = generate_aep_candidates_and_groups(aep_classifier)
    inserted_abbreviations = pd.read_csv('insertedAbbreviations.txt', names=['expansion', 'abbreviation'], sep='\t', encoding='utf8')
    inserted_abbs = inserted_abbreviations["abbreviation"].tolist()
    inserted_exp = inserted_abbreviations["expansion"].tolist()
    inserted_abbreviations_as_tuples = [(inserted_abbs[i], inserted_exp[i]) for i in range(0, len(inserted_exp))]

    def contains_exp(proper_tuple, tup):
        cont = True
        for t in proper_tuple[1].split():
            exp_candidate_lower = tup[1].lower()
            if t not in exp_candidate_lower.split():
                cont = False
        if cont:
            return True
        else:
            return False

    matches_for_final_eval = 0
    resolved_abbs = set()
    print("FOUND AEPs:")
    for tup in tuple_collection_for_evaluation:
        for proper_tuple in inserted_abbreviations_as_tuples:
            if tup == proper_tuple and contains_exp(proper_tuple, tup):
                if tup[0] not in resolved_abbs:
                    print(tup)
                    resolved_abbs.add(tup[0])
                    matches_for_final_eval += 1
    print(matches_for_final_eval)

## Main Program to count the bumber of generated AEP candidates, AEP groups and to show corectly detected AEPs

In [28]:
# Evaluate ILLOD
evaluate_aep_detection_approach(ILLOD.illod)

number of aep candidates: 115
number of aep grous: 49
FOUND AEPs:
('cT', 'current time')
('RT', 'realtor')
('SR', 'search result')
('DS', 'disputes system')
('NSM', 'nursing staff member')
('DC', 'dispute case')
('Csi', 'clinical site')
('RF', 'repair facility')
('rP', 'recycled part')
('CE', 'collision estimate')
('AR', 'audit report')
('SP', 'search parameter')
('CR', 'conference room')
('sI', 'substitutionary ingredient')
('PF', 'product formula')
('IQA', 'inventory quantity adjustment')
('Sys', 'system')
('sMo', 'streaming movie')
('CC', 'credit card')
('LeSco', 'lead score')
('WES', 'web service')
('LeDA', 'lead data')
('oP', 'offensive player')
('dG', 'defensive grid')
('STAT', 'status')
25


In [29]:
# Evaluate Levenshtein-Distance_Classifier
evaluate_aep_detection_approach(Syntactic_Classifiers.ld_classifier)

number of aep candidates: 870
number of aep grous: 70
FOUND AEPs:
('cT', 'current time')
('RT', 'realtor')
('SR', 'search result')
('DS', 'disputes system')
('NSM', 'nursing staff member')
('DC', 'dispute case')
('Csi', 'clinical site')
('RF', 'repair facility')
('rP', 'recycled part')
('CE', 'collision estimate')
('AR', 'audit report')
('SP', 'search parameter')
('CR', 'conference room')
('sI', 'substitutionary ingredient')
('PF', 'product formula')
('IQA', 'inventory quantity adjustment')
('sMo', 'streaming movie')
('CC', 'credit card')
('WES', 'web service')
('oP', 'offensive player')
('dG', 'defensive grid')
21


In [30]:
# Evaluate Jaro-Winkler-Similarity_Classifier
evaluate_aep_detection_approach(Syntactic_Classifiers.jws_classifier)

number of aep candidates: 251
number of aep grous: 63
FOUND AEPs:
('cT', 'current time')
('RT', 'realtor')
('SR', 'search result')
('DS', 'disputes system')
('NSM', 'nursing staff member')
('DC', 'dispute case')
('Csi', 'clinical site')
('RF', 'repair facility')
('rP', 'recycled part')
('CE', 'collision estimate')
('AR', 'audit report')
('SP', 'search parameter')
('CR', 'conference room')
('sI', 'substitutionary ingredient')
('PF', 'product formula')
('IQA', 'inventory quantity adjustment')
('sMo', 'streaming movie')
('CC', 'credit card')
('LeSco', 'lead score')
('LeDA', 'lead data')
('oP', 'offensive player')
('dG', 'defensive grid')
22


In [31]:
# Evaluate Dice_Coefficient-Classiifer
evaluate_aep_detection_approach(Syntactic_Classifiers.dc_classifier)

number of aep candidates: 258
number of aep grous: 65
FOUND AEPs:
('cT', 'current time')
('SR', 'search result')
('DS', 'disputes system')
('NSM', 'nursing staff member')
('DC', 'dispute case')
('Csi', 'clinical site')
('RF', 'repair facility')
('rP', 'recycled part')
('CE', 'collision estimate')
('AR', 'audit report')
('SP', 'search parameter')
('CR', 'conference room')
('sI', 'substitutionary ingredient')
('PF', 'product formula')
('IQA', 'inventory quantity adjustment')
('sMo', 'streaming movie')
('CC', 'credit card')
('WES', 'web service')
('oP', 'offensive player')
('dG', 'defensive grid')
20
