# Evaluation of the Different Approaches to AEP Detection on the Promise dataset (Section 5.5)

In [1]:
import pandas as pd
import Syntactic_Classifiers
import ILLOD
import Abbreviation_and_NC_Extraction

## Reading the content of the PROMISE requirements. 30 abbreviations have been inserted into the texts. We want to try to identify them.

In [2]:
# Load the constructed PROMISE requirements: one requirement text per row,
# together with the id of the requirement set it belongs to.
requirements_data = pd.read_csv(
    'promise_constructed.CSV',
    sep=';',
    names=['text', 'set_id'],
    encoding='utf8',
)

In [3]:
# Group the requirement texts by requirement set:
# data_dict maps every distinct set_id to the list of its requirement texts.
data_dict = {
    id_: list(requirements_data[requirements_data["set_id"] == id_]["text"])
    for id_ in set(requirements_data["set_id"])
}

## The following function performs a set transformation. It partitions the objects of the set $T$ into the sets $OT$ and $T \setminus OT$ according to steps (4) and (6) from section (6.2)

In [4]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    """Partition the terms T into ordinary terms OT and the terms T \\ OT.

    A term belongs to T \\ OT when at least one of its whitespace-separated
    tokens is equal to one of the given abbreviations; every other term is an
    ordinary term.

    Parameters
    ----------
    set_of_abbreviations : iterable of str
        The abbreviations A.
    set_of_terms : iterable of str
        The terms T to partition.

    Returns
    -------
    tuple of (set, set)
        ``(ordinary_terms, terms_that_contain_abbreviations)``, i.e.
        ``(OT, T \\ OT)``.
    """
    abbreviations = set(set_of_abbreviations)
    all_terms = set(set_of_terms)

    # compliant with section 5.2: terms_that_contain_abbreviations = T \ OT
    # Each term is tokenized once; isdisjoint() is False as soon as one token
    # is an abbreviation.
    terms_that_contain_abbreviations = {
        term for term in all_terms
        if not abbreviations.isdisjoint(term.split())
    }

    # Derive OT from the *argument*, not from a global variable named `terms`,
    # so the function works for any input and in any execution order.
    ordinary_terms = all_terms - terms_that_contain_abbreviations

    return ordinary_terms, terms_that_contain_abbreviations

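## Main program to generate AEP candidates and AEP groups. The classifier to evaluate is selected in step (5) of the cell below: keep exactly one of the `ILLOD.illod` / `Syntactic_Classifiers.*` conditions active and leave the others commented out.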

In [5]:
# Results accumulated over all requirement sets:
counter = 0                            # running number of AEP candidate tuples found
tuple_collection_for_evaluation = []   # (abbreviation, lower-cased term) pairs, read by the evaluation cell
AEP_candidate_clusters = {}            # set_id -> {abbreviation -> [expansion candidates]}
for id_ in data_dict.keys():

    ######### Step(1) + Step(3): Extract set of Abbreviations A and set of terms T############

    terms = set()
    abbv_set = set()
    print("TUPLES FROM ReqSet: " + str(id_))
    for req in data_dict[id_]:
        terms.update(Abbreviation_and_NC_Extraction.nc_detect(req))
        abbv_set.update(Abbreviation_and_NC_Extraction.abbv_detect(req))

    ############ step(2): Reduce extracted abbreviations set A through comparison with #######
    ############ project resources so that only undefined abbreviations stay in A ############
    # (no reduction is performed in this notebook)


    ############################ step(4): determine the sets A, OT and T\OT ##################
    ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(abbv_set, terms)


    # For every a ∈ A generate an AEP group G^{a} of possible expansions t ∈ OT via ILLOD.####
    ###################################### step(5): ##########################################
    abbreviations_with_matching_candidates = set()
    for abv in abbv_set:
        for term in ordinary_terms:
            # Exactly one of the following classifier conditions must be active.
            if ILLOD.illod(abv, term):
            # if Syntactic_Classifiers.jws_classifier(abv, term):
            # if Syntactic_Classifiers.dc_classifier(abv, term):
            # if Syntactic_Classifiers.ld_classifier(abv, term):
                tuple_collection_for_evaluation.append((abv, term.lower()))
                counter += 1
                abbreviations_with_matching_candidates.add(abv)
                print(str(counter)+ ") (" + abv + ", " + term + ")")
                # setdefault creates the per-set dict and the per-abbreviation
                # list on first use and still records the current term, so the
                # first match of a requirement set is no longer dropped.
                AEP_candidate_clusters.setdefault(id_, {}).setdefault(abv, []).append(term)

    # For every a ∈ A extend its G^{a} with terms t ∈ T\OT, if t contains a.##################
    ######################################### step(6): #######################################
    for abv in abbreviations_with_matching_candidates:
        for term in terms_that_contain_abbs:
            # Skip the term that consists of nothing but the abbreviation itself.
            if abv in term.split() and abv != term:
                AEP_candidate_clusters.setdefault(id_, {}).setdefault(abv, []).append(term)
    print("#####################################################")

# Print every AEP group, numbered consecutively across all requirement sets.
cluster_counter = 0
for id_, clusters_of_set in AEP_candidate_clusters.items():
    print("CLUSTERS FROM ReqSet: " + str(id_) + ":")
    for key, expansion_candidates in clusters_of_set.items():
        cluster_counter += 1
        numbered_label = str(cluster_counter) + ") " + "\"" + str(key)+ "\""
        print(numbered_label + " : " + str(expansion_candidates))
    print("#####################################################")


TUPLES FROM ReqSet: 1
1) (MDI, modification)
2) (MDI, modification of display)
3) (cT, chart)
4) (cT, current time)
5) (cT, cT.)
6) (PC, product)
#####################################################
TUPLES FROM ReqSet: 2
7) (CE, client)
8) (CMA, contact information)
9) (SR, seller)
10) (SR, search result)
11) (RT, realtor)
#####################################################
TUPLES FROM ReqSet: 3
12) (Dr, department)
13) (PoS, possibility)
14) (PoS, Program Administrators)
15) (PoS, portion of system)
#####################################################
TUPLES FROM ReqSet: 4
16) (TR, type of retrieval)
17) (TR, type of transaction)
18) (TR, transaction)
19) (TR, transaction and industry)
20) (Lab, lab)
21) (Lab, labs)
22) (RTR, retrieval)
23) (DS, department / section)
24) (DS, dispute)
25) (DS, Disputes System)
26) (DS, database)
27) (DS, dispute system)
28) (In, illness)
29) (In, instructor)
30) (In, information)
31) (In, interface creation)
32) (In, industry)
33) (NSM, nursing St

## Evaluation of AEP Detection: Counting detected AEPs

In [6]:
# Load the inserted abbreviations: a tab-separated file whose first column is
# read as the expansion and whose second column is read as the abbreviation.
inserted_abbreviations = pd.read_csv(
    'insertedAbbreviations.txt',
    sep='\t',
    names=['expansion', 'abbreviation'],
    encoding='utf8',
)
inserted_abbs = inserted_abbreviations["abbreviation"].tolist()
inserted_exp = inserted_abbreviations["expansion"].tolist()
# Pair each abbreviation with its expansion as (abbreviation, expansion).
inserted_abbreviations_as_tuples = list(zip(inserted_abbs, inserted_exp))

def contains_exp(proper_tuple, tup):
    """Check whether a candidate expansion contains every word of the expected one.

    Both arguments are (abbreviation, expansion) pairs. The expansion of
    ``proper_tuple`` is split on whitespace and used as is; the expansion of
    ``tup`` is lower-cased before it is split.

    Returns True if each token of ``proper_tuple[1]`` occurs among the tokens
    of the lower-cased ``tup[1]`` (trivially True when ``proper_tuple[1]`` has
    no tokens), otherwise False.
    """
    expected_tokens = proper_tuple[1].split()
    if not expected_tokens:
        # Nothing to look for: the candidate is accepted without inspection.
        return True
    candidate_tokens = tup[1].lower().split()
    return all(token in candidate_tokens for token in expected_tokens)
    
# Count the inserted abbreviations for which the correct expansion was found.
# Every abbreviation is counted (and printed) at most once.
matches_for_final_eval = 0
resolved_abbs = set()
for tup in tuple_collection_for_evaluation:
    for proper_tuple in inserted_abbreviations_as_tuples:
        is_correct_pair = tup == proper_tuple and contains_exp(proper_tuple, tup)
        if not is_correct_pair:
            continue
        if tup[0] in resolved_abbs:
            # This abbreviation has already been counted.
            continue
        print(tup)
        resolved_abbs.add(tup[0])
        matches_for_final_eval += 1
print(matches_for_final_eval)

('cT', 'current time')
('SR', 'search result')
('RT', 'realtor')
('DS', 'disputes system')
('NSM', 'nursing staff member')
('Csi', 'clinical site')
('DC', 'dispute case')
('CE', 'collision estimate')
('RF', 'repair facility')
('rP', 'recycled part')
('AR', 'audit report')
('SP', 'search parameter')
('CR', 'conference room')
('PF', 'product formula')
('sI', 'substitutionary ingredient')
('IQA', 'inventory quantity adjustment')
('Sys', 'system')
('sMo', 'streaming movie')
('CC', 'credit card')
('LeSco', 'lead score')
('LeDA', 'lead data')
('WES', 'web service')
('dG', 'defensive grid')
('oP', 'offensive player')
('STAT', 'status')
25
