# Integration of ILLOD into the clustering of glossary term candidates (Section 6.2, Figure 2)

In [26]:
import pandas as pd
import Syntactic_Classifiers
import ILLOD
import Abbreviation_and_NC_Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

## Helper Functions for Clustering

In [27]:
def create_cluster_dict(prediction_list, terms_list):
    """Group terms by the cluster id predicted for them.

    Args:
        prediction_list: Sequence of cluster labels; entry ``i`` is the label
            of ``terms_list[i]``. Every label is converted with ``int()``.
        terms_list: Indexable sequence of terms, aligned with
            ``prediction_list``.

    Returns:
        dict mapping each integer cluster id to the list of its terms, with
        the terms kept in the order in which they appear in ``terms_list``.
    """
    clusters = {}
    for position, label in enumerate(prediction_list):
        cluster_key = int(label)
        member = terms_list[position]
        # setdefault creates the bucket on first sight of a cluster id.
        clusters.setdefault(cluster_key, []).append(member)
    return clusters

## Reading the content of the PROMISE requirements. 30 abbreviations have been inserted in the texts. We try to identify them.

In [28]:
# Load the constructed PROMISE data set. The file is read without a header row;
# the two semicolon-separated columns are named 'text' and 'set_id'.
requirements_data = pd.read_csv(
    'promise_constructed.CSV',
    sep=';',
    names=['text', 'set_id'],
    encoding='utf8',
)
# print(requirements_data)

In [29]:
# Build a dictionary that maps every requirements-set id to the list of
# requirement texts belonging to that set.
data_dict = {}
set_id_column = requirements_data["set_id"]
for id_ in set(set_id_column):
    sublist = requirements_data[set_id_column == id_]
    data_dict[id_] = list(sublist["text"])

# The remainder of the notebook works on requirements set number 5
# (auto part finder system).
test_reqs = data_dict[5]

## The following function performs a set transformation. It partitions the objects of the set 𝑇 into the sets 𝑂𝑇 and 𝑇∖𝑂𝑇 according to steps (4) and (6) from Section 6.2, Figure 2.

In [30]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    """Partition the terms T into ordinary terms (OT) and the remaining terms.

    A term belongs to the "contains an abbreviation" set when at least one of
    its whitespace-separated tokens is equal to one of the abbreviations.
    All other terms are ordinary terms.

    Args:
        set_of_abbreviations: Iterable of abbreviation strings (the set A).
        set_of_terms: Iterable of term strings (the set T).

    Returns:
        Tuple ``(ordinary_terms, terms_that_contain_abbreviations)``; both
        elements are sets, and together they contain exactly the given terms.
    """
    abbreviations = set(set_of_abbreviations)
    all_terms = set(set_of_terms)

    # compliant with section 5.2: terms_that_contain_abbreviations = T \ OT
    terms_that_contain_abbreviations = {
        term for term in all_terms
        if not abbreviations.isdisjoint(term.split())
    }

    # Subtract from the *parameter*, not from a notebook-level global, so the
    # function works independently of the surrounding cell state.
    ordinary_terms = all_terms - terms_that_contain_abbreviations

    return ordinary_terms, terms_that_contain_abbreviations

## Generation of clusters according to steps (1) through (8) (Section 6.2, Figure 2).

In [31]:
######## Step(1) + Step(3): Extract set of Abbreviations A and set of terms T ############
# Both extractors are applied requirement by requirement; whatever they return
# for a single requirement is merged into the corresponding overall set.
terms = set()
abbv_set = set()
for req in test_reqs:
    terms.update(Abbreviation_and_NC_Extraction.nc_detect(req))
    abbv_set.update(Abbreviation_and_NC_Extraction.abbv_detect(req))


########## placeholder for step(2): Reduce extracted abbreviations set A through #########
#########  comparison with project resources so that only undefined abbreviations stay in A


###################### step(4): determine the sets A, OT and T\OT ########################
ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
    set_of_abbreviations=abbv_set,
    set_of_terms=terms,
)


###################################### step(5): ##########################################
# For every a ∈ A generate an AEP group G^{a} of possible expansions t ∈ OT via ILLOD.
# AEP_candidate_clusters maps an abbreviation to the list of ordinary terms that
# ILLOD accepted for it; abbreviations without any accepted term get no entry.
AEP_candidate_clusters = {}
abbreviations_with_matching_candidates = set()
for abv in abbv_set:
    for term in ordinary_terms:
        if not ILLOD.illod(abv, term):
            continue
        abbreviations_with_matching_candidates.add(abv)
        AEP_candidate_clusters.setdefault(abv, []).append(term)

######################################### step(6): #######################################
# Extend the AEP group G^{a} with terms t ∈ T\OT that contain a as one of their
# whitespace-separated tokens. Only abbreviations that received at least one
# expansion candidate in the previous step are considered, and a term that is
# identical to the abbreviation itself is skipped.
for abv in abbreviations_with_matching_candidates:
    for term in terms_that_contain_abbs:
        if abv == term or abv not in term.split():
            continue
        AEP_candidate_clusters.setdefault(abv, []).append(term)

####################### step(7): generate clusters for terms from OT #####################
# Materialise OT once so that the TF-IDF rows, the DataFrame index and the
# label-to-term mapping below all share one and the same term order.
ordinary_terms_list = list(ordinary_terms)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(ordinary_terms_list)

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() and fall back only on old installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

X = pd.DataFrame(tfidf_matrix.toarray(),
                 index=ordinary_terms_list, columns=feature_names)

# NOTE: no random_state is passed, so the cluster assignment may differ
# between runs.
gmm = GaussianMixture(n_components=16).fit(X)
pred = gmm.predict(X)
cluster_dict = create_cluster_dict(pred, ordinary_terms_list)



## step(8): Add AEP groups as additional clusters to the clusters of the ordinary terms ##
# Each AEP group is stored under its abbreviation as the cluster key, and the
# abbreviation itself is appended to the group as a member. The list object is
# shared with AEP_candidate_clusters, so the append is visible there as well.
for key, aep_group in AEP_candidate_clusters.items():
    aep_group.append(key)
    cluster_dict[key] = aep_group

    
################################# print on terminal ######################################
# One cluster per section: the cluster key, then its members, then a separator.
for key, cluster_members in cluster_dict.items():
    print(str(key))
    print(cluster_members)
    print("############################################################")

1
['environment', 'supervisor role', 'city', 'list', 'miles', 'audit', 'initial launch', 'Sarbanes - Oxley', 'invalid datum', '90 %', 'claim processing', 'street address', 'instruction', 'appearance', 'maintenance', 'adjuster', 'insurance companys claim datum', '98 %', '95 %', 'indivual line item', 'denial', 'other adjuster', 'feed', 'computer virus', 'access', '1 month', '80 %', 'category', 'insurance regulation', '85 %', 'product installation', 'application', 'approximately 1:00 am']
############################################################
6
['estimator', 'estimate', '80 % of Collision Estimators', 'Collision Estimators', 'collision estimate', 'available recycled part and collision estimate', 'recycled part audit of collision estimate', 'productivity of Collision Estimators', 'collision estimator', 'only collision estimator', 'estimate assignment', 'adjuster and Collision Estimators', 'collision estimator role']
############################################################
4
['hig