# Intgration of ILLOD into GTE:  Clustering Experiment  (Section 5.2)

In [1]:
import spacy
import string
import pandas as pd
import jellyfish
import random
nlp = spacy.load("en_core_web_sm")
import pandas as pd
from spacy.matcher import Matcher
import re
import nltk
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

## Reading the content of the PROMISE requirements. 30 abbreviations have been inserted in the texts. We want try to identify them.

In [2]:
requirements_data = pd.read_csv('promise_constructed.CSV', names=['text', 'set_id'], sep=';', encoding='utf8')
print(requirements_data)

                                                  text  set_id
1    The system shall refresh the display every 60 ...       1
2    The application shall match the color of the s...       1
3    If projected  the data must be readable.  On a...       1
4    The product shall be available during normal b...       1
5    If projected  the data must be understandable....       1
..                                                 ...     ...
621  User access should be limited to the permissio...      15
622  The product must comply with the intranet page...      15
623   The intranet pages should display appropriate...      15
624  The users should be able to easily use the sys...      15
625  The product interface should be fast. The resp...      15

[625 rows x 2 columns]


In [3]:
# Preparation of the data with the aim of storing it in a dictionary
data_dict = {}
for id_ in set(requirements_data["set_id"]):
    sublist = requirements_data[requirements_data["set_id"] == id_]
    data_dict[id_] = [req for req in sublist["text"]]

In [4]:
test_reqs = data_dict[5]

## NC and Abbreviation Extraction

### Helper Functions to extract noun chunks (NCs) and abbreviations

In [5]:
stop_words = ["the", "and", "i", "for", "as", "an", "a", "if", "any", "all", "one", "on", "new", "out", "we", "to", "at", "by", "from"]

In [6]:
def portion_of_capital_letters(w):
    upper_cases = ''.join([c for c in w if c.isupper()])
    return len(upper_cases)/len(w)

In [7]:
def abbv_detect(sent):
    abv = set()
    for word in sent.split():
        if (len(word) <= 13 and portion_of_capital_letters(word) >= 0.29):
            if len([c for c in word if c.isupper()]) == 1 and word[0].isupper() and word.lower() in stop_words:
                continue
            abv.add(word.strip(punctuation))
    return abv

In [8]:
def normalize_nc(nc):
    doc = nlp(nc)
    cleaned_nc = ""
    for token in doc:
        if token.pos_ != "DET":
            cleaned_nc = cleaned_nc + " " + token.lemma_
            cleaned_nc = re.sub(r"[\([{})\]]", "", cleaned_nc)
            cleaned_nc = cleaned_nc.strip()
    return cleaned_nc

Extraction of noun chunks according to [2] (Arora, Chetan, et al. "Automated extraction and clustering of requirements glossary terms." IEEE Transactions on Software Engineering 43.10 (2016): 918-945). Some Pos-Tag-Patterns are added to the NC detection to improve recall of spacy package

In [9]:
def nc_detect(req):
    noun_chunks_set = set()
    matcher = Matcher(nlp.vocab)
    pattern1 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}]
    pattern2 = [{'POS': 'PROPN'}, {'POS': 'NOUN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]
    pattern3 = [{'POS': 'NOUN'}, {'POS': 'DET'}, {'POS': 'NOUN'}]
    pattern4 = [{'POS': 'NOUN'}, {'POS': 'PROPN'}]
    pattern5 = [{'POS': 'NOUN'}]
    matcher.add("TrigramNCs", [pattern1, pattern2, pattern3, pattern4, pattern5])
    doc = nlp(req)
    matches = matcher(doc)
    for nc_ in doc.noun_chunks:
        noun_chunks_set.add(nc_.text)
    

    composed_terms = set()
    for nc1 in noun_chunks_set:
        for nc2 in noun_chunks_set:
            comp_term1 = nc1 + " of " + nc2
            comp_term2 = nc1 + " and " + nc2
            if comp_term1 in req:
                composed_terms.add(comp_term1)
            if comp_term2 in req:
                composed_terms.add(comp_term2)
    found_terms = noun_chunks_set.union(composed_terms)
    
    cleaned_terms = []
    for t in found_terms:
        cleaned_terms.append(normalize_nc(t))
    return set(cleaned_terms)

## ILLOD with its methods (Section 4.3)

In [10]:
def clear_special_characters(s1, s2):
    invalidcharacters = set(string.punctuation)
    if any(char in invalidcharacters for char in s1):
        s1_ = s1.lower().translate(str.maketrans('', '', string.punctuation))
    else:
        s1_ = s1
    if any(char in invalidcharacters for char in s2):
        s2_ = s2.lower().translate(str.maketrans('', '', string.punctuation))
    else:
        s2_ = s2
    return s1_, s2_

def stop_words_handling(term):
    splitted_term = term.split()
    stop_words = set(["for", "and", "of", "in", "via", "be"])
    
    if splitted_term[0] in stop_words:
        stop_words = stop_words - set([splitted_term[0]])
                
    for sw in stop_words:
        while sw in splitted_term:
            splitted_term.remove(sw)
    sanitized_term = " ".join([w for w in splitted_term]) 
        
    return sanitized_term

def clean_string_pair_and_reduce_expansion(abb, term):
    abb_lower = abb.lower()
    term_lower = term.lower()
    sanitized_abbv, sanitized_term = clear_special_characters(abb_lower, term_lower) 
    sanitized_term_without_stopswords = stop_words_handling(sanitized_term)
    initial_letters_of_tokens_of_sanitized_term_without_stopswords = ''.join([c[0] for c in sanitized_term_without_stopswords.split()])
    return sanitized_abbv, initial_letters_of_tokens_of_sanitized_term_without_stopswords

def check_initial_letters(a, t):
    initial_letters_of_tokens_of_t = ''.join([c[0] for c in t.split()])
    if initial_letters_of_tokens_of_t == a or initial_letters_of_tokens_of_t.upper() == a:
        return True
    
def check_length_consistency(a, t):
    length_consistency = False
    if len(t.split()) <= len(a):
        length_consistency = True
    return length_consistency

def check_order(a, t):
    abbv_reversed = a.lower()[::-1]
    term_reversed = t.lower()[::-1]
    len_of_term = len(t)
    
    pos_memory = 0
    pos_memory_list = []
    order_matching_string_rev = ""
    
    for j, char_from_abbv in enumerate(abbv_reversed):
        if j == len(abbv_reversed) - 1 and len(pos_memory_list) > 0 and pos_memory == len(term_reversed):
            break
        else:
            for i, char_from_term in enumerate(term_reversed[pos_memory:]):
                if char_from_abbv == char_from_term:
                    order_matching_string_rev = order_matching_string_rev + char_from_abbv
                    pos_memory = pos_memory + i + 1
                    pos_memory_list.append(len_of_term - pos_memory)
                    break
    if order_matching_string_rev == abbv_reversed:
        return True, pos_memory_list[::-1]
    else:
        return False, []

def check_distribution_of_matching_characters(pos_of_chars_list, t):
    term_intervals = []
    len_of_term = len(t)
    i = 0
    while i < len_of_term:
        sublist = []
        j = i
        while j < len_of_term and t[j] != " ":
            sublist.append(j)
            j = j+ 1
        i = j+1
        term_intervals.append(sublist)
        
    splitted_term = t.split()      
    
    containment_list = []
    for i, interval in enumerate(term_intervals):
        contanment_sublist = []
        for pos in pos_of_chars_list:
            if (pos in interval) and (splitted_term[i][0] == t[pos]):
                contanment_sublist.append(0)
            elif pos in interval:
                contanment_sublist.append(interval.index(pos))
        if len(contanment_sublist) == 0:
            contanment_sublist.append(-1)
        containment_list.append(contanment_sublist)
    
    result_of_distribution_check = False
    if len(containment_list) <= 1:
        result_of_distribution_check = True
    elif len (containment_list) >= 2:
        non_zero_count = 0
        for sublist in containment_list[1:]:
            if len(sublist) == 1 and 0 not in sublist:
                non_zero_count += 1
        if non_zero_count == 0:
            result_of_distribution_check = True
    
    return result_of_distribution_check


def illod(abbv, term, threshold=None):
    if (abbv[0].lower() == term[0].lower()):
        
        
        ###################################### Step (a) ##########################################
        # check wether initial letters of tokens in t match with the letters in abbreviation
        if check_initial_letters(abbv, term):
            return True
        
        
        
        ###################################### Step (b) ##########################################
        # clean abbreviation and term from special characters and stopwords
        a_, t_ = clean_string_pair_and_reduce_expansion(abbv, term)
        if a_ == t_:
            return True
        
        sanitized_abbv, sanitized_term = clear_special_characters(abbv, term) 
        sanitized_term_without_stopswords = stop_words_handling(sanitized_term)
        sanitized_term_without_stopswords_splitted  = sanitized_term_without_stopswords.split()
        
        ###################################### Step (c), (d), (e) #################################
        # Sequential call of the methods that check and compare lengths, order and distribution of characters
        length_consistency = check_length_consistency(sanitized_abbv, sanitized_term_without_stopswords)
        order, pos_of_chars_list = check_order(sanitized_abbv, sanitized_term_without_stopswords)
        distribution = check_distribution_of_matching_characters(pos_of_chars_list, sanitized_term_without_stopswords)


        if length_consistency and order and distribution:
            return True
        else:
            return False

        ################################## in case first letter differs ###########################
    else:
        return False

## Helper Functions for Clustering

In [11]:
def tokenizer(keyword):
    return nltk.word_tokenize(keyword)

In [12]:
def create_cluster_dict (prediction_list, terms_list):
    dict_for_mapping = {}
    for i, cluster_id in enumerate(prediction_list):
        if int(cluster_id) in dict_for_mapping:
            tmp = dict_for_mapping[int(cluster_id)]
            tmp.append(terms_list[i])
            dict_for_mapping[int(cluster_id)] = tmp
        else:
            dict_for_mapping[int(cluster_id)] = [terms_list[i]]
    return dict_for_mapping

### The following function performs a set transformation. It distributes objects of the sets $A$ and $T$ to the sets $OT$ and $G^{a}$ according to steps (4) and (6) from section (5.2) 

In [13]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    
    #compliant wit section 5.2: terms_that_contain_abbreviations = T \ OT
    terms_that_contain_abbreviations = set()
    
    for term in set_of_terms:
        for abb in set_of_abbreviations:
            if abb in term.split():
                terms_that_contain_abbreviations.add(term)
    
    ordinary_terms = terms - terms_that_contain_abbreviations
    
    return ordinary_terms, terms_that_contain_abbreviations       

## Generation of Clusters according to steps(1) till step(8) (Section 5.2) 

In [14]:
######## Step(1) + Step(3): Extract set of Abbreviations A and set of terms T ############
terms = set()
abbv_set = set()
for req in test_reqs:
    terms = terms.union(nc_detect(req))
    abbv_set = abbv_set.union(abbv_detect(req))


########## placeholder for step(2): Reduce extracted abbreviations set A through #########
#########  cmparision with project resources so that only undefined abbreviations stay in A


###################### step(4): determine the sets A, OT and T\OT ########################
ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(abbv_set, terms)




###################################### step(5): ##########################################
AEP_candidate_clusters = {}
abbreviations_with_matching_candidates = set()
for abv in abbv_set:
    for term in ordinary_terms:
        if illod(abv, term):
            abbreviations_with_matching_candidates.add(abv)
            if abv in AEP_candidate_clusters:
                expansion_candidates_list = AEP_candidate_clusters[abv]
                expansion_candidates_list.append(term)
                AEP_candidate_clusters[abv] = expansion_candidates_list
            else:
                AEP_candidate_clusters[abv] = [term]

######################################### step(6): #######################################
for abv in abbreviations_with_matching_candidates:
    for term in terms_that_contain_abbs:
        if abv in term.split() and abv != term:
            if abv in AEP_candidate_clusters:
                expansion_candidates_list = AEP_candidate_clusters[abv]
                expansion_candidates_list.append(term)
                AEP_candidate_clusters[abv] = expansion_candidates_list
            else:
                AEP_candidate_clusters[abv] = [term]

####################### step(7): generate clusters for terms from OT #####################
tfidf = TfidfVectorizer()
X = pd.DataFrame(tfidf.fit_transform(ordinary_terms).toarray(),
                 index=list(ordinary_terms), columns=tfidf.get_feature_names())
cluster_dict = {}
gmm = GaussianMixture(n_components=16).fit(X)
pred = gmm.predict(X)
cluster_dict = create_cluster_dict(pred, list(ordinary_terms))



## step(8): Add AEP groups as additional clusters to the clusters of the ordinary terms ##
for key in  AEP_candidate_clusters:
    tmp_list = AEP_candidate_clusters[key]
    tmp_list.append(key)
    cluster_dict[key] = tmp_list

    
################################# print on terminal ######################################
for key in cluster_dict.keys():
    print(str(key))
    print(cluster_dict[key])
    print("############################################################")

4
['audit report', 'damaged vehicle part information', 'original search result', '98 % uptime', '2 year', 'search result', 'save', 'list', 'initial launch', 'search', 'audit', 'service', 'zipcode', 'approximately 1:00 am', 'denial', 'supplied vehicle part', 'application', 'access', 'supervisor role', 'supplied vehicle part and supplier', 'vehicle vehicle location', 'vehicle year', 'percentage', '2 year of initial launch', 'denial of service', 'supplier', 'day', 'radius', 'environment', 'search radius', 'indivual line item', 'feed', 'miles', 'state', 'scale', '85 %', 'category', 'vehicle datum', '90 %', '98 %', '1 month', 'more than 2 %', 'city', 'vehicle location', 'training', 'instruction', 'middleware technology team']
############################################################
14
['average number', 'number', 'total number of recycled part', 'total score of audit', 'total number', 'total score']
############################################################
7
['80 % of Collision Estim