# Evaluation of Workflow

In [3]:
import spacy
import string
import pandas as pd
import jellyfish
import random
nlp = spacy.load("en_core_web_sm")
import pandas as pd
from spacy.matcher import Matcher

In [4]:
# Load the requirements file: one requirement text per row plus the id of
# the requirement set it belongs to (semicolon-separated, no header row).
requirements_data = pd.read_csv(
    'promise_constructed.CSV',
    names=['text', 'set_id'],
    sep=';',
    encoding='utf8',
)
reqs = list(requirements_data['text'].values)
list_of_id = list(requirements_data['set_id'].values)

# Group the requirement texts by set id, keeping file order within each set.
data_dict = {}
for j, id_ in enumerate(list_of_id):
    data_dict.setdefault(id_, []).append(reqs[j])

In [5]:
# Lowercase words that abbv_detect ignores when they appear capitalised
# (e.g. "The", "If") and would otherwise look like abbreviations.
stop_words = [
    "the", "and", "i", "for", "as", "an", "a", "if", "any", "all",
    "one", "on", "new", "out", "we", "to", "at", "by", "from",
]

In [6]:
def upper_ratio(w):
    """Return the fraction of characters in ``w`` that are uppercase.

    Args:
        w: The word to inspect.

    Returns:
        A float in [0, 1]. An empty string yields 0.0 instead of raising
        ZeroDivisionError.
    """
    if not w:
        return 0.0
    return sum(1 for c in w if c.isupper()) / len(w)

In [7]:
def clear_special_characters(s1, s2):
    """Strip ASCII punctuation from both strings.

    NOTE: a string is lowercased only when it actually contains punctuation;
    a string without any punctuation is returned unchanged, including its
    original casing.

    Args:
        s1: First string (typically the abbreviation).
        s2: Second string (typically the candidate expansion).

    Returns:
        A tuple ``(s1_, s2_)`` with the sanitized strings.
    """
    punctuation_chars = set(string.punctuation)
    strip_table = str.maketrans('', '', string.punctuation)

    def _sanitize(text):
        # Leave punctuation-free strings exactly as they are.
        if punctuation_chars.isdisjoint(text):
            return text
        return text.lower().translate(strip_table)

    return _sanitize(s1), _sanitize(s2)

def stop_words_handling(term):
    """Remove connector stop words from ``term``, except a leading one.

    The words "for", "and", "of", "in", "via" and "be" are dropped. If the
    term itself starts with one of them, that particular word is kept
    everywhere in the term. The comparison is case-sensitive.

    Args:
        term: Whitespace-separated term.

    Returns:
        The remaining words joined by single spaces. An empty or
        whitespace-only term yields "" instead of raising IndexError.
    """
    words = term.split()
    if not words:
        return ""

    removable = {"for", "and", "of", "in", "via", "be"}
    # A stop word in first position is part of the term, so keep it.
    removable.discard(words[0])

    return " ".join(w for w in words if w not in removable)

def clean_string_pair_and_reduce_expansion(abb, term):
    """Reduce ``term`` to the initials of its non-stop-word tokens.

    Both inputs are lowercased and stripped of punctuation first; stop words
    are then removed from the term before its initials are collected.

    Args:
        abb: The abbreviation.
        term: The candidate expansion.

    Returns:
        A tuple ``(sanitized_abbreviation, initials_of_reduced_term)``.
    """
    sanitized_abbv, sanitized_term = clear_special_characters(abb.lower(), term.lower())
    reduced_words = stop_words_handling(sanitized_term).split()
    initials = ''.join(word[0] for word in reduced_words)
    return sanitized_abbv, initials

def check_initial_letters(a, t):
    """Check whether the initials of the tokens of ``t`` spell ``a``.

    The initials are compared to ``a`` as they are and in uppercase.

    Args:
        a: The abbreviation.
        t: The candidate expansion (whitespace-separated tokens).

    Returns:
        True on a match, False otherwise (always a bool, never None).
    """
    initials = ''.join(word[0] for word in t.split())
    return initials == a or initials.upper() == a
    
def check_length_consistency(a, t):
    """Tell whether ``t`` has no more tokens than ``a`` has characters.

    Args:
        a: The abbreviation.
        t: The candidate expansion (whitespace-separated tokens).

    Returns:
        True if the token count of ``t`` is at most ``len(a)``, else False.
    """
    token_count = len(t.split())
    return token_count <= len(a)

def check_order(a, t):
    """Check that the characters of ``a`` occur in ``t`` in the same order.

    Matching is case-insensitive and runs from right to left: the last
    character of the abbreviation is matched to its last occurrence in the
    term, the one before it to the closest earlier occurrence, and so on.

    Args:
        a: The abbreviation.
        t: The candidate expansion.

    Returns:
        ``(True, positions)`` if every character of ``a`` was matched, where
        ``positions`` lists the matched indices into ``t`` in ascending
        order; ``(False, [])`` otherwise.
    """
    rev_abbv = a.lower()[::-1]
    rev_term = t.lower()[::-1]
    term_length = len(t)

    cursor = 0  # first index of rev_term that is still unconsumed
    matched_positions = []

    for ch in rev_abbv:
        hit = rev_term.find(ch, cursor)
        if hit == -1:
            # One unmatched character already rules out a full match.
            return False, []
        cursor = hit + 1
        # Translate the index in the reversed term back to an index in t.
        matched_positions.append(term_length - cursor)

    matched_positions.reverse()
    return True, matched_positions

def check_distribution_of_matching_characters(pos_of_chars_list, t):
    """Check how the matched characters are spread over the words of ``t``.

    For every word of ``t`` the offsets of the matched positions inside that
    word are collected. A matched character equal to the word's first
    character counts as offset 0. A word without any match is recorded as
    ``[-1]``. The check fails if any word after the first one has exactly
    one entry and that entry is not 0.

    Word boundaries are derived from whitespace in the same way as
    ``str.split()``. Repeated, leading or trailing whitespace therefore
    cannot misalign the words and their index ranges, which previously
    could raise IndexError.

    Args:
        pos_of_chars_list: Indices into ``t`` of the matched characters.
        t: The candidate expansion.

    Returns:
        True if the distribution is acceptable, False otherwise. Terms with
        at most one word always pass.
    """
    # Index range covered by each whitespace-delimited word of t.
    word_spans = []
    start = None
    for idx, ch in enumerate(t):
        if ch.isspace():
            if start is not None:
                word_spans.append(range(start, idx))
                start = None
        elif start is None:
            start = idx
    if start is not None:
        word_spans.append(range(start, len(t)))

    containment_list = []
    for span in word_spans:
        initial = t[span.start]
        offsets = [
            0 if t[pos] == initial else pos - span.start
            for pos in pos_of_chars_list
            if pos in span
        ]
        # -1 marks a word that contributed no matched character at all.
        containment_list.append(offsets or [-1])

    # The first word is exempt; each later word needs either several matches
    # or a single match on (a character equal to) its initial letter.
    return not any(
        len(offsets) == 1 and 0 not in offsets
        for offsets in containment_list[1:]
    )


def illod(abbv, term, threshold=None):
    """Decide whether ``term`` is a plausible expansion of ``abbv``.

    The pair is accepted only if both strings start with the same letter
    (case-insensitive) and one of these steps succeeds:

    (a) the initial letters of the tokens of ``term`` spell ``abbv``;
    (b) after removing punctuation and stop words, the initials of the
        remaining tokens equal the sanitized abbreviation;
    (c) length consistency, character order and character distribution
        all hold for the sanitized pair.

    Args:
        abbv: The abbreviation.
        term: The candidate expansion.
        threshold: Unused; kept so existing callers remain compatible.

    Returns:
        True if the pair is accepted, False otherwise. Empty inputs, and
        terms that contain no word once punctuation is removed, yield False
        instead of raising IndexError.
    """
    if not abbv or not term:
        return False
    if abbv[0].lower() != term[0].lower():
        return False

    # Step (a): initial letters of the tokens match the abbreviation.
    if check_initial_letters(abbv, term):
        return True

    sanitized_abbv, sanitized_term = clear_special_characters(abbv, term)
    if not sanitized_term.split():
        # Nothing but punctuation/whitespace: no expansion to compare with.
        return False

    # Step (b): compare against the initials of the stop-word-free term.
    a_, t_ = clean_string_pair_and_reduce_expansion(abbv, term)
    if a_ == t_:
        return True

    # Step (c): length, order and distribution of the matching characters.
    reduced_term = stop_words_handling(sanitized_term)
    length_consistency = check_length_consistency(sanitized_abbv, reduced_term)
    order, pos_of_chars_list = check_order(sanitized_abbv, reduced_term)
    distribution = check_distribution_of_matching_characters(pos_of_chars_list, reduced_term)

    return length_consistency and order and distribution

In [8]:
def normalize_nc(nc):
    """Return ``nc`` as space-joined token lemmas with determiners removed.

    Args:
        nc: Text of a noun chunk (or composed term).

    Returns:
        The lemmas of all tokens whose part-of-speech tag is not "DET",
        separated by single spaces and stripped at both ends.
    """
    normalized = ""
    for tok in nlp(nc):
        if tok.pos_ == "DET":
            continue
        # Strip after every step so the intermediate value stays trimmed.
        normalized = (normalized + " " + tok.lemma_).strip()
    return normalized

In [9]:
def abbv_detect(sent):
    """Collect the abbreviation candidates found in ``sent``.

    A whitespace-separated word is a candidate if it has at most 13
    characters and an uppercase ratio of at least 0.29. Words whose only
    uppercase letter is the first one and whose lowercase form is listed in
    ``stop_words`` are skipped.

    Args:
        sent: The sentence (requirement text) to scan.

    Returns:
        A set with the candidate words exactly as they appear in ``sent``.
    """
    candidates = set()
    for word in sent.split():
        if len(word) > 13 or upper_ratio(word) < 0.29:
            continue
        upper_count = sum(1 for c in word if c.isupper())
        is_capitalised_stop_word = (
            upper_count == 1 and word[0].isupper() and word.lower() in stop_words
        )
        if not is_capitalised_stop_word:
            candidates.add(word)
    return candidates

In [10]:
def nc_detect(req):
    """Extract normalized noun-chunk terms from a requirement text.

    The noun chunks reported by spaCy are collected first. Every ordered
    pair of chunks joined by " of " or " and " that occurs literally in
    ``req`` is added as a composed term. All terms are then normalized with
    ``normalize_nc``.

    NOTE: an earlier version also built a ``Matcher`` with POS patterns and
    ran it on every call, but its matches were never used. That dead work
    was removed; the returned terms are unchanged.

    Args:
        req: The requirement text.

    Returns:
        A set of normalized terms.
    """
    doc = nlp(req)
    noun_chunks_set = {chunk.text for chunk in doc.noun_chunks}

    composed_terms = set()
    for nc1 in noun_chunks_set:
        for nc2 in noun_chunks_set:
            for connector in (" of ", " and "):
                composed = nc1 + connector + nc2
                # Keep only compositions that literally occur in the text.
                if composed in req:
                    composed_terms.add(composed)

    return {normalize_nc(t) for t in noun_chunks_set | composed_terms}

In [11]:
def determine_sets_for_term_types(abbv_set, ordinary_terms):
    """Separate the terms that contain an abbreviation as a token.

    Args:
        abbv_set: Set of detected abbreviations.
        ordinary_terms: Set of detected terms.

    Returns:
        A tuple ``(abbv_set, terms_without_abbs, terms_by_abb)`` where
        ``terms_without_abbs`` holds the terms containing none of the
        abbreviations as a whitespace-separated token, and ``terms_by_abb``
        maps each abbreviation to the set of terms that contain it.
    """
    remaining_terms = ordinary_terms
    terms_by_abb = {}
    for abb in abbv_set:
        containing = {t for t in ordinary_terms if abb in t.split()}
        terms_by_abb[abb] = containing
        # Build a new set so the caller's ordinary_terms is never mutated.
        remaining_terms = remaining_terms - containing
    return abbv_set, remaining_terms, terms_by_abb

In [38]:
# For every requirement set: detect abbreviations and terms, print each
# (abbreviation, term) pair accepted by illod, and group the accepted terms
# per abbreviation in AEP_candidate_clusters[set_id][abbreviation].
counter = 0
AEP_candidate_clusters = {}
for id_ in data_dict:
    ordinary_terms = set()
    abbv_set = set()
    print("TUPLES FROM ReqSet: " + str(id_))
    for req in data_dict[id_]:
        ordinary_terms = ordinary_terms.union(nc_detect(req))
        abbv_set = abbv_set.union(abbv_detect(req))

    abbv_set, ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(abbv_set, ordinary_terms)

    for abv in abbv_set:
        for term in ordinary_terms:
            if illod(abv, term):
                counter += 1
                print(str(counter) + ") (" + abv + ", " + term + ")")
                # setdefault stores the term even for the first pair of a
                # requirement set; previously that first term was dropped
                # because only an empty dict was created for the set.
                AEP_candidate_clusters.setdefault(id_, {}).setdefault(abv, []).append(term)

    print("###################################################################################")

# Print the expansion candidates of every abbreviation, grouped by
# requirement set and numbered consecutively across all sets.
cluster_counter = 0
for id_, clusters in AEP_candidate_clusters.items():
    print("CLUSTERS FROM ReqSet: " + str(id_) + ":")
    for key, expansion_candidates in clusters.items():
        cluster_counter += 1
        print(f'{cluster_counter}) "{key}" : {expansion_candidates}')
    print("#####################################################")

TUPLES FROM ReqSet: 1
1) (cT., chart)
2) (cT., current time)
3) (MDI, modification of display)
4) (MDI, modification)
5) (PC, product)
###################################################################################
TUPLES FROM ReqSet: 2
6) (RT, realtor)
7) (CMA, contact information)
8) (CE, client)
9) (SR., search result)
10) (SR., seller)
###################################################################################
TUPLES FROM ReqSet: 3
11) (Dr, department)
12) (PoS., portion of system)
13) (PoS., possibility)
14) (PoS., Program Administrators)
###################################################################################
TUPLES FROM ReqSet: 4
15) (DC., documentation)
16) (DC., dispute case)
17) (DC., document)
18) (DS, dispute)
19) (DS, dispute system)
20) (DS, database)
21) (DS, Disputes System)
22) (DS, department / section)
23) (E, exception)
24) (E, example)
25) (TR, transaction)
26) (TR, transaction and industry)
27) (TR, type of retrieval)
28) (TR, type of transa

In [None]:
################################## Terms that contain undefined Abbreviations? ######################################

In [19]:
# Collect the terms and abbreviations of requirement set 5 and print the
# terms in sorted order.
ordinary_terms = set()
abbv_set = set()
for req in data_dict[5]:
    ordinary_terms = ordinary_terms | nc_detect(req)
    abbv_set = abbv_set | abbv_detect(req)
list_ = sorted(ordinary_terms)
for term in list_:
    print(term)

1 and 30 mile
1 month
10 000 concurrent user
10 second
15 second
1500 user
2 year
2 year of initial launch
30 mile
5 second
8 second
80 %
80 % of Collision Estimators
85 %
85 % of user
90 %
90 % of system
95 %
95 % of adjuster
98 %
98 % of schedule outage
98 % uptime
AR
CE
Choice part System
ChoiceParts system
Collision Estimators
Mozilla Firefox
Sarbanes - Oxley
User help
access
adjuster
adjuster and Collision Estimators
adjuster role
appearance
appearance of product
application
approximately 1:00 am
attempt use
attempt use of rP
attempt use of rP and actual use
audit
audit report
availability schedule
available online time
available part
available recycled part
available recycled part and collision estimate
available recycled part and supplier
available recycled part information
available recycled part information and supplier
average number
average number of recycled part record
blank set
blank set of rating
category
city
claim processing
collision estimate
collision estimator
colli