# 10-fold validation on 100 randomly inserted uncontrolled Abbreviations (Section 6.5)

In [5]:
import pandas as pd
import Function_Pool
import pickle
import ILLOD
import time

In [6]:
import fasttext
import fasttext.util
from scipy import spatial
# Fetch the pre-trained English fastText model. NOTE(review): if_exists='ignore'
# is expected to skip the download when the file is already present - confirm
# against the fasttext.util documentation.
fasttext.util.download_model('en', if_exists='ignore')
# Load the model once at module level; fast_text_similarity reads it as `ft`.
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Decide whether two strings are similar according to fastText vectors.

    Both strings are embedded with the module-level model ``ft`` via
    ``get_sentence_vector`` and the cosine similarity of the two vectors is
    compared with ``threshold``.

    Args:
        a: first string (the abbreviation when used as an AEP classifier).
        t: second string (the term when used as an AEP classifier).
        threshold: minimum cosine similarity (inclusive) for a positive result.

    Returns:
        True if the cosine similarity is at least ``threshold``, else False.
    """
    vector_a = ft.get_sentence_vector(a)
    vector_t = ft.get_sentence_vector(t)
    # scipy reports the cosine *distance*; the similarity is its complement.
    cosine_similarity = 1 - spatial.distance.cosine(vector_a, vector_t)
    return bool(cosine_similarity >= threshold)



## Load Data

In [7]:
# Requirements corpus: one row per requirement (ID, dataset, requirement text).
filePath = "pure_dataset.csv"
data = pd.read_csv(
    filePath,
    names=['ID', 'dataset', 'requirement'],
    sep=';',
    encoding='utf8',
)
data_list = data.values.tolist()

# Term / abbreviation pairs that can be substituted into the requirements.
replacement_data = pd.read_csv(
    "SF-LF-Pairs.csv",
    names=['term', 'abbv'],
    sep=';',
    encoding='utf8',
)
aeps_to_replace = replacement_data.values.tolist()

# Keep only the pairs that actually have an abbreviation. The self-comparison
# is a NaN test: NaN is the only value that is not equal to itself.
inserted_abbreviations = [pair for pair in aeps_to_replace if pair[1] == pair[1]]
print(len(inserted_abbreviations))

518


## Generation of AEP Groups

In [8]:
def generate_aep_candidates_and_groups(found_abbs,
                                       ordinary_terms,
                                       terms_that_contain_abbs,
                                       aep_classifier,
                                       threshold):
    """Build one AEP group (a list of candidate terms) per abbreviation.

    Args:
        found_abbs: abbreviations to build groups for; iterated twice.
        ordinary_terms: terms that are tested against every abbreviation with
            ``aep_classifier``.
        terms_that_contain_abbs: terms that are added to the group of every
            abbreviation occurring among their whitespace-separated tokens.
        aep_classifier: callable ``(abbreviation, term, threshold)`` whose
            truthy result accepts the term as a candidate expansion.
        threshold: passed through unchanged to ``aep_classifier``.

    Returns:
        dict mapping each abbreviation that received at least one term to the
        list of its terms: classifier matches first, containing terms after.
        Abbreviations without any term do not appear as keys.
    """
    groups = {}

    # Pass 1: candidate expansions proposed by the classifier.
    for abbreviation in found_abbs:
        for term in ordinary_terms:
            if aep_classifier(abbreviation, term, threshold):
                groups.setdefault(abbreviation, []).append(term)

    # Pass 2: terms in which the abbreviation occurs as a whole token.
    for abbreviation in found_abbs:
        for term in terms_that_contain_abbs:
            if abbreviation in term.split():
                groups.setdefault(abbreviation, []).append(term)

    return groups

## Calculate Performance Indicators for each AEP Detection Approach

In [9]:
def evaluate_aep_detection_approach(found_abbreviations, list_of_replacements, ordinary_terms, terms_that_contain_abbs, clf, threshold):
    """Run one AEP detection approach and compute its performance indicators.

    Args:
        found_abbreviations: abbreviations extracted from the requirements;
            a group is built for each of them.
        list_of_replacements: the pairs that were inserted into the
            requirements; index 0 is the term, index 1 the abbreviation.
        ordinary_terms: terms without abbreviations, tested with ``clf``.
        terms_that_contain_abbs: terms that contain an abbreviation as token.
        clf: classifier ``(abbreviation, term, threshold)`` used for grouping.
        threshold: passed through unchanged to ``clf``.

    Returns:
        A 7-tuple:
        (number of AEP groups,
         number of inserted pairs whose abbreviation received a group,
         number of inserted abbreviations missing from found_abbreviations,
         number of inserted pairs whose term is in the abbreviation's group,
         average AEP group size (0.0 if there is no group),
         cost-effectiveness = average group size / matched pairs
             (``float("inf")`` if no pair was matched),
         duration of the group generation in seconds).
    """
    # Only the group generation is timed; perf_counter is the monotonic clock
    # intended for measuring short durations.
    start_time = time.perf_counter()
    dict_for_aep_groups = generate_aep_candidates_and_groups(found_abbreviations,
                                                             ordinary_terms,
                                                             terms_that_contain_abbs,
                                                             clf,
                                                             threshold)
    duration = time.perf_counter() - start_time

    abbreviations_detected = []
    matched_pairs = []
    for aep in list_of_replacements:
        group = dict_for_aep_groups.get(aep[1])
        if group is None:
            continue
        abbreviations_detected.append(aep[1])
        if aep[0] in group:
            matched_pairs.append(aep)

    abbv_set_of_inserted_abbvs = {pair[1] for pair in list_of_replacements}
    # set() makes the difference work for any iterable of abbreviations.
    missed_abbreviations = abbv_set_of_inserted_abbvs - set(found_abbreviations)

    number_of_groups = len(dict_for_aep_groups)
    sum_of_lengths = sum(len(group) for group in dict_for_aep_groups.values())
    # Guard both ratios: a classifier may produce no group at all, or groups
    # that contain none of the inserted pairs.
    if number_of_groups:
        average_size_of_AEP_group = sum_of_lengths / number_of_groups
    else:
        average_size_of_AEP_group = 0.0
    if matched_pairs:
        cost_effectiveness = average_size_of_AEP_group / len(matched_pairs)
    else:
        cost_effectiveness = float("inf")

    return (number_of_groups,
            len(abbreviations_detected),
            len(missed_abbreviations),
            len(matched_pairs),
            average_size_of_AEP_group,
            cost_effectiveness,
            duration)

## The following function performs a set transformation: it partitions the elements of the set $T$ into the sets $OT$ and $T \setminus OT$, according to steps (4) and (6) of Section 7.2 (Figure 3).

In [10]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    """Split the extracted terms into ordinary and abbreviation-bearing terms.

    Args:
        set_of_abbreviations: abbreviations to look for.
        set_of_terms: collection of tuples whose element at index 1 is the
            term string; the other elements are ignored.

    Returns:
        Tuple ``(ordinary_terms, terms_that_contain_abbreviations)`` of two
        disjoint sets of term strings. A term belongs to the second set when
        at least one abbreviation equals one of its whitespace-separated
        tokens.
    """
    cleaned_terms = {entry[1] for entry in set_of_terms}

    # compliant with section 7.2: terms_that_contain_abbreviations = T \ OT
    terms_that_contain_abbreviations = {
        term
        for term in cleaned_terms
        if any(abb in term.split() for abb in set_of_abbreviations)
    }
    ordinary_terms = cleaned_terms - terms_that_contain_abbreviations

    return ordinary_terms, terms_that_contain_abbreviations

## Main function to insert a given number (e.g. 100) of randomly chosen abbreviations. In each iteration, a new requirements set with uncontrolled abbreviations is generated. After extracting terms and abbreviations from the new set, we generate AEP groups with the approaches mentioned above and evaluate them.

In [11]:
def one_fold_validation_with_fastText(number_of_abbreviations):
    """Run a single validation iteration over all approaches, fastText included.

    A requirements set with ``number_of_abbreviations`` inserted abbreviations
    is generated, terms and abbreviations are extracted from it, and every
    classifier is evaluated on the result.

    Args:
        number_of_abbreviations: how many abbreviations to insert.

    Returns:
        List with one entry per iteration (here exactly one); each entry is the
        list of indicator tuples returned by evaluate_aep_detection_approach,
        in the order of the classifiers below.
    """
    # Classifier / threshold pairs in reporting order. The two illod_plus
    # entries differ only in their threshold argument.
    # NOTE(review): presumably that value selects the ILLOD+ variant - confirm
    # in Function_Pool.
    approaches = [
        (fast_text_similarity, 0.29),
        (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
        (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
        (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
        (ILLOD.illod, -1),
        (Function_Pool.illod_plus, -1),
        (Function_Pool.illod_plus, -2),
    ]

    overall_results = []
    for iteration in range(1):
        print(f"iteration number: {iteration + 1}")
        uncontr_aeps_data, list_of_replacements = (
            Function_Pool.create_uncontrolled_abbreviations_in_requirements(
                data_list, aeps_to_replace, number_of_abbreviations))

        # Index 2 of every row holds the requirement text.
        terms = set()
        for requirement in uncontr_aeps_data:
            terms = terms.union(Function_Pool.nc_detect(requirement[2]))
        found_abbreviations = Function_Pool.extract_abbs(
            [requirement[2] for requirement in uncontr_aeps_data])
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
            found_abbreviations, terms)

        result_memo = []
        for position, (classifier, classifier_threshold) in enumerate(approaches):
            print(f"clf_position = {position}")
            indicators = evaluate_aep_detection_approach(
                found_abbreviations,
                list_of_replacements,
                ordinary_terms,
                terms_that_contain_abbs,
                classifier,
                threshold=classifier_threshold,
            )
            result_memo.append(indicators)
        overall_results.append(result_memo)
    return overall_results

In [12]:
# Single iteration over all approaches (fastText included) with 50 inserted abbreviations.
tor = one_fold_validation_with_fastText(50)

iteration number: 1
clf_position = 0


  dist = 1.0 - uv / np.sqrt(uu * vv)


clf_position = 1
clf_position = 2
clf_position = 3
clf_position = 4
clf_position = 5
clf_position = 6


## Sum up Results for validation with fastText

In [13]:
## 1 Iteration with fastText
def calc_average_results(tor, i):
    """Average the seven indicators of one approach over all iterations.

    Args:
        tor: list of iteration results; each iteration result is a sequence
            holding one indicator tuple per approach.
        i: position of the approach inside every iteration result.

    Returns:
        List of seven values: the mean of each of the first seven indicators
        of approach ``i`` over all iterations in ``tor``.
    """
    iteration_count = len(tor)
    indicators_per_iteration = [iteration[i] for iteration in tor]

    averages = []
    for dim in range(7):
        total = 0
        for indicators in indicators_per_iteration:
            total += indicators[dim]
        averages.append(total / iteration_count)
    return averages

# One row per approach; the label order matches the classifier order used in
# one_fold_validation_with_fastText.
average_results_dict = {
    label: calc_average_results(tor, position)
    for position, label in enumerate((
        "FastText_Clf",
        "Levensthein_Clf",
        "Jaro_Winkler_Clf",
        "Dice_Coefficient_Clf",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    ))
}

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame.from_dict(
    average_results_dict,
    orient="index",
    columns=[
        "#AEP_groups",
        "#found_abbs",
        "#missed_abbs",
        "#matched_AEPs",
        "size_of_AEPgroup",
        "cost-effectiveness",
        "execution time",
    ],
)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
FastText_Clf,485.0,35.0,15.0,4.0,98.101031,24.525258,95.802927
Levensthein_Clf,354.0,32.0,15.0,10.0,20.474576,2.047458,10.519334
Jaro_Winkler_Clf,477.0,35.0,15.0,32.0,63.920335,1.99751,10.419696
Dice_Coefficient_Clf,444.0,35.0,15.0,10.0,31.655405,3.165541,11.293453
ILLOD,364.0,35.0,15.0,31.0,15.876374,0.512141,1.811064
ILLOD+_VariantA,363.0,35.0,15.0,32.0,12.534435,0.391701,47.483333
ILLOD+_VariantB,360.0,35.0,15.0,18.0,10.033333,0.557407,50.825526


## Perform 10-fold Validation without fastText

In [14]:

def ten_fold_validation(number_of_abbreviations):
    """Run ten validation iterations over all approaches except fastText.

    In every iteration a new requirements set with
    ``number_of_abbreviations`` inserted abbreviations is generated, terms and
    abbreviations are extracted from it, and every classifier is evaluated on
    the result.

    Args:
        number_of_abbreviations: how many abbreviations to insert per iteration.

    Returns:
        List with ten entries, one per iteration; each entry is the list of
        indicator tuples returned by evaluate_aep_detection_approach, in the
        order of the classifiers below.
    """
    # Classifier / threshold pairs in reporting order. The two illod_plus
    # entries differ only in their threshold argument.
    # NOTE(review): presumably that value selects the ILLOD+ variant - confirm
    # in Function_Pool.
    approaches = [
        (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
        (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
        (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
        (ILLOD.illod, -1),
        (Function_Pool.illod_plus, -1),
        (Function_Pool.illod_plus, -2),
    ]

    overall_results = []
    for iteration in range(10):
        print(f"iteration number: {iteration + 1}")
        uncontr_aeps_data, list_of_replacements = (
            Function_Pool.create_uncontrolled_abbreviations_in_requirements(
                data_list, aeps_to_replace, number_of_abbreviations))

        # Index 2 of every row holds the requirement text.
        terms = set()
        for requirement in uncontr_aeps_data:
            terms = terms.union(Function_Pool.nc_detect(requirement[2]))
        found_abbreviations = Function_Pool.extract_abbs(
            [requirement[2] for requirement in uncontr_aeps_data])
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
            found_abbreviations, terms)

        result_memo = []
        for classifier, classifier_threshold in approaches:
            indicators = evaluate_aep_detection_approach(
                found_abbreviations,
                list_of_replacements,
                ordinary_terms,
                terms_that_contain_abbs,
                classifier,
                threshold=classifier_threshold,
            )
            result_memo.append(indicators)
        overall_results.append(result_memo)
    return overall_results

In [15]:
# Ten iterations without fastText, 50 inserted abbreviations each; rebinds `tor`.
tor = ten_fold_validation(50)

iteration number: 1
iteration number: 2
iteration number: 3
iteration number: 4
iteration number: 5
iteration number: 6
iteration number: 7
iteration number: 8
iteration number: 9
iteration number: 10


In [12]:
#import numpy as np
#tor_array = np.asarray(tor)
#print(tor_array)

## Sum up Results of 10-fold Validation

In [16]:
def calc_average_results(tor, i):
    """Average the seven indicators of one approach over all iterations.

    Args:
        tor: list of iteration results; each iteration result is a sequence
            holding one indicator tuple per approach.
        i: position of the approach inside every iteration result.

    Returns:
        List of seven values: the mean of each of the first seven indicators
        of approach ``i`` over all iterations in ``tor``.
    """
    iteration_count = len(tor)
    indicators_per_iteration = [iteration[i] for iteration in tor]

    averages = []
    for dim in range(7):
        total = 0
        for indicators in indicators_per_iteration:
            total += indicators[dim]
        averages.append(total / iteration_count)
    return averages

def show_average_results(tor):
    """Display a table of iteration-averaged indicators, one row per approach.

    Args:
        tor: list of iteration results as returned by ten_fold_validation
            (six approaches per iteration).

    Returns:
        None; the table is rendered with ``display``.
    """
    # Row order matches the classifier order used in ten_fold_validation.
    row_labels = (
        "Levensthein_Similarity",
        "Jaro_Winkler_Similarity",
        "Dice_Coefficient_Similarity",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    )
    column_labels = [
        "#AEP_groups",
        "#found_abbs",
        "#missed_abbs",
        "#matched_AEPs",
        "size_of_AEPgroup",
        "cost-effectiveness",
        "execution time",
    ]

    average_results_dict = {
        label: calc_average_results(tor, position)
        for position, label in enumerate(row_labels)
    }
    df = pd.DataFrame.from_dict(average_results_dict, orient="index", columns=column_labels)
    display(df)

In [17]:
# Averages over the ten iterations run with 50 inserted abbreviations.
show_average_results(tor)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,354.3,33.0,14.9,11.4,20.309908,1.875205,10.875675
Jaro_Winkler_Similarity,476.8,35.0,14.9,29.1,64.133598,2.23857,11.102193
Dice_Coefficient_Similarity,441.6,34.1,14.9,11.7,31.421843,2.833422,11.198013
ILLOD,363.2,34.9,14.9,30.0,15.57166,0.526609,1.79304
ILLOD+_VariantA,363.4,34.9,14.9,30.4,12.323487,0.411932,42.278919
ILLOD+_VariantB,360.2,34.8,14.9,18.0,9.905376,0.566452,48.129621


### Show results of the 10-fold validation with 100 abbreviations

In [18]:
# Ten iterations without fastText, 100 inserted abbreviations each; rebinds `tor`.
tor = ten_fold_validation(100)

iteration number: 1
iteration number: 2
iteration number: 3
iteration number: 4
iteration number: 5
iteration number: 6
iteration number: 7
iteration number: 8
iteration number: 9
iteration number: 10


In [19]:
# Averages over the ten iterations run with 100 inserted abbreviations.
show_average_results(tor)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,386.6,63.5,31.1,19.8,20.36859,1.0568,11.282742
Jaro_Winkler_Similarity,515.2,68.7,31.1,53.7,65.535067,1.226149,10.991291
Dice_Coefficient_Similarity,477.9,66.8,31.1,20.4,31.880091,1.60382,11.635065
ILLOD,399.4,68.7,31.1,56.6,15.748048,0.278829,1.859186
ILLOD+_VariantA,398.7,68.7,31.1,57.5,12.356998,0.215619,45.489371
ILLOD+_VariantB,395.2,68.4,31.1,35.8,9.749068,0.274223,54.154928
