# 10-fold validation on 100 randomly inserted uncontrolled Abbreviations (Section 6.5)

In [1]:
import pandas as pd
import Function_Pool
import pickle
import ILLOD
import time

In [2]:
import fasttext
import fasttext.util
from scipy import spatial
# Fetch the English fastText model. `if_exists='ignore'` is passed, which
# presumably skips the download when the file is already present
# (TODO confirm against the fasttext.util documentation).
fasttext.util.download_model('en', if_exists='ignore')
# Load the model file into the module-level `ft`, which fast_text_similarity reads.
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Classify the pair (``a``, ``t``) by fastText cosine similarity.

    Both strings are embedded with ``get_sentence_vector`` of the
    module-level model ``ft``; the similarity is one minus the cosine
    distance of the two vectors.

    Args:
        a: First string (used as the abbreviation by the callers in this file).
        t: Second string (used as the candidate term by the callers).
        threshold: Minimum cosine similarity for a positive decision.

    Returns:
        True if the similarity is greater than or equal to ``threshold``,
        False otherwise.
    """
    abbreviation_vector = ft.get_sentence_vector(a)
    term_vector = ft.get_sentence_vector(t)
    cosine_similarity = 1 - spatial.distance.cosine(abbreviation_vector, term_vector)
    return bool(cosine_similarity >= threshold)



## Load Data

In [3]:
# Requirements corpus: semicolon-separated rows of (ID, dataset, requirement).
filePath = "pure_dataset.csv"
data = pd.read_csv(
    filePath,
    names=['ID', 'dataset', 'requirement'],
    sep=';',
    encoding='utf8',
)
data_list = data.values.tolist()

# Long-form / short-form pairs: semicolon-separated rows of (term, abbv).
replacement_data = pd.read_csv(
    "SF-LF-Pairs.csv",
    names=['term', 'abbv'],
    sep=';',
    encoding='utf8',
)
aeps_to_replace = replacement_data.values.tolist()

# Keep only the pairs whose abbreviation equals itself. NaN is the one value
# that fails this check, so rows without an abbreviation are presumably the
# ones dropped here (assumes pandas reads an empty `abbv` cell as NaN).
inserted_abbreviations = []
for sample in aeps_to_replace:
    if sample[1] != sample[1]:
        continue
    inserted_abbreviations.append(sample)
print(len(inserted_abbreviations))

518


## Generation of AEP Groups

In [4]:
def generate_aep_candidates_and_groups(found_abbs,
                                       ordinary_terms,
                                       terms_that_contain_abbs,
                                       aep_classifier,
                                       threshold):
    """Collect, for every abbreviation, the terms grouped with it.

    Args:
        found_abbs: Iterable of abbreviations; it is traversed twice.
        ordinary_terms: Iterable of terms that are tested with the classifier.
        terms_that_contain_abbs: Iterable of terms that are tested for
            containing an abbreviation as a whitespace-separated token.
        aep_classifier: Callable ``(abbreviation, term, threshold)`` whose
            truthy result puts ``term`` into the abbreviation's group.
        threshold: Value forwarded unchanged to ``aep_classifier``.

    Returns:
        Dict mapping each abbreviation with at least one hit to the list of
        its terms: classifier hits first, containing terms afterwards.
        Abbreviations without any hit get no entry.
    """
    groups = {}

    # Pass 1: ordinary terms accepted by the classifier.
    for abbreviation in found_abbs:
        for candidate in ordinary_terms:
            if not aep_classifier(abbreviation, candidate, threshold):
                continue
            groups.setdefault(abbreviation, []).append(candidate)

    # Pass 2: terms in which the abbreviation occurs as a separate token.
    for abbreviation in found_abbs:
        for containing_term in terms_that_contain_abbs:
            if abbreviation in containing_term.split():
                groups.setdefault(abbreviation, []).append(containing_term)

    return groups

## Calculate Performance Indicators for each AEP Detection Approach

In [5]:
def evaluate_aep_detection_approach(found_abbreviations, list_of_replacements, ordinary_terms, terms_that_contain_abbs, clf, threshold):
    """Build AEP groups with one classifier and compute its performance indicators.

    Args:
        found_abbreviations: Collection of abbreviations extracted from the
            requirements (any iterable; it is converted to a set for the
            missed-abbreviation count).
        list_of_replacements: Sequence of inserted pairs; index 0 of a pair is
            the term and index 1 the abbreviation.
        ordinary_terms: Terms passed to the classifier.
        terms_that_contain_abbs: Terms checked for containing an abbreviation.
        clf: Classifier callable ``(abbreviation, term, threshold)``.
        threshold: Value forwarded unchanged to ``clf``.

    Returns:
        A 7-tuple:
        (number of AEP groups,
         number of inserted pairs whose abbreviation has a group,
         number of inserted abbreviations missing from ``found_abbreviations``,
         number of inserted pairs whose term is in its abbreviation's group,
         average group size (``0.0`` when no group was built),
         average group size divided by the number of matched pairs
         (``float("inf")`` when nothing was matched),
         seconds spent building the groups).
    """
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    dict_for_aep_groups = generate_aep_candidates_and_groups(found_abbreviations,
                                                             ordinary_terms,
                                                             terms_that_contain_abbs,
                                                             clf,
                                                             threshold)
    duration = time.perf_counter() - start_time

    abbreviations_detected = []
    matched_pairs = []
    for aep in list_of_replacements:
        group = dict_for_aep_groups.get(aep[1])
        if group is None:
            # No group was built for this inserted abbreviation.
            continue
        abbreviations_detected.append(aep[1])
        if aep[0] in group:
            matched_pairs.append(aep)

    abbv_set_of_inserted_abbvs = {pair[1] for pair in list_of_replacements}
    # set(...) keeps the difference valid when a list or tuple is passed in.
    missed_abbreviations = abbv_set_of_inserted_abbvs - set(found_abbreviations)

    number_of_groups = len(dict_for_aep_groups)
    sum_of_lengths = sum(len(group) for group in dict_for_aep_groups.values())
    # Guard both ratios: a classifier may build no group or match no pair.
    average_size_of_AEP_group = sum_of_lengths / number_of_groups if number_of_groups else 0.0
    if matched_pairs:
        cost_effectiveness = average_size_of_AEP_group / len(matched_pairs)
    else:
        cost_effectiveness = float("inf")

    return (number_of_groups,
            len(abbreviations_detected),
            len(missed_abbreviations),
            len(matched_pairs),
            average_size_of_AEP_group,
            cost_effectiveness,
            duration)

## The following function performs a set transformation: it partitions the elements of the set $T$ into the sets $OT$ and $T \setminus OT$, according to steps (4) and (6) of Section 7.2 (Figure 3).

In [6]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    """Split the extracted terms into ordinary terms and abbreviation-bearing terms.

    Args:
        set_of_abbreviations: Collection of abbreviation strings.
        set_of_terms: Collection of indexable entries whose element at
            index 1 is the term string that gets partitioned.

    Returns:
        Tuple ``(ordinary_terms, terms_that_contain_abbreviations)`` of two
        disjoint sets. A term belongs to the second set when at least one
        abbreviation equals one of its whitespace-separated tokens.
    """
    # compliant with section 7.2: terms_that_contain_abbreviations = T \ OT
    cleaned_terms = {entry[1] for entry in set_of_terms}

    abbreviation_bearing = set()
    for candidate in cleaned_terms:
        tokens = candidate.split()
        if any(abbreviation in tokens for abbreviation in set_of_abbreviations):
            abbreviation_bearing.add(candidate)

    return cleaned_terms - abbreviation_bearing, abbreviation_bearing

## Main function to insert a given number (e.g., 100) of randomly chosen abbreviations. In each iteration, a new requirements set with uncontrolled abbreviations is generated. After extracting terms and abbreviations from the new set, we generate AEP groups with the mentioned approaches and evaluate them.

In [7]:
def one_fold_validation_with_fastText(number_of_abbreviations):
    """Run one validation iteration over all seven approaches, fastText included.

    Args:
        number_of_abbreviations: Forwarded to
            ``Function_Pool.create_uncontrolled_abbreviations_in_requirements``
            together with the module-level ``data_list`` and ``aeps_to_replace``.

    Returns:
        A list with one entry per iteration (a single one here). Each entry is
        the list of result tuples from ``evaluate_aep_detection_approach``, in
        the classifier order defined below.
    """
    # Each classifier is paired with the value handed to it as `threshold`.
    # The two illod_plus entries differ only in that value (-1 vs. -2); how
    # ILLOD / illod_plus interpret it is defined in those modules.
    classifiers_with_thresholds = [
        (fast_text_similarity, 0.29),
        (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
        (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
        (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
        (ILLOD.illod, -1),
        (Function_Pool.illod_plus, -1),
        (Function_Pool.illod_plus, -2),
    ]

    overall_results = []
    for run in range(1):
        print("iteration number: " + str(run + 1))
        uncontr_aeps_data, list_of_replacements = (
            Function_Pool.create_uncontrolled_abbreviations_in_requirements(
                data_list, aeps_to_replace, number_of_abbreviations))

        # Index 2 of each record holds the requirement text.
        requirement_texts = [req[2] for req in uncontr_aeps_data]
        terms = set()
        for text in requirement_texts:
            terms.update(Function_Pool.nc_detect(text))
        found_abbreviations = Function_Pool.extract_abbs(requirement_texts)
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
            found_abbreviations, terms)

        result_memo = []
        for position, (clf, clf_threshold) in enumerate(classifiers_with_thresholds):
            print("clf_position = " + str(position))
            indicators = evaluate_aep_detection_approach(found_abbreviations,
                                                         list_of_replacements,
                                                         ordinary_terms,
                                                         terms_that_contain_abbs,
                                                         clf,
                                                         threshold=clf_threshold)
            result_memo.append(indicators)
        overall_results.append(result_memo)
    return overall_results

In [8]:
# Single validation iteration (fastText included) with 50 as the number of
# abbreviations to insert; the nested result list is kept in `tor`.
tor = one_fold_validation_with_fastText(50)

iteration number: 1
clf_position = 0


  dist = 1.0 - uv / np.sqrt(uu * vv)


clf_position = 1
clf_position = 2
clf_position = 3
clf_position = 4
clf_position = 5
clf_position = 6


## Sum up Results for validation with fastText

In [9]:
## 1 Iteration with fastText
def calc_average_results(tor, i):
    """Average the seven indicators of approach ``i`` over all iterations.

    Args:
        tor: List of iteration results; each iteration result is a sequence
            that holds one 7-element indicator tuple per approach.
        i: Position of the approach inside every iteration result.

    Returns:
        List of seven values: the per-indicator sums divided by ``len(tor)``.
    """
    per_iteration = [iteration_result[i] for iteration_result in tor]
    number_of_iterations = len(tor)
    return [
        sum(indicators[position] for indicators in per_iteration) / number_of_iterations
        for position in range(7)
    ]

# One row per approach; the label order matches the classifier order used by
# one_fold_validation_with_fastText, so the position doubles as the index.
average_results_dict = {
    label: calc_average_results(tor, position)
    for position, label in enumerate((
        "FastText_Clf",
        "Levensthein_Clf",
        "Jaro_Winkler_Clf",
        "Dice_Coefficient_Clf",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    ))
}

# Left as a bare expression so the notebook renders the table as cell output.
pd.DataFrame.from_dict(
    average_results_dict,
    orient="index",
    columns=[
        "#AEP_groups",
        "#found_abbs",
        "#missed_abbs",
        "#matched_AEPs",
        "size_of_AEPgroup",
        "cost-effectiveness",
        "execution time",
    ],
)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
FastText_Clf,448.0,33.0,17.0,6.0,84.631696,14.105283,146.579318
Levensthein_Clf,323.0,31.0,17.0,12.0,20.575851,1.714654,13.313746
Jaro_Winkler_Clf,443.0,33.0,17.0,24.0,64.785553,2.699398,13.073903
Dice_Coefficient_Clf,416.0,32.0,17.0,12.0,33.201923,2.766827,13.579233
ILLOD,339.0,33.0,17.0,28.0,13.091445,0.467552,2.2355
ILLOD+_VariantA,337.0,33.0,17.0,29.0,10.21365,0.352195,45.212449
ILLOD+_VariantB,335.0,33.0,17.0,21.0,7.820896,0.372424,50.515371


## Perform 10-fold Validation without fastText

In [10]:

def ten_fold_validation(number_of_abbreviations):
    """Run ten validation iterations over the six approaches other than fastText.

    Every iteration creates a fresh requirements set via
    ``Function_Pool.create_uncontrolled_abbreviations_in_requirements``,
    extracts terms and abbreviations from it and evaluates each classifier.

    Args:
        number_of_abbreviations: Forwarded to
            ``Function_Pool.create_uncontrolled_abbreviations_in_requirements``
            together with the module-level ``data_list`` and ``aeps_to_replace``.

    Returns:
        A list of ten entries, one per iteration. Each entry is the list of
        result tuples from ``evaluate_aep_detection_approach``, in the
        classifier order defined below.
    """
    # Each classifier is paired with the value handed to it as `threshold`.
    # The two illod_plus entries differ only in that value (-1 vs. -2).
    classifiers_with_thresholds = [
        (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
        (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
        (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
        (ILLOD.illod, -1),
        (Function_Pool.illod_plus, -1),
        (Function_Pool.illod_plus, -2),
    ]

    overall_results = []
    for run in range(10):
        print("iteration number: " + str(run + 1))
        uncontr_aeps_data, list_of_replacements = (
            Function_Pool.create_uncontrolled_abbreviations_in_requirements(
                data_list, aeps_to_replace, number_of_abbreviations))

        # Index 2 of each record holds the requirement text.
        requirement_texts = [req[2] for req in uncontr_aeps_data]
        terms = set()
        for text in requirement_texts:
            terms.update(Function_Pool.nc_detect(text))
        found_abbreviations = Function_Pool.extract_abbs(requirement_texts)
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
            found_abbreviations, terms)

        result_memo = [
            evaluate_aep_detection_approach(found_abbreviations,
                                            list_of_replacements,
                                            ordinary_terms,
                                            terms_that_contain_abbs,
                                            clf,
                                            threshold=clf_threshold)
            for clf, clf_threshold in classifiers_with_thresholds
        ]
        overall_results.append(result_memo)
    return overall_results

In [11]:
# Ten iterations without fastText, with 50 as the number of abbreviations to
# insert; this rebinds `tor` from the earlier fastText cell.
tor = ten_fold_validation(50)

iteration number: 1
iteration number: 2
iteration number: 3
iteration number: 4
iteration number: 5
iteration number: 6
iteration number: 7
iteration number: 8
iteration number: 9
iteration number: 10


In [12]:
#import numpy as np
#tor_array = np.asarray(tor)
#print(tor_array)

## Sum up Results of 10-fold Validation

In [13]:
def calc_average_results(tor, i):
    """Average the seven indicators of approach ``i`` over all iterations.

    Args:
        tor: List of iteration results; each iteration result is a sequence
            that holds one 7-element indicator tuple per approach.
        i: Position of the approach inside every iteration result.

    Returns:
        List of seven values: the per-indicator sums divided by ``len(tor)``.
    """
    per_iteration = [iteration_result[i] for iteration_result in tor]
    number_of_iterations = len(tor)
    return [
        sum(indicators[position] for indicators in per_iteration) / number_of_iterations
        for position in range(7)
    ]

def show_average_results(tor):
    """Display the averaged indicators of the six non-fastText approaches.

    Args:
        tor: Nested result list as returned by ``ten_fold_validation``; the
            approach at position ``k`` of every iteration is shown under the
            ``k``-th row label below.

    Returns:
        None. The table is rendered through ``display``.
    """
    row_labels = (
        "Levensthein_Similarity",
        "Jaro_Winkler_Similarity",
        "Dice_Coefficient_Similarity",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    )
    column_labels = [
        "#AEP_groups",
        "#found_abbs",
        "#missed_abbs",
        "#matched_AEPs",
        "size_of_AEPgroup",
        "cost-effectiveness",
        "execution time",
    ]

    averages_by_approach = {
        label: calc_average_results(tor, position)
        for position, label in enumerate(row_labels)
    }
    df = pd.DataFrame.from_dict(averages_by_approach, orient="index", columns=column_labels)
    display(df)

In [14]:
# Render the per-approach averages of the results currently bound to `tor`.
show_average_results(tor)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,327.8,29.7,18.5,13.3,21.034361,1.699941,13.99317
Jaro_Winkler_Similarity,444.5,31.3,18.5,26.8,65.845314,2.471787,13.781164
Dice_Coefficient_Similarity,417.7,30.6,18.5,13.6,34.182596,2.714743,14.235075
ILLOD,340.0,31.3,18.5,28.0,13.490446,0.485819,2.234543
ILLOD+_VariantA,339.5,31.3,18.5,28.3,10.388843,0.370787,47.512191
ILLOD+_VariantB,337.5,31.3,18.5,19.7,7.937919,0.408637,53.89203


### Show results for the 10-fold validation with 100 abbreviations

In [15]:
# Repeat the ten-iteration validation with 100 as the number of abbreviations
# to insert; `tor` is rebound to the new results.
tor = ten_fold_validation(100)

iteration number: 1
iteration number: 2
iteration number: 3
iteration number: 4
iteration number: 5
iteration number: 6
iteration number: 7
iteration number: 8
iteration number: 9
iteration number: 10


In [16]:
# Render the per-approach averages of the results currently bound to `tor`.
show_average_results(tor)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,364.7,66.4,29.7,28.9,21.279011,0.750977,14.682589
Jaro_Winkler_Similarity,486.6,70.1,29.7,60.0,67.561655,1.133635,14.559649
Dice_Coefficient_Similarity,458.7,69.2,29.7,29.4,34.830936,1.209134,14.888212
ILLOD,379.7,70.0,29.7,63.0,14.10401,0.224799,2.419981
ILLOD+_VariantA,378.7,70.0,29.7,65.4,10.838362,0.166489,49.219254
ILLOD+_VariantB,376.7,70.0,29.7,42.9,8.151518,0.19159,61.115925
