# 10-fold validation on 100 randomly inserted uncontrolled Abbreviations (Section 6.5)

In [1]:
import pandas as pd
import Function_Pool
import pickle
import ILLOD
import time

In [2]:
import fasttext
import fasttext.util
from scipy import spatial
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model("cc.en.300.bin")


def fast_text_similarity(a, t, threshold):
    """Classify (a, t) as a match when their fastText sentence vectors are similar.

    Parameters:
        a: first text (the abbreviation when used as an AEP classifier).
        t: second text (the candidate term).
        threshold: minimum cosine similarity for the pair to be accepted.

    Returns:
        True if the cosine similarity of the two sentence vectors is at least
        ``threshold``, otherwise False.
    """
    a_v = ft.get_sentence_vector(a)
    t_v = ft.get_sentence_vector(t)
    # An all-zero vector has no direction: SciPy's cosine distance then divides
    # 0 by 0, emits a RuntimeWarning and yields NaN, and NaN >= threshold is
    # False. Return that same answer directly, without the warning.
    # NOTE(review): assumes get_sentence_vector returns a NumPy array (so that
    # `.any()` is available) -- confirm against the installed fastText version.
    if not a_v.any() or not t_v.any():
        return False
    cosine_similarity = 1 - spatial.distance.cosine(a_v, t_v)
    return bool(cosine_similarity >= threshold)



## Load Data

In [3]:
# Requirements corpus: one row per requirement, semicolon separated, no header row.
filePath = "pure_dataset.csv"
data = pd.read_csv(
    filePath,
    names=['ID', 'dataset', 'requirement'],
    sep=';',
    encoding='utf8',
)
data_list = data.values.tolist()

# Long-form / short-form pairs available for insertion into the requirements.
replacement_data = pd.read_csv(
    "SF-LF-Pairs.csv",
    names=['term', 'abbv'],
    sep=';',
    encoding='utf8',
)
aeps_to_replace = replacement_data.values.tolist()

# Keep only pairs whose abbreviation is present: a missing cell is read as NaN,
# and NaN is the only value that does not compare equal to itself.
inserted_abbreviations = [pair for pair in aeps_to_replace if pair[1] == pair[1]]
print(len(inserted_abbreviations))

518


## Generation of AEP Groups

In [4]:
def generate_aep_candidates_and_groups(found_abbs,
                                       ordinary_terms,
                                       terms_that_contain_abbs,
                                       aep_classifier,
                                       threshold):
    """Build, for every abbreviation, the group of terms associated with it.

    A term joins the group of abbreviation ``a`` when either

    * it is an ordinary term and ``aep_classifier(a, term, threshold)`` is
      truthy, or
    * it is one of ``terms_that_contain_abbs`` and ``a`` is one of its
      whitespace-separated tokens.

    Parameters:
        found_abbs: iterable of abbreviations.
        ordinary_terms: re-iterable collection of terms to classify.
        terms_that_contain_abbs: iterable of terms to check for a literal
            occurrence of an abbreviation.
        aep_classifier: callable ``(abbreviation, term, threshold) -> bool``.
        threshold: value forwarded unchanged to ``aep_classifier``.

    Returns:
        dict mapping each abbreviation that has at least one associated term
        to the list of those terms (classifier matches first, then literal
        occurrences). Abbreviations without any term are absent.
    """
    dict_for_aep_groups = {}
    # Both passes below walk the abbreviations; materialise them once so a
    # one-shot iterable is not silently exhausted after the first pass.
    abbreviations = list(found_abbs)

    for abbreviation in abbreviations:
        for term in ordinary_terms:
            if aep_classifier(abbreviation, term, threshold):
                dict_for_aep_groups.setdefault(abbreviation, []).append(term)

    # Tokenise every term once instead of once per (abbreviation, term) pair.
    tokenized_terms = [(term, set(term.split())) for term in terms_that_contain_abbs]
    for abbreviation in abbreviations:
        for term, tokens in tokenized_terms:
            if abbreviation in tokens:
                dict_for_aep_groups.setdefault(abbreviation, []).append(term)
    return dict_for_aep_groups

## Calculate Performance Indicators for each AEP Detection Approach

In [5]:
def evaluate_aep_detection_approach(found_abbreviations, list_of_replacements, ordinary_terms, terms_that_contain_abbs, clf, threshold):
    """Run one AEP classifier and compute its performance indicators.

    Parameters:
        found_abbreviations: collection of abbreviations extracted from the
            requirements.
        list_of_replacements: sequence of pairs; index 0 is the term and
            index 1 the abbreviation that replaced it.
        ordinary_terms: terms handed to the classifier.
        terms_that_contain_abbs: terms checked for literal abbreviation tokens.
        clf: classifier callable ``(abbreviation, term, threshold) -> bool``.
        threshold: value forwarded to ``clf``.

    Returns:
        A 7-tuple:
        (number of AEP groups,
         number of replacement pairs whose abbreviation has a group,
         number of inserted abbreviations missing from ``found_abbreviations``,
         number of replacement pairs whose term is in its abbreviation's group,
         average AEP group size (0.0 when there are no groups),
         average group size divided by the number of matched pairs
         (``inf`` when no pair was matched),
         seconds spent generating the groups).
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # durations; only group generation is timed.
    start_time = time.perf_counter()
    dict_for_aep_groups = generate_aep_candidates_and_groups(found_abbreviations,
                                                             ordinary_terms,
                                                             terms_that_contain_abbs,
                                                             clf,
                                                             threshold)
    duration = time.perf_counter() - start_time

    abbreviations_detected = []
    matched_pairs = []
    for aep in list_of_replacements:
        group = dict_for_aep_groups.get(aep[1])
        if group is None:
            continue
        abbreviations_detected.append(aep[1])
        if aep[0] in group:
            matched_pairs.append(aep)

    # set(...) makes the difference work for any collection, not only a set.
    abbv_set_of_inserted_abbvs = {pair[1] for pair in list_of_replacements}
    missed_abbreviations = abbv_set_of_inserted_abbvs - set(found_abbreviations)

    # Guard both ratios: a classifier that accepts nothing yields no groups and
    # no matches, which previously raised ZeroDivisionError.
    if dict_for_aep_groups:
        total_group_size = sum(len(group) for group in dict_for_aep_groups.values())
        average_size_of_AEP_group = total_group_size / len(dict_for_aep_groups)
    else:
        average_size_of_AEP_group = 0.0
    if matched_pairs:
        cost_effectiveness = average_size_of_AEP_group / len(matched_pairs)
    else:
        cost_effectiveness = float("inf")

    return (len(dict_for_aep_groups),
            len(abbreviations_detected),
            len(missed_abbreviations),
            len(matched_pairs),
            average_size_of_AEP_group,
            cost_effectiveness,
            duration)

## The following function performs a set transformation: it partitions the objects of the set  𝑇  into the sets  𝑂𝑇  and  𝑇∖𝑂𝑇, according to steps (4) and (6) of Section 7.2 (Figure 3).

In [6]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    """Split the extracted terms into OT and T \\ OT (compliant with section 7.2).

    Parameters:
        set_of_abbreviations: collection of abbreviation strings.
        set_of_terms: collection of tuples whose element at index 1 is the
            cleaned term string.

    Returns:
        ``(ordinary_terms, terms_that_contain_abbreviations)`` as two sets of
        cleaned term strings. A term belongs to the second set when at least
        one abbreviation equals one of its whitespace-separated tokens.
    """
    cleaned_terms = {term_tuple[1] for term_tuple in set_of_terms}

    # terms_that_contain_abbreviations = T \ OT
    containing_abbreviation = {
        cleaned
        for cleaned in cleaned_terms
        if any(abb in cleaned.split() for abb in set_of_abbreviations)
    }

    return cleaned_terms - containing_abbreviation, containing_abbreviation

## Main function to insert 100 randomly chosen abbreviations. In each iteration, a new requirements set with uncontrolled abbreviations is generated. After extracting terms and abbreviations from the new set, we generate AEP groups with the mentioned approaches and evaluate them.

In [7]:
def one_fold_validation_with_fastText(number_of_abbreviations):
    """Run a single validation iteration over all seven classifiers, fastText included.

    Parameters:
        number_of_abbreviations: forwarded to
            ``Function_Pool.create_uncontrolled_abbreviations_in_requirements``.

    Returns:
        A list with one entry per iteration (here: one). Each entry is a list
        holding the result tuple of ``evaluate_aep_detection_approach`` for
        every classifier, in the order the classifiers are listed below.
    """
    overall_results = []
    for fold in range(1):
        print("iteration number: " + str(fold + 1))
        # NOTE(review): the unfiltered `aeps_to_replace` is passed here, not the
        # NaN-filtered `inserted_abbreviations` -- confirm this is intended.
        uncontr_aeps_data, list_of_replacements = (
            Function_Pool.create_uncontrolled_abbreviations_in_requirements(
                data_list, aeps_to_replace, number_of_abbreviations))

        # Index 2 of every row is the requirement text.
        terms = set()
        for req in uncontr_aeps_data:
            terms.update(Function_Pool.nc_detect(req[2]))
        found_abbreviations = Function_Pool.extract_abbs(
            [req[2] for req in uncontr_aeps_data])
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
            found_abbreviations, terms)

        # Each classifier is paired with the threshold it is evaluated at.
        # illod_plus appears twice with different threshold values; what -1 and
        # -2 select is defined in Function_Pool (not visible here).
        classifiers_with_thresholds = [
            (fast_text_similarity, 0.29),
            (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
            (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
            (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
            (ILLOD.illod, -1),
            (Function_Pool.illod_plus, -1),
            (Function_Pool.illod_plus, -2),
        ]

        result_memo = []
        for position, (clf, clf_threshold) in enumerate(classifiers_with_thresholds):
            print("clf_position = " + str(position))
            indicators = evaluate_aep_detection_approach(found_abbreviations,
                                                         list_of_replacements,
                                                         ordinary_terms,
                                                         terms_that_contain_abbs,
                                                         clf,
                                                         threshold=clf_threshold)
            result_memo.append(indicators)
        overall_results.append(result_memo)
    return overall_results

In [8]:
# Run the single-iteration comparison (fastText included) with 100 inserted
# abbreviations; `tor` holds one list of per-classifier result tuples per iteration.
tor = one_fold_validation_with_fastText(100)

iteration number: 1
clf_position = 0


  dist = 1.0 - uv / np.sqrt(uu * vv)


clf_position = 1
clf_position = 2
clf_position = 3
clf_position = 4
clf_position = 5
clf_position = 6


## Sum up Results for validation with fastText

In [9]:
import numpy as np
## 1 Iteration with fastText
 
def calc_average_results(tor, i, n_dims=7):
    """Average one classifier's indicators over all iterations.

    Parameters:
        tor: list of iterations; each iteration is a list of per-classifier
            result tuples.
        i: position of the classifier inside each iteration's list.
        n_dims: number of leading indicators to aggregate from each result
            tuple (defaults to the 7 values produced per classifier).

    Returns:
        A flat list of length ``2 * n_dims``: for every indicator its mean
        over the iterations followed by its standard deviation as computed by
        ``np.std`` (NumPy's default, i.e. the population standard deviation).

    Raises:
        ZeroDivisionError: if ``tor`` is empty.
    """
    spec_results = [iter_result[i] for iter_result in tor]
    averaged = []
    for dim in range(n_dims):
        # All iterations' values for this indicator.
        column = [iteration_result[dim] for iteration_result in spec_results]
        averaged.append(sum(column) / len(tor))
        averaged.append(np.std(column))
    return averaged
  
import pandas as pd

def show_average_results(tor):
    """Display a table of averaged indicators for the seven-classifier run.

    Parameters:
        tor: list of iterations as produced by the validation run that
            includes fastText; the classifier at position ``k`` of every
            iteration is labelled with the ``k``-th name below.

    The table has one row per classifier and, for every indicator, a mean
    column followed by a standard-deviation column. Nothing is returned; the
    frame is rendered with ``display``.
    """
    classifier_labels = [
        "FastText_Clf",
        "Levensthein_Clf",
        "Jaro_Winkler_Clf",
        "Dice_Coefficient_Clf",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    ]
    average_results_dict = {
        label: calc_average_results(tor, position)
        for position, label in enumerate(classifier_labels)
    }

    column_names = [
        "#AEP_groups", "std-AEP_groups",
        "#found_abbs", "std-found_Abbs",
        "#missed_abbs", "std-missed_abbs",
        "#matched_AEPs", "std-matched_abbs",
        "av.size_of_AEPgroup", "std-size_of_AEP_group",
        "cost_effectiveness", "std-cost_effectiveness",
        "execution time", "std-execution_time",
    ]
    df = pd.DataFrame.from_dict(average_results_dict, orient="index", columns=column_names)
    display(df)


In [10]:
# Render the averaged-indicator table for the results currently bound to `tor`.
show_average_results(tor)

Unnamed: 0,#AEP_groups,std-AEP_groups,#found_abbs,std-found_Abbs,#missed_abbs,std-missed_abbs,#matched_AEPs,std-matched_abbs,av.size_of_AEPgroup,std-size_of_AEP_group,cost_effectiveness,std-cost_effectiveness,execution time,std-execution_time
FastText_Clf,492.0,0.0,66.0,0.0,33.0,0.0,12.0,0.0,84.105691,0.0,7.008808,0.0,163.157019,0.0
Levensthein_Clf,359.0,0.0,60.0,0.0,33.0,0.0,24.0,0.0,20.771588,0.0,0.865483,0.0,14.093887,0.0
Jaro_Winkler_Clf,486.0,0.0,66.0,0.0,33.0,0.0,60.0,0.0,67.300412,0.0,1.121674,0.0,13.706525,0.0
Dice_Coefficient_Clf,453.0,0.0,63.0,0.0,33.0,0.0,26.0,0.0,34.315673,0.0,1.319834,0.0,14.465711,0.0
ILLOD,377.0,0.0,66.0,0.0,33.0,0.0,64.0,0.0,13.702918,0.0,0.214108,0.0,2.314248,0.0
ILLOD+_VariantA,377.0,0.0,66.0,0.0,33.0,0.0,63.0,0.0,10.435013,0.0,0.165635,0.0,49.027487,0.0
ILLOD+_VariantB,375.0,0.0,66.0,0.0,33.0,0.0,40.0,0.0,7.752,0.0,0.1938,0.0,57.375843,0.0


## Perform 10-fold Validation without fastText

In [11]:

def ten_fold_validation(number_of_abbreviations):
    """Run ten validation iterations over the six classifiers other than fastText.

    Parameters:
        number_of_abbreviations: forwarded to
            ``Function_Pool.create_uncontrolled_abbreviations_in_requirements``.

    Returns:
        A list with one entry per iteration (ten in total). Each entry is a
        list holding the result tuple of ``evaluate_aep_detection_approach``
        for every classifier, in the order the classifiers are listed below.
    """
    overall_results = []
    for fold in range(10):
        print("iteration number: " + str(fold + 1))
        # NOTE(review): the unfiltered `aeps_to_replace` is passed here, not the
        # NaN-filtered `inserted_abbreviations` -- confirm this is intended.
        uncontr_aeps_data, list_of_replacements = (
            Function_Pool.create_uncontrolled_abbreviations_in_requirements(
                data_list, aeps_to_replace, number_of_abbreviations))

        # Index 2 of every row is the requirement text.
        terms = set()
        for req in uncontr_aeps_data:
            terms.update(Function_Pool.nc_detect(req[2]))
        found_abbreviations = Function_Pool.extract_abbs(
            [req[2] for req in uncontr_aeps_data])
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(
            found_abbreviations, terms)

        # Each classifier is paired with the threshold it is evaluated at.
        # illod_plus appears twice with different threshold values; what -1 and
        # -2 select is defined in Function_Pool (not visible here).
        classifiers_with_thresholds = [
            (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
            (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
            (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
            (ILLOD.illod, -1),
            (Function_Pool.illod_plus, -1),
            (Function_Pool.illod_plus, -2),
        ]

        result_memo = [
            evaluate_aep_detection_approach(found_abbreviations,
                                            list_of_replacements,
                                            ordinary_terms,
                                            terms_that_contain_abbs,
                                            clf,
                                            threshold=clf_threshold)
            for clf, clf_threshold in classifiers_with_thresholds
        ]
        overall_results.append(result_memo)
    return overall_results

In [15]:
# Run the ten-iteration comparison (without fastText) with 100 inserted
# abbreviations. NOTE: this rebinds `tor`, replacing the results of the
# earlier single-iteration fastText run.
tor = ten_fold_validation(100)

iteration number: 1
iteration number: 2
iteration number: 3
iteration number: 4
iteration number: 5
iteration number: 6
iteration number: 7
iteration number: 8
iteration number: 9
iteration number: 10


In [20]:
import pandas as pd
# Averaged indicators of the ten-iteration run: the classifier at position k of
# every iteration is labelled with the k-th name in the list below.
average_results_dict = {
    label: calc_average_results(tor, position)
    for position, label in enumerate([
        "Levensthein_Clf",
        "Jaro_Winkler_Clf",
        "Dice_Coefficient_Clf",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    ])
}

# One row per classifier; each indicator contributes a mean column followed by
# a standard-deviation column. Kept as the last expression so the notebook
# renders the frame.
pd.DataFrame.from_dict(
    average_results_dict,
    orient="index",
    columns=[
        "#AEP_groups", "std-AEP_groups",
        "#found_abbs", "std-found_Abbs",
        "#missed_abbs", "std-missed_abbs",
        "#matched_AEPs", "std-matched_abbs",
        "av.size_of_AEPgroup", "std-size_of_AEP_group",
        "cost_effectiveness", "std-cost_effectiveness",
        "execution time", "std-execution_time",
    ],
)


Unnamed: 0,#AEP_groups,std-AEP_groups,#found_abbs,std-found_Abbs,#missed_abbs,std-missed_abbs,#matched_AEPs,std-matched_abbs,av.size_of_AEPgroup,std-size_of_AEP_group,cost_effectiveness,std-cost_effectiveness,execution time,std-execution_time
Levensthein_Clf,360.6,4.103657,62.7,4.754997,33.5,5.024938,26.0,3.974921,21.401129,0.598445,0.84262,0.130501,15.5685,0.775639
Jaro_Winkler_Clf,482.5,5.818075,66.0,5.09902,33.5,5.024938,55.8,4.935585,67.235505,0.745144,1.21474,0.111303,15.429949,0.669261
Dice_Coefficient_Clf,454.1,4.459821,64.9,5.185557,33.5,5.024938,26.5,3.827532,34.882164,0.825526,1.342311,0.183254,16.016431,0.719111
ILLOD,375.2,6.177378,65.7,5.459853,33.5,5.024938,58.2,5.134199,14.068732,0.421079,0.243636,0.02241,2.691697,0.224035
ILLOD+_VariantA,374.0,6.276942,65.7,5.459853,33.5,5.024938,60.8,5.362835,10.729617,0.274246,0.177869,0.0163,58.665318,5.643475
ILLOD+_VariantB,371.9,6.155485,65.7,5.459853,33.5,5.024938,39.2,4.749737,8.016057,0.132522,0.207529,0.025396,68.579418,6.665913
