# 10-fold validation on 100 randomly inserted uncontrolled Abbreviations (Section 6.5)

In [1]:
import pandas as pd
import Function_Pool
import pickle
import ILLOD
import time

## Load Data

In [2]:
# Requirements corpus: ';'-separated file, column labels supplied via `names`.
filePath = "pure_dataset.csv"
data = pd.read_csv(
    filePath,
    sep=';',
    encoding='utf8',
    names=['ID', 'dataset', 'requirement'],
)
data_list = data.values.tolist()

# (term, abbreviation) pairs that may be substituted into the requirements.
replacement_data = pd.read_csv(
    "SF-LF-Pairs.csv",
    sep=';',
    encoding='utf8',
    names=['term', 'abbv'],
)
aeps_to_replace = replacement_data.values.tolist()

# Keep only the pairs whose abbreviation cell is not NaN. NaN is the one value
# that compares unequal to itself, so `x == x` is False exactly for NaN.
inserted_abbreviations = [pair for pair in aeps_to_replace if pair[1] == pair[1]]
print(len(inserted_abbreviations))

518


## Generation of AEP Groups

In [3]:
def generate_aep_candidates_and_groups(found_abbs, 
                                       ordinary_terms, 
                                       terms_that_contain_abbs, 
                                       aep_classifier, 
                                       threshold):
    """Build one group of candidate terms per abbreviation.

    Args:
        found_abbs: iterable of abbreviation strings (iterated twice).
        ordinary_terms: iterable of terms tested with ``aep_classifier``.
        terms_that_contain_abbs: iterable of strings; a string joins the
            group of every abbreviation that equals one of its
            whitespace-separated tokens.
        aep_classifier: callable ``(abbreviation, term, threshold)`` whose
            truthy result means "this term belongs to the abbreviation".
        threshold: passed through unchanged to ``aep_classifier``.

    Returns:
        dict mapping each abbreviation that has at least one member to the
        list of its members: classifier hits first, token matches after.
    """
    groups = {}

    # Pass 1: terms accepted by the classifier.
    for abbreviation in found_abbs:
        for candidate in ordinary_terms:
            if aep_classifier(abbreviation, candidate, threshold):
                groups.setdefault(abbreviation, []).append(candidate)

    # Pass 2: terms that literally contain the abbreviation as a token.
    # Kept as a separate pass so these entries follow the classifier hits.
    for abbreviation in found_abbs:
        for compound in terms_that_contain_abbs:
            if abbreviation in compound.split():
                groups.setdefault(abbreviation, []).append(compound)

    return groups

## Calculate Performance Indicators for each AEP Detection Approach

In [4]:
def evaluate_aep_detection_approach(found_abbreviations, list_of_replacements, ordinary_terms, terms_that_contain_abbs, clf, threshold):
    """Run one AEP detection approach and compute its performance indicators.

    Args:
        found_abbreviations: abbreviations extracted from the requirements
            (any iterable; converted to a set for the "missed" computation).
        list_of_replacements: sequence of pairs; index 0 is compared against
            the group members and index 1 is the abbreviation used as key.
        ordinary_terms: terms handed to the classifier.
        terms_that_contain_abbs: terms matched by token containment.
        clf: classifier callable ``(abbreviation, term, threshold)``.
        threshold: threshold forwarded to ``clf``.

    Returns:
        A 7-tuple:
        ``(number of AEP groups, number of replacement pairs whose
        abbreviation has a group, number of inserted abbreviations not in
        found_abbreviations, number of pairs whose term is in the group of
        its abbreviation, average group size, cost-effectiveness,
        duration in seconds of the group generation)``.
        The average group size is ``0.0`` when no group exists, and the
        cost-effectiveness is ``float("inf")`` when no pair was matched
        (the ratio is undefined in that case).
    """
    # Only the group generation is timed; perf_counter is monotonic, so the
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    dict_for_aep_groups = generate_aep_candidates_and_groups(found_abbreviations, 
                                                             ordinary_terms, 
                                                             terms_that_contain_abbs, 
                                                             clf, 
                                                             threshold)
    duration = time.perf_counter() - start_time

    abbreviations_detected = []
    matched_pairs = []
    for aep in list_of_replacements:
        group = dict_for_aep_groups.get(aep[1])
        if group is None:
            continue
        # Counted once per replacement pair, not once per distinct abbreviation.
        abbreviations_detected.append(aep[1])
        if aep[0] in group:
            matched_pairs.append(aep)

    abbv_set_of_inserted_abbvs = {p[1] for p in list_of_replacements}
    # set(...) keeps the difference valid even if a list is passed in.
    missed_abbreviations = abbv_set_of_inserted_abbvs - set(found_abbreviations)

    number_of_groups = len(dict_for_aep_groups)
    if number_of_groups:
        sum_of_lengths = sum(len(group) for group in dict_for_aep_groups.values())
        average_size_of_AEP_group = sum_of_lengths / number_of_groups
    else:
        # No group at all: avoid dividing by zero.
        average_size_of_AEP_group = 0.0

    if matched_pairs:
        cost_effectiveness = average_size_of_AEP_group / len(matched_pairs)
    else:
        # Nothing matched: the ratio is undefined, report it as infinite.
        cost_effectiveness = float("inf")

    return (number_of_groups, 
            len(abbreviations_detected), 
            len(missed_abbreviations), 
            len(matched_pairs), 
            average_size_of_AEP_group,
            cost_effectiveness,
            duration)

## The following function performs a set transformation. It partitions the objects of the set $T$ into the sets $OT$ and $T \setminus OT$ according to steps (4) and (6) from Section 7.2 (Figure 3).

In [5]:
def determine_sets_for_term_types(set_of_abbreviations, set_of_terms):
    """Split the extracted terms by whether they contain an abbreviation.

    Args:
        set_of_abbreviations: collection of abbreviation strings.
        set_of_terms: collection of indexable entries; only index 1 of each
            entry (a string) is used.

    Returns:
        ``(ordinary_terms, terms_that_contain_abbreviations)``: two disjoint
        sets of strings. A term lands in the second set when at least one of
        its whitespace-separated tokens equals an abbreviation.
    """
    # compliant with section 7.2: terms_that_contain_abbreviations = T \ OT
    cleaned_terms = {entry[1] for entry in set_of_terms}

    def mentions_abbreviation(term):
        tokens = term.split()
        return any(abb in tokens for abb in set_of_abbreviations)

    with_abbreviations = {
        term for term in cleaned_terms if mentions_abbreviation(term)
    }
    without_abbreviations = cleaned_terms - with_abbreviations

    return without_abbreviations, with_abbreviations

## Main function to insert 100 randomly chosen abbreviations. In each iteration, a new requirements set with uncontrolled abbreviations is generated. After extracting the terms and abbreviations from the new set, we generate AEP groups with the approaches mentioned above and evaluate them.

## Perform 10-fold Validation without fastText

In [6]:

def simple_validation(number_of_abbreviations, number_of_iterations=1):
    """Insert abbreviations into the requirements and evaluate every approach.

    Reads the module-level ``data_list`` and ``aeps_to_replace``.

    Args:
        number_of_abbreviations: forwarded to
            ``Function_Pool.create_uncontrolled_abbreviations_in_requirements``.
        number_of_iterations: how many independent rounds to run. Defaults
            to 1, which is the previously hard-coded behaviour.

    Returns:
        A list with one entry per iteration. Each entry is a list holding the
        result tuple of ``evaluate_aep_detection_approach`` for each of the
        six approaches, in the order of ``approaches`` below.
    """
    # Each classifier is paired explicitly with its threshold so the two can
    # never drift apart. The negative values for ILLOD / ILLOD+ are passed on
    # unchanged; presumably they select a variant rather than act as a
    # similarity cut-off -- TODO confirm against ILLOD / Function_Pool.
    approaches = [
        (Function_Pool.levensthein_similarity_on_reduction_of_expansion, 0.52),
        (Function_Pool.jaro_winkler_similarity_on_reduction_of_expansion, 0.76),
        (Function_Pool.dice_coefficient_on_reduction_of_expansion, 0.70),
        (ILLOD.illod, -1),
        (Function_Pool.illod_plus, -1),
        (Function_Pool.illod_plus, -2),
    ]

    overall_results = []
    for i in range(number_of_iterations):
        print("iteration number: " + str(i+1))
        # NOTE(review): the unfiltered pair list is passed here, although the
        # file also builds `inserted_abbreviations` with the NaN abbreviations
        # removed -- confirm that the helper tolerates NaN entries.
        uncontr_aeps_data, list_of_replacements = Function_Pool.create_uncontrolled_abbreviations_in_requirements(data_list, aeps_to_replace, number_of_abbreviations)

        # Index 2 of each row is the text handed to the extraction helpers.
        requirement_texts = [req[2] for req in uncontr_aeps_data]

        terms = set()
        for text in requirement_texts:
            # In-place update avoids copying the growing set per requirement.
            terms.update(Function_Pool.nc_detect(text))
        found_abbreviations = Function_Pool.extract_abbs(requirement_texts)
        ordinary_terms, terms_that_contain_abbs = determine_sets_for_term_types(found_abbreviations, terms)

        result_memo = [
            evaluate_aep_detection_approach(found_abbreviations,
                                            list_of_replacements,
                                            ordinary_terms,
                                            terms_that_contain_abbs,
                                            clf,
                                            threshold=threshold)
            for clf, threshold in approaches
        ]
        overall_results.append(result_memo)
    return overall_results

In [7]:
# Run the validation with 50 requested abbreviation insertions; `tor` keeps
# the per-iteration results for the reporting cells further down.
tor = simple_validation(50)

iteration number: 1


In [8]:
#import numpy as np
#tor_array = np.asarray(tor)
#print(tor_array)

## Show results for simple Validation with 50 abbreviations

In [9]:
def calc_average_results(tor, i):
    """Average the seven indicators of approach ``i`` over all iterations.

    Args:
        tor: list of per-iteration results; ``tor[k][i]`` is an indexable
            holding at least seven numeric indicators.
        i: index of the approach inside each iteration result.

    Returns:
        list of seven values, the per-indicator mean over ``len(tor)``
        iterations.
    """
    per_iteration = [run[i] for run in tor]
    iterations = len(tor)
    return [
        sum(indicators[dim] for indicators in per_iteration) / iterations
        for dim in range(7)
    ]



In [10]:
def show_average_results(tor):
    """Display a table of the averaged indicators, one row per approach.

    Args:
        tor: per-iteration results, as consumed by ``calc_average_results``.

    The table is rendered with the notebook's ``display``; nothing is
    returned.
    """
    # Row labels, in the same order as the approaches inside each iteration.
    approach_names = (
        "Levensthein_Similarity",
        "Jaro_Winkler_Similarity",
        "Dice_Coefficient_Similarity",
        "ILLOD",
        "ILLOD+_VariantA",
        "ILLOD+_VariantB",
    )
    column_labels = [
        "#AEP_groups",
        "#found_abbs",
        "#missed_abbs",
        "#matched_AEPs",
        "size_of_AEPgroup",
        "cost-effectiveness",
        "execution time",
    ]

    averaged = {
        name: calc_average_results(tor, position)
        for position, name in enumerate(approach_names)
    }
    table = pd.DataFrame.from_dict(averaged, orient="index", columns=column_labels)
    display(table)

In [11]:
# Tabulate the averaged indicators of the run currently stored in `tor`.
show_average_results(tor)

Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,328.0,31.0,17.0,14.0,21.018293,1.501307,28.805413
Jaro_Winkler_Similarity,443.0,33.0,17.0,28.0,64.997743,2.321348,22.841941
Dice_Coefficient_Similarity,417.0,32.0,17.0,14.0,33.995204,2.428229,15.167335
ILLOD,340.0,33.0,17.0,30.0,13.085294,0.436176,2.319299
ILLOD+_VariantA,340.0,33.0,17.0,32.0,10.088235,0.315257,50.38494
ILLOD+_VariantB,338.0,33.0,17.0,19.0,7.683432,0.404391,58.310116


## Show results for simple Validation with 100 abbreviations

In [12]:
# Repeat the validation with 100 requested abbreviation insertions
# (overwrites `tor`) and tabulate the averaged indicators.
tor = simple_validation(100)
show_average_results(tor)

iteration number: 1


Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,360.0,66.0,28.0,20.0,21.691667,1.084583,31.168089
Jaro_Winkler_Similarity,484.0,72.0,28.0,61.0,66.597107,1.091756,24.570376
Dice_Coefficient_Similarity,454.0,70.0,28.0,21.0,35.004405,1.666876,16.251092
ILLOD,377.0,72.0,28.0,67.0,13.952255,0.208243,2.62044
ILLOD+_VariantA,377.0,72.0,28.0,69.0,10.655172,0.154423,60.120152
ILLOD+_VariantB,375.0,72.0,28.0,37.0,7.893333,0.213333,65.556862


## Show results for simple Validation with 200 abbreviations

In [13]:
# Repeat the validation with 200 requested abbreviation insertions
# (overwrites `tor`) and tabulate the averaged indicators.
tor = simple_validation(200)
show_average_results(tor)

iteration number: 1


Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,419.0,122.0,68.0,57.0,21.749403,0.381568,33.696921
Jaro_Winkler_Similarity,548.0,131.0,68.0,111.0,68.164234,0.614092,27.387596
Dice_Coefficient_Similarity,514.0,127.0,68.0,59.0,35.428016,0.600475,18.23841
ILLOD,436.0,131.0,68.0,114.0,14.172018,0.124316,3.276407
ILLOD+_VariantA,436.0,131.0,68.0,117.0,10.75,0.09188,61.959292
ILLOD+_VariantB,433.0,130.0,68.0,81.0,7.916859,0.097739,73.707328


## Show results for simple Validation with 400 abbreviations

In [19]:
# Repeat the validation with 400 requested abbreviation insertions
# (overwrites `tor`) and tabulate the averaged indicators.
tor = simple_validation(400)
show_average_results(tor)

iteration number: 1


Unnamed: 0,#AEP_groups,#found_abbs,#missed_abbs,#matched_AEPs,size_of_AEPgroup,cost-effectiveness,execution time
Levensthein_Similarity,535.0,246.0,138.0,93.0,21.607477,0.232338,54.021557
Jaro_Winkler_Similarity,679.0,258.0,138.0,218.0,74.159057,0.340179,40.750063
Dice_Coefficient_Similarity,644.0,253.0,138.0,96.0,35.857143,0.373512,29.011497
ILLOD,559.0,257.0,138.0,229.0,15.563506,0.067963,4.596382
ILLOD+_VariantA,557.0,257.0,138.0,232.0,11.793537,0.050834,122.858981
ILLOD+_VariantB,555.0,257.0,138.0,144.0,8.369369,0.058121,130.480213
