## Script to find an estimate for alpha (abbreviation to term ratio)

In [109]:
import spacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")
import pandas as pd
from spacy.matcher import Matcher
import re
from string import punctuation

## Load PURE Data from file:

In [110]:
pure_data = pd.read_csv('pure_data.CSV', names=["dataset", "id", "req_texts"], sep='\t', encoding='utf8')
ids = list(pure_data['id'].values)
reqs = list(pure_data['req_texts'].values)
dataset = list(pure_data['dataset'].values)

## Define set of stop words

In [111]:
stop_words = ["the", "and", "i", "for", "as", "an", "a", "if", "any", "all", "one", "on", "new", "out", "we", "to", "at", "by", "from"]

## Helper functions to extract noun chunks (NCs) and abbreviations

In [112]:
def upper_ratio(w):
    upper_cases = ''.join([c for c in w if c.isupper()])
    return len(upper_cases)/len(w)

In [113]:
def normalize_nc(nc):
    doc = nlp(nc)
    cleaned_nc = ""
    for token in doc:
        if token.pos_ != "DET":
            cleaned_nc = cleaned_nc + " " + token.lemma_
            cleaned_nc = re.sub(r"[\([{})\]]", "", cleaned_nc)
            cleaned_nc = cleaned_nc.strip()
    return cleaned_nc

Extraction of noun chunks according to [2] (Arora, Chetan, et al. "Automated extraction and clustering of requirements glossary terms." IEEE Transactions on Software Engineering 43.10 (2016): 918-945). Some Pos-Tag-Patterns are added to the NC detection to improve recall of spacy package

In [114]:
def nc_detect(req):
    noun_chunks_set = set()
    matcher = Matcher(nlp.vocab)
    pattern1 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}]
    pattern2 = [{'POS': 'PROPN'}, {'POS': 'NOUN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]
    pattern3 = [{'POS': 'NOUN'}, {'POS': 'DET'}, {'POS': 'NOUN'}]
    pattern4 = [{'POS': 'NOUN'}]
    matcher.add("TrigramNCs", [pattern1, pattern2, pattern3, pattern4])
    doc = nlp(req)
    matches = matcher(doc)
    for nc_ in doc.noun_chunks:
        noun_chunks_set.add(nc_.text)
    

    composed_terms = set()
    for nc1 in noun_chunks_set:
        for nc2 in noun_chunks_set:
            comp_term1 = nc1 + " of " + nc2
            comp_term2 = nc1 + " and " + nc2
            if comp_term1 in req:
                composed_terms.add(comp_term1)
            if comp_term2 in req:
                composed_terms.add(comp_term2)
    found_terms = noun_chunks_set.union(composed_terms)
    
    cleaned_terms = []
    for t in found_terms:
        cleaned_terms.append(normalize_nc(t))
    return set(cleaned_terms)

In [115]:
# Extraction of abbreviations according to the F1-optimized approach
def abbv_detect(sent):
    abv = set()
    for word in sent.split():
        if (len(word) <= 13 and upper_ratio(word) >= 0.29):
            if len([c for c in word if c.isupper()]) == 1 and word[0].isupper() and word.lower() in stop_words:
                continue
            abv.add(word.strip(punctuation))
    return abv

## The main function: Collect the set of NCs and the set of Abbreviations independantly and compare their length at the end

In [116]:
set_of_detected_ncs = set()
for req in reqs:
    set_of_detected_ncs = set_of_detected_ncs.union(nc_detect(req))

In [117]:
for term in list(set_of_detected_ncs)[:20]:
    print(term)

size of file
programmable parameter
way
visible indication
special MMI action
primary CDN
link assurance tone
default
reversing function
intend recipient
food
c4i system
alphanumeric description of identity
estimate of error
vcd transportation system
exposure level
number of different source
access restriction
permission
provisioning and reservation


In [118]:
print(len(set_of_detected_ncs))

3195


In [119]:
set_of_detected_abbreviations = set()
for req in reqs:
    extracted_abbreviations = abbv_detect(req)
    for abbv in extracted_abbreviations:
        cleaned_abbv = re.sub(r"[\([{})\]]", "", abbv)
        set_of_detected_abbreviations.add(cleaned_abbv)

In [120]:
for abbv in list(set_of_detected_abbreviations)[:20]:
    print(abbv)

OTH-Gold
IFF
CD-ROM
IR
CPU
DMI
LOINC
non-EIRENE
SHOULD/MUST
Of
VGCS
LOS
VHF
ADT
Epi
CDNs
DFAD
HWCIs
PAs
AVs


In [121]:
print(len(set_of_detected_abbreviations))

138


In [122]:
ratio = (len(set_of_detected_ncs) - len(set_of_detected_abbreviations))/len(set_of_detected_abbreviations)
print(ratio)

22.152173913043477
