## Script to find an estimate for alpha (abbreviation to term ratio)

In [79]:
import spacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")
import pandas as pd
from spacy.matcher import Matcher
import re

## Load PURE Data from file:

In [80]:
pure_data = pd.read_csv('pure_data.CSV', names=["dataset", "id", "req_texts"], sep='\t', encoding='utf8')
ids = list(pure_data['id'].values)
reqs = list(pure_data['req_texts'].values)
dataset = list(pure_data['dataset'].values)

## Define set of stop words

In [81]:
stop_words = ["the", "and", "i", "for", "as", "an", "a", "if", "any", "all", "one", "on", "new", "out", "we", "to", "at", "by", "from"]

## Helper functions to extract noun chunks (NCs) and abbeviations

In [82]:
def upper_ratio(w):
    upper_cases = ''.join([c for c in w if c.isupper()])
    return len(upper_cases)/len(w)

In [83]:
def normalize_nc(nc):
    doc = nlp(nc)
    cleaned_nc = ""
    for token in doc:
        if token.pos_ != "DET":
            cleaned_nc = cleaned_nc + " " + token.lemma_
            cleaned_nc = re.sub(r"[\([{})\]]", "", cleaned_nc)
            cleaned_nc = cleaned_nc.strip()
    return cleaned_nc

In [84]:
# Extraction of noun chunks according to  
# [2] (Arora, Chetan, et al. "Automated extraction and clustering of requirements glossary terms." IEEE Transactions on Software Engineering 43.10 (2016): 918-945.)
# Some Pos-Tag-Patterns are added to the NC detection to improve recall of spacy package

def nc_detect(req):
    noun_chunks_set = set()
    matcher = Matcher(nlp.vocab)
    pattern1 = [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}]
    pattern2 = [{'POS': 'PROPN'}, {'POS': 'NOUN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]
    pattern3 = [{'POS': 'NOUN'}, {'POS': 'DET'}, {'POS': 'NOUN'}]
    pattern4 = [{'POS': 'NOUN'}]
    matcher.add("TrigramNCs", [pattern1, pattern2, pattern3, pattern4])
    doc = nlp(req)
    matches = matcher(doc)
    for nc_ in doc.noun_chunks:
        noun_chunks_set.add(nc_.text)
    

    composed_terms = set()
    for nc1 in noun_chunks_set:
        for nc2 in noun_chunks_set:
            comp_term1 = nc1 + " of " + nc2
            comp_term2 = nc1 + " and " + nc2
            if comp_term1 in req:
                composed_terms.add(comp_term1)
            if comp_term2 in req:
                composed_terms.add(comp_term2)
    found_terms = noun_chunks_set.union(composed_terms)
    
    cleaned_terms = []
    for t in found_terms:
        cleaned_terms.append(normalize_nc(t))
    return set(cleaned_terms)

In [85]:
# Extraction of abbreviations according to the F1-optimized approach
def abbv_detect(sent):
    abv = set()
    for word in sent.split():
        if (len(word) <= 13 and upper_ratio(word) >= 0.29):
            if len([c for c in word if c.isupper()]) == 1 and word[0].isupper() and word.lower() in stop_words:
                continue
            abv.add(word)
    return abv

## The main function: Collect NCs and Abbreviations in sepereate sets

In [86]:
set_of_detected_ncs = set()
for req in reqs:
    set_of_detected_ncs = set_of_detected_ncs.union(nc_detect(req))

In [87]:
for term in set_of_detected_ncs:
    print(term)

size of file
programmable parameter
way
visible indication
special MMI action
primary CDN
link assurance tone
default
reversing function
intend recipient
food
c4i system
alphanumeric description of identity
estimate of error
vcd transportation system
exposure level
number of different source
access restriction
permission
provisioning and reservation
target coordinate
chief conductor conductor 1 conductor 2 catering staff
type of symptom
error estimate
permitted national speed value
carrier identifier
call establishment
payload information
PAs
possible intervention
estimate time
engine number of call originator
flow rate
railway emergency priority
operational history and statistical report
recovery
anticipate flight path
local chapter
diagnostic test
type of living thing
maximum of 5 minute
microphone
relevant information
driver convenience
eg transitory lighting of display
datum link terminal command
brake application
direct mode
automatic network selection
air tightness
new resource
o

altitude constraint
message and HCI action
on - go call charge
interactive display
protection
power
subject of case
individual report
tender - specific criterion
export
know epi - link
one RBC
incoming and outgoing tactical communication message
contraindication information
peering arrangement
link assurance signal
name of object
supervisor
resource , service and policy detail
list of staff type
patient
call group
User Information section
Countermeasure / Response Administration
environmental sample datum
readiness
same mobile
new network
detail part
operator selection of specific av
four button
mission coordination and operation
spice
national system
connection of Cab radio
time of demand
newly register driver
, laboratory
navigation method
follow major function
group voice call
road
coverage of current network
new staff
full control functionality
datum storage device
call / call user restriction of display
annotation and overlay
self test
brake intervention
document
VSP Code
TCS Disp

In [88]:
print(len(set_of_detected_ncs))

3195


In [89]:
set_of_detected_abbreviations = set()
for req in reqs:
    extracted_abbreviations = abbv_detect(req)
    for abbv in extracted_abbreviations:
        cleaned_abbv = re.sub(r"[\([{})\]]", "", abbv)
        set_of_detected_abbreviations.add(cleaned_abbv)

In [90]:
for abbv in set_of_detected_abbreviations:
    print(abbv)

CD-ROM
DMI
ADT,
VCD.
EO,
USMTF,
SHOULD/MUST
Of
LOINC,
VGCS
LOS
UHF/VHF,
ADT
Epi
CDNs
HWCIs
TCSs.
PAs
AVs
MISU
VSP,
non-EIRENE.
CTS.
PM,
Via
THEMAS
FD/L
OM
UPS,
SLAs
UHF,
NIMA
ERTMS/ETCS
No
SP,
UT
SATCOM
VSP
SATCOM.
TCS.
UO
X/Y
Web
SHOULD
ADT.
IDs
ONLY
SR
ERTMS/ETCS;
LIFO
DMI.
RBC.
EED,
IFF.
DSD
PAs.
VCD
UTC
SATellite
LO
AV's
NITF
GDT,
No.
B:
ADRG
T
MMI;
CADRG
In
BLOS,
Not
CPU,
VMIU
PWS,
UIS
TACFIRE,
IEWCOMCAT
DoD
MSIU
RBC
LT
EIRENE.
“No
CDN
DFAD,
DII-COE
Cab
AVs.
HCI
PA
OFF
VIN
SLAs.
Air
Use
PA.
ETCS
SC
LOL
Non
UAV.
Key
Fe,
CDN.
STM.
SLAs,
ID
VBS
pre-VCD
TCS
DTED.
But
SAR
OTH-Gold,
UO,
ON/OFF
BLOS
MUST
MMI
GDT
CSCIs
AV
VHF,
Tab
SNOMED,
PHIN
MAE
VSP.
WSs.
HF
ID.
RTP
MMI.
IR,
HP0001SC0001
GPS
NRT
MA
It
A:
STM,
STM
ID,
VCR
MRP
Arc
Yes
ASCII
ON
DMI,
Hz
UHF/VHF
AV.
LRP,
C4I
RS170A
PTT
CD
DTED,
EIRENE
RBC,
EEA
OM.
AV,
UO.
UAV
RAID
Foe
RS-170A
Up,
If,
ICON
EFTA


In [91]:
print(len(set_of_detected_abbreviations))

174


In [92]:
ratio = (len(set_of_detected_ncs) - len(set_of_detected_abbreviations))/len(set_of_detected_abbreviations)
print(ratio)

17.362068965517242
