In [31]:
import pandas as pd
import spacy
from spacy import displacy
import numpy as np
import torch

# LOADING THE SPACY MODEL
nlp = spacy.load("en_core_web_trf")

# LOADING THE GLOSSARY
# One glossary term per line. The context manager closes the file handle as
# soon as the terms are read (the handle used to stay open for the whole session).
with open("./Glossary/NISTIR 7298 Rev3.txt", "r") as f:
    glossary = [x.replace("\n", "") for x in f]

# LOADING SECBERT
from transformers import pipeline

# A single fill-mask pipeline supplies both the tokenizer and the model that
# the later cells reuse.
fill_mask_secbert = pipeline(
    "fill-mask",
    model="jackaduma/SecBERT",
    tokenizer="jackaduma/SecBERT"
)
tokenizer = fill_mask_secbert.tokenizer
secbert = fill_mask_secbert.model
# Prefer the GPU when one is available; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print("Libraries and models uploaded!")

Libraries and models uploaded!


In [32]:
# Sample requirement sentence used as input by the demonstration cells.
example = "The system and XQuery and system shall provide the ability to capture and display temperature, weight and height in both metric and English units."

In [33]:
# Visualizing ENTITIES, DEPENDENCIES and POS_TAGS
doc = nlp(example)
# Render the named-entity view first, then the dependency-parse view.
for view in ('ent', 'dep'):
    displacy.render(doc, style=view, jupyter=True)

In [34]:
# Generating similar requirements by using SecBERT
results = []
words_found = []
example_terms = example.split()
for word in glossary:
    for index, term in enumerate(example_terms):
        # Only whitespace-delimited tokens that equal a glossary word are masked.
        if word != term:
            continue
        if word not in words_found:
            words_found.append(word)
        # Mask this single occurrence and keep the top-ranked completion.
        masked_terms = list(example_terms)
        masked_terms[index] = "[MASK]"
        results.append(fill_mask_secbert(' '.join(masked_terms))[0]['sequence'])

print(results)
print("--------------------")
print(words_found)
# Drop the demo variables so they do not linger in the kernel namespace.
del results, words_found


['the mac and xquery and system shall provide the ability to capture and display temperature, weight and height in both metric and english units.', 'the system and xquery and program shall provide the ability to capture and display temperature, weight and height in both metric and english units.', 'the system and temperature and system shall provide the ability to capture and display temperature, weight and height in both metric and english units.']
--------------------
['system', 'XQuery']


In [35]:
# Function to generate similar requirements
# Requires fill_mask_secbert loaded (unless a fill_mask callable is passed in)!
def generate_similar_requirement(string, terms=None, fill_mask=None):
    """Create variants of a requirement by masking glossary words one at a time.

    The string is split on whitespace. Every token that is exactly equal to a
    glossary term is replaced, one occurrence at a time, by "[MASK]"; the masked
    sentence is sent to the fill-mask callable and the 'sequence' of its first
    prediction is collected.

    Parameters
    ----------
    string : str
        Requirement text to analyse.
    terms : iterable of str, optional
        Glossary terms to look for. Defaults to the module-level ``glossary``.
    fill_mask : callable, optional
        Called with the masked sentence; must return a sequence whose first
        item is a mapping with a 'sequence' key. Defaults to the module-level
        ``fill_mask_secbert`` pipeline.

    Returns
    -------
    tuple[list, list]
        ``(results, words_found)``: the generated sentences (ordered by glossary
        term, then by token position) and the distinct glossary terms that were
        matched, in order of first match.
    """
    if terms is None:
        terms = glossary
    if fill_mask is None:
        fill_mask = fill_mask_secbert

    # Split once and index token positions so each glossary term is a dictionary
    # lookup instead of a fresh split plus a scan over every token.
    tokens = string.split()
    positions = {}
    for index, token in enumerate(tokens):
        positions.setdefault(token, []).append(index)

    results = []
    words_found = []
    for word in terms:
        hits = positions.get(word)
        if not hits:
            continue
        if word not in words_found:
            words_found.append(word)
        for index in hits:
            # Mask exactly one occurrence per generated sentence.
            masked = list(tokens)
            masked[index] = "[MASK]"
            prediction = fill_mask(' '.join(masked))
            results.append(prediction[0]['sequence'])
    return results, words_found

In [36]:
# Function to extract all the features from a string
def extract_features(string):
    """Return the per-token and glossary-based features of a requirement.

    The text is parsed with the spaCy pipeline ``nlp`` and then passed to
    ``generate_similar_requirement``.

    Returns a 5-tuple:
      entities      - per token: its entity type when spaCy assigned one,
                      'PERSON' for the pronouns listed below, otherwise the
                      token text itself
      dependencies  - per token dependency label (``dep_``)
      pos_tags      - per token coarse part-of-speech tag (``pos_``)
      similar_req   - sentences returned by ``generate_similar_requirement``
      words_found   - glossary words returned by ``generate_similar_requirement``
    """
    pronouns = ['I', 'ME', 'MY', 'MINE', 'YOU', 'YOUR', 'YOURS', 'HE', 'SHE',
                'HIS', 'HER', 'HIM', 'THEY', 'THEM', 'THEMSELVES', 'OUR', 'WE']

    parsed = nlp(string)

    # Tokens without an entity type fall back to 'PERSON' (pronouns) or their text.
    entities = [
        tok.ent_type_ if tok.ent_type_ != ''
        else ('PERSON' if tok.text.upper() in pronouns else tok.text)
        for tok in parsed
    ]
    dependencies = [tok.dep_ for tok in parsed]
    pos_tags = [tok.pos_ for tok in parsed]

    similar_req, words_found = generate_similar_requirement(string)

    return entities, dependencies, pos_tags, similar_req, words_found

In [37]:
entities, dependencies, pos_tags, similar_req, words_found = extract_features(example)
# Banner / value pairs, printed in the same order as the returned tuple.
sections = (
    ("#############   ENTITIES ################", entities),
    ("#############   DEPENDENCIES ################", dependencies),
    ("#############   POS_TAGS ################", pos_tags),
    ("#############   SIMILAR REQUIREMENTS ################", similar_req),
    ("#############   WORDS FOUND ################", words_found),
)
for banner, payload in sections:
    print(banner)
    print(payload)

#############   ENTITIES ################
['The', 'system', 'and', 'XQuery', 'and', 'system', 'shall', 'provide', 'the', 'ability', 'to', 'capture', 'and', 'display', 'temperature', ',', 'weight', 'and', 'height', 'in', 'both', 'metric', 'and', 'LANGUAGE', 'units', '.']
#############   DEPENDENCIES ################
['det', 'nsubj', 'cc', 'conj', 'cc', 'conj', 'aux', 'ROOT', 'det', 'dobj', 'aux', 'acl', 'cc', 'conj', 'dobj', 'punct', 'conj', 'cc', 'conj', 'prep', 'preconj', 'amod', 'cc', 'conj', 'pobj', 'punct']
#############   POS_TAGS ################
['DET', 'NOUN', 'CCONJ', 'NOUN', 'CCONJ', 'NOUN', 'AUX', 'VERB', 'DET', 'NOUN', 'PART', 'VERB', 'CCONJ', 'VERB', 'NOUN', 'PUNCT', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'CCONJ', 'ADJ', 'CCONJ', 'ADJ', 'NOUN', 'PUNCT']
#############   SIMILAR REQUIREMENTS ################
['the mac and xquery and system shall provide the ability to capture and display temperature, weight and height in both metric and english units.', 'the system and xquery and p

### EXTRACTING FEATURES FROM ORIGINAL DATASET

In [38]:
# Forward slashes replace the backslash path: "\C" and "\R" are invalid escape
# sequences in a normal string literal, and a backslash path only resolves on
# Windows. dtype=object keeps every column exactly as read from the workbook.
original_data = pd.read_excel("./ClassifiedDataset/RequirementsOriginal.xlsx", dtype=object)
original_data.info()
original_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10871 entries, 0 to 10870
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Requirement Text                 10798 non-null  object
 1   Security Related                 10871 non-null  object
 2   Confidentiality                  10871 non-null  object
 3   Integrity                        10871 non-null  object
 4   Availability                     10871 non-null  object
 5   Identification & Authentication  10871 non-null  object
 6   Accountability                   10871 non-null  object
 7   Privacy                          10871 non-null  object
dtypes: object(8)
memory usage: 679.6+ KB


Unnamed: 0,Requirement Text,Security Related,Confidentiality,Integrity,Availability,Identification & Authentication,Accountability,Privacy
0,The system shall create a single patient recor...,True,True,False,False,False,False,False
1,The system shall associate (store and link) ke...,True,True,False,False,True,False,False
2,The system shall provide the ability to store ...,True,False,False,False,True,False,False
3,The system shall provide a field which will id...,False,False,False,False,False,False,False
4,The system shall provide the ability to merge ...,True,True,True,False,False,True,False
...,...,...,...,...,...,...,...,...
10866,"VLER DAS stores the ‘Patient ID’ (EDIPI, if no...",True,True,True,False,False,True,False
10867,"VLER DAS stores the ‘Status of Exam Result’, w...",True,True,True,False,False,True,False
10868,VLER DAS stores the event description informat...,True,True,True,False,False,True,False
10869,Constraints,False,False,False,False,False,False,False


In [39]:
# Placeholder object columns that receive the per-requirement features.
for new_column in ("Entities", "Dependencies", "Parts of Speech",
                   "Security Words", "Similar Requirements"):
    original_data[new_column] = None

# Write through the row label yielded by iterrows() instead of a hand-maintained
# counter, so the assignments stay aligned even if the index is not 0..n-1.
# NOTE(review): str() turns a missing requirement text (NaN) into the literal
# string "nan", which is then analysed like real text; confirm whether rows
# with no text should be dropped or skipped instead.
for row_label, row in original_data.iterrows():
    entities, dependencies, pos_tags, similar_req, words_found = extract_features(str(row['Requirement Text']))
    original_data.at[row_label, "Entities"] = entities
    original_data.at[row_label, "Dependencies"] = dependencies
    original_data.at[row_label, "Parts of Speech"] = pos_tags
    original_data.at[row_label, "Security Words"] = words_found
    original_data.at[row_label, "Similar Requirements"] = similar_req
    # Convert the label to a float: 1.0 when it equals True, 0.0 otherwise.
    original_data.at[row_label, "Security Related"] = 1.0 if row['Security Related'] == True else 0.0

original_data.info()
original_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10871 entries, 0 to 10870
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Requirement Text                 10798 non-null  object
 1   Security Related                 10871 non-null  object
 2   Confidentiality                  10871 non-null  object
 3   Integrity                        10871 non-null  object
 4   Availability                     10871 non-null  object
 5   Identification & Authentication  10871 non-null  object
 6   Accountability                   10871 non-null  object
 7   Privacy                          10871 non-null  object
 8   Entities                         10871 non-null  object
 9   Dependencies                     10871 non-null  object
 10  Parts of Speech                  10871 non-null  object
 11  Security Words                   10871 non-null  object
 12  Similar Requirements            

Unnamed: 0,Requirement Text,Security Related,Confidentiality,Integrity,Availability,Identification & Authentication,Accountability,Privacy,Entities,Dependencies,Parts of Speech,Security Words,Similar Requirements
0,The system shall create a single patient recor...,1.0,True,False,False,False,False,False,"[The, system, shall, create, a, single, patien...","[det, nsubj, aux, ROOT, det, amod, compound, d...","[DET, NOUN, AUX, VERB, DET, ADJ, NOUN, NOUN, A...",[system],[the program shall create a single patient rec...
1,The system shall associate (store and link) ke...,1.0,True,False,False,True,False,False,"[The, system, shall, associate, (, store, and,...","[det, nsubj, aux, ROOT, punct, nmod, cc, conj,...","[DET, NOUN, AUX, VERB, PUNCT, VERB, CCONJ, VER...","[identifier, information, key, system]",[the system shall associate ( store and link )...
2,The system shall provide the ability to store ...,1.0,False,False,False,True,False,False,"[The, system, shall, provide, the, ability, to...","[det, nsubj, aux, ROOT, det, dobj, aux, acl, a...","[DET, NOUN, AUX, VERB, DET, NOUN, PART, VERB, ...","[identifier, system]",[the system shall provide the ability to store...
3,The system shall provide a field which will id...,0.0,False,False,False,False,False,False,"[The, system, shall, provide, a, field, which,...","[det, nsubj, aux, ROOT, det, dobj, nsubj, aux,...","[DET, NOUN, AUX, VERB, DET, NOUN, PRON, AUX, V...",[system],[the process shall provide a field which will ...
4,The system shall provide the ability to merge ...,1.0,True,True,False,False,True,False,"[The, system, shall, provide, the, ability, to...","[det, nsubj, aux, ROOT, det, dobj, aux, acl, c...","[DET, NOUN, AUX, VERB, DET, NOUN, PART, VERB, ...","[information, records, system]",[the system shall provide the ability to merge...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10866,"VLER DAS stores the ‘Patient ID’ (EDIPI, if no...",1.0,True,True,False,False,True,False,"[ORG, ORG, stores, the, ‘, Patient, ID, ’, (, ...","[compound, nsubj, ROOT, det, punct, compound, ...","[PROPN, PROPN, VERB, DET, PUNCT, PROPN, PROPN,...","[audit, DAS]","[vler das stores the ‘ patient id ’ ( edipi, i..."
10867,"VLER DAS stores the ‘Status of Exam Result’, w...",1.0,True,True,False,False,True,False,"[ORG, ORG, stores, the, ‘, Status, of, Exam, R...","[compound, nsubj, ROOT, det, punct, dobj, prep...","[PROPN, PROPN, VERB, DET, PUNCT, NOUN, ADP, PR...","[audit, DAS]",[vler das stores the ‘ status of exam result ’...
10868,VLER DAS stores the event description informat...,1.0,True,True,False,False,True,False,"[ORG, ORG, stores, the, event, description, in...","[compound, nsubj, ROOT, det, compound, compoun...","[PROPN, PROPN, VERB, DET, NOUN, NOUN, NOUN, AD...","[audit, DAS, event, information]",[vler das stores the event description informa...
10869,Constraints,0.0,False,False,False,False,False,False,[Constraints],[ROOT],[NOUN],[],[]


In [40]:
# NOTE(review): plain assignment binds a second name to the SAME DataFrame object;
# no copy is made, so in-place edits made through original_data are visible
# through original_data2 as well. Use original_data.copy() if a snapshot was intended.
original_data2 = original_data

In [41]:
# Fixed length of the zero-padded token-id vectors built by the encoding cells.
# NOTE(review): presumably the longest tokenised feature sequence in the dataset; confirm.
max_length = 558

In [42]:
# ENCODING ENTITIES
# Join each row's entity labels into one string and tokenise the whole column in one call.
# Reading the column directly avoids the deprecated positional Series[0] lookup.
Entities = [" ".join(labels) for labels in original_data["Entities"]]
encoded_ids = fill_mask_secbert.tokenizer(Entities)['input_ids']
for row_label, tokens in zip(original_data.index, encoded_ids):
    # Drop the first and last ids (the special tokens added by the tokenizer) and
    # truncate to max_length so an unusually long row cannot overrun the buffer.
    ids = tokens[1:-1][:max_length]
    enc = np.zeros(max_length, dtype="float32")
    enc[:len(ids)] = ids
    original_data.at[row_label, "Entities"] = enc

In [43]:
# ENCODING DEPENDENCIES
# Join each row's dependency labels into one string and tokenise the whole column in one call.
# Reading the column directly avoids the deprecated positional Series[0] lookup.
Dependencies = [" ".join(labels) for labels in original_data["Dependencies"]]
encoded_ids = fill_mask_secbert.tokenizer(Dependencies)['input_ids']
for row_label, tokens in zip(original_data.index, encoded_ids):
    # Drop the first and last ids (the special tokens added by the tokenizer) and
    # truncate to max_length so an unusually long row cannot overrun the buffer.
    ids = tokens[1:-1][:max_length]
    enc = np.zeros(max_length, dtype="float32")
    enc[:len(ids)] = ids
    original_data.at[row_label, "Dependencies"] = enc

In [44]:
# ENCODING PARTS OF SPEECH
# Join each row's POS tags into one string and tokenise the whole column in one call.
# Reading the column directly avoids the deprecated positional Series[0] lookup.
Parts_of_Speech = [" ".join(tags) for tags in original_data["Parts of Speech"]]
encoded_ids = fill_mask_secbert.tokenizer(Parts_of_Speech)['input_ids']
for row_label, tokens in zip(original_data.index, encoded_ids):
    # Drop the first and last ids (the special tokens added by the tokenizer) and
    # truncate to max_length so an unusually long row cannot overrun the buffer.
    ids = tokens[1:-1][:max_length]
    enc = np.zeros(max_length, dtype="float32")
    enc[:len(ids)] = ids
    original_data.at[row_label, "Parts of Speech"] = enc

In [45]:
# ENCODING SECURITY WORDS
# Join each row's matched glossary words into one string and tokenise the whole column in one call.
# Reading the column directly avoids the deprecated positional Series[0] lookup.
Sec_Words = [" ".join(words) for words in original_data["Security Words"]]
encoded_ids = fill_mask_secbert.tokenizer(Sec_Words)['input_ids']
for row_label, tokens in zip(original_data.index, encoded_ids):
    # Drop the first and last ids (the special tokens added by the tokenizer) and
    # truncate to max_length so an unusually long row cannot overrun the buffer.
    ids = tokens[1:-1][:max_length]
    enc = np.zeros(max_length, dtype="float32")
    enc[:len(ids)] = ids
    original_data.at[row_label, "Security Words"] = enc

In [46]:
# ENCODING SIMILAR REQUIREMENTS
# Join each row's generated sentences into one string and tokenise the whole column in one call.
# Reading the column directly avoids the deprecated positional Series[0] lookup.
Sim_req = [" ".join(sentences) for sentences in original_data["Similar Requirements"]]
encoded_ids = fill_mask_secbert.tokenizer(Sim_req)['input_ids']
for row_label, tokens in zip(original_data.index, encoded_ids):
    # Drop the first and last ids (the special tokens added by the tokenizer);
    # anything beyond max_length ids is discarded, as before.
    ids = tokens[1:-1][:max_length]
    enc = np.zeros(max_length, dtype="float32")
    enc[:len(ids)] = ids
    original_data.at[row_label, "Similar Requirements"] = enc

In [62]:
# EXTRACTING SECBERT OUTPUT
original_data["Secbert Outputs"] = None
secbert.to(device)
# Iterate the column itself (label, value) rather than a one-column frame with a
# positional Series[0] lookup and a separate counter.
for row_label, text in original_data["Requirement Text"].items():
    # `encoded` replaces the former name `input`, which shadowed the builtin.
    encoded = tokenizer(str(text), return_tensors='pt').to(device)
    with torch.no_grad():
        output = secbert(**encoded, output_hidden_states=True)
    del encoded
    # Fixed-width, zero-padded buffer for the sampled activations.
    # NOTE(review): 33280 is presumably sized for the longest requirement; confirm.
    tensors = np.zeros(33280)
    count = 0
    # NOTE(review): output[1] is presumably the tuple of per-layer hidden states
    # (requested via output_hidden_states=True), so [6] selects one layer; confirm.
    for tensor in output[1][6]:
        # Keep every third activation of the flattened tensor.
        values = tensor.flatten().detach().cpu().numpy()[0::3]
        # Never write past the end of the buffer; surplus values are discarded.
        values = values[:tensors.size - count]
        tensors[count:count + len(values)] = values
        count += len(values)
    original_data.at[row_label, "Secbert Outputs"] = tensors


In [63]:
# Display the frame, which now holds the encoded feature columns and the "Secbert Outputs" column.
original_data

Unnamed: 0,Requirement Text,Security Related,Confidentiality,Integrity,Availability,Identification & Authentication,Accountability,Privacy,Entities,Dependencies,Parts of Speech,Security Words,Similar Requirements,Secbert Outputs
0,The system shall create a single patient recor...,1.0,True,False,False,False,False,False,"[1337.0, 1550.0, 11389.0, 2585.0, 43.0, 3460.0...","[1664.0, 5142.0, 1695.0, 1019.0, 11614.0, 2960...","[1664.0, 1864.0, 1387.0, 11614.0, 12827.0, 166...","[1550.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[1337.0, 2176.0, 11389.0, 2585.0, 43.0, 3460.0...","[0.5811913013458252, 0.9095062017440796, 0.589..."
1,The system shall associate (store and link) ke...,1.0,True,False,False,True,False,False,"[1337.0, 1550.0, 11389.0, 10860.0, 12.0, 3876....","[1664.0, 5142.0, 1695.0, 1019.0, 11614.0, 2960...","[1664.0, 1864.0, 1387.0, 11614.0, 12827.0, 372...","[4857.0, 1572.0, 1785.0, 1550.0, 0.0, 0.0, 0.0...","[1337.0, 1550.0, 11389.0, 10860.0, 12.0, 3876....","[1.1955792903900146, 0.603007435798645, 1.3445..."
2,The system shall provide the ability to store ...,1.0,False,False,False,True,False,False,"[1337.0, 1550.0, 11389.0, 2954.0, 1337.0, 3477...","[1664.0, 5142.0, 1695.0, 1019.0, 11614.0, 2960...","[1664.0, 1864.0, 1387.0, 11614.0, 12827.0, 166...","[4857.0, 1550.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1337.0, 1550.0, 11389.0, 2954.0, 1337.0, 3477...","[0.7627249956130981, 1.0235766172409058, 0.660..."
3,The system shall provide a field which will id...,0.0,False,False,False,False,False,False,"[1337.0, 1550.0, 11389.0, 2954.0, 43.0, 2568.0...","[1664.0, 5142.0, 1695.0, 1019.0, 11614.0, 2960...","[1664.0, 1864.0, 1387.0, 11614.0, 12827.0, 166...","[1550.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[1337.0, 1872.0, 11389.0, 2954.0, 43.0, 2568.0...","[0.30204275250434875, 0.39741265773773193, -0...."
4,The system shall provide the ability to merge ...,1.0,True,True,False,False,True,False,"[1337.0, 1550.0, 11389.0, 2954.0, 1337.0, 3477...","[1664.0, 5142.0, 1695.0, 1019.0, 11614.0, 2960...","[1664.0, 1864.0, 1387.0, 11614.0, 12827.0, 166...","[1572.0, 3989.0, 1550.0, 0.0, 0.0, 0.0, 0.0, 0...","[1337.0, 1550.0, 11389.0, 2954.0, 1337.0, 3477...","[0.12711727619171143, 0.5058688521385193, 1.03..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10866,"VLER DAS stores the ‘Patient ID’ (EDIPI, if no...",1.0,True,True,False,False,True,False,"[1841.0, 1841.0, 5125.0, 1337.0, 285.0, 7644.0...","[15122.0, 5142.0, 1695.0, 1019.0, 2960.0, 1664...","[3635.0, 1016.0, 3635.0, 1016.0, 12827.0, 1664...","[6430.0, 20859.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[17254.0, 1335.0, 20859.0, 5125.0, 1337.0, 285...","[1.8536443710327148, 1.1284810304641724, 0.279..."
10867,"VLER DAS stores the ‘Status of Exam Result’, w...",1.0,True,True,False,False,True,False,"[1841.0, 1841.0, 5125.0, 1337.0, 285.0, 4609.0...","[15122.0, 5142.0, 1695.0, 1019.0, 2960.0, 1664...","[3635.0, 1016.0, 3635.0, 1016.0, 12827.0, 1664...","[6430.0, 20859.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[17254.0, 1335.0, 20859.0, 5125.0, 1337.0, 285...","[1.67890202999115, 1.5895962715148926, 0.38226..."
10868,VLER DAS stores the event description informat...,1.0,True,True,False,False,True,False,"[1841.0, 1841.0, 5125.0, 1337.0, 3154.0, 4352....","[15122.0, 5142.0, 1695.0, 1019.0, 2960.0, 1664...","[3635.0, 1016.0, 3635.0, 1016.0, 12827.0, 1664...","[6430.0, 20859.0, 3154.0, 1572.0, 0.0, 0.0, 0....","[17254.0, 1335.0, 20859.0, 5125.0, 1337.0, 315...","[1.2100515365600586, 0.4258882701396942, 0.840..."
10869,Constraints,0.0,False,False,False,False,False,False,"[13831.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[2960.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[1864.0, 1387.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.6892223954200745, 0.8946217894554138, 1.479..."


In [61]:
# Forward slashes replace the backslash path: "\C" and "\O" are invalid escape
# sequences in a normal string literal, and a backslash path only resolves on Windows.
original_data.to_pickle("./ClassifiedDataset/OriginalDataFeatureExtracted.pkl")