In [None]:
import torch

# Quick environment probe: report whether PyTorch can use a CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)
# torch.cuda.get_device_name raises when no CUDA device is usable, so the
# name is only queried when one is available; otherwise a placeholder is shown.
print(torch.cuda.get_device_name(0) if cuda_available else 'no CUDA device available')

In [None]:
import numpy as np

# Report the NumPy release available in this kernel.
numpy_version = np.__version__
print(numpy_version)

In [None]:
import datasets
# import numpy as np
import torch
import os
# from transformers import pipeline
from datasets import Dataset, load_dataset, Features, Value
from datasets import load_dataset, concatenate_datasets


# ANSI escape sequences used to colour the status messages printed below.
GREEN = "\033[32m"
BLUE = "\033[34m"
RESET = "\033[0m"

print(GREEN, 'datasets version', RESET, datasets.__version__)

# Directory handed to datasets.load_from_disk.
# NOTE(review): the path is empty here; point it at the real dataset directory before running.
path = ""
# Load raw dataset
raw_dataset = datasets.load_from_disk(path)
print(BLUE, 'Loading dataset: ', RESET, raw_dataset)
print('==========================')
# Check the label columns of interest: distinct values together with their counts
print(BLUE, 'check KM100L6V2 Label: ', RESET)
print(np.unique(raw_dataset['KM100L6V2 Label'], return_counts=True))
print(BLUE, 'check Sources Type Label: ', RESET)
print(np.unique(raw_dataset['Sources Type'], return_counts=True))

# Check env info
print(GREEN, 'torch version: ', RESET, torch.__version__)
cuda_available = torch.cuda.is_available()
print(GREEN, 'cuda available: ', RESET, cuda_available)
# torch.cuda.get_device_name raises without a usable CUDA device, so only ask
# for the name when one is available.
gpu_name = torch.cuda.get_device_name(0) if cuda_available else 'no CUDA device available'
print(GREEN, 'GPU name: ', RESET, gpu_name)
print('===========================')

In [None]:
# Drop the columns listed below from the raw dataset; everything else is kept.
columns_to_drop = [
    'Requirement (EN)',
    'Requirement (DE)',
    'Requirement (Other Language)',
    'Category (Source)',
    'Sub Category (Source)',
    'Category (NoRBERT)',
    'Sub Category (NoRBERT)',
    'Category (Manual)',
    'Sub Category (Manual)',
    'Open/ Closed Source',
    'Date',
    'Comment',
    'Original Language Code',
    'KM45L6V2 Label',
    'KM35L6V2S3 Label',
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'regex_tagged',
    '__index_level_0__',
]
dataset = raw_dataset.remove_columns(columns_to_drop)
print(BLUE, 'data set after remove columns: ', RESET, dataset)
print('==========================')

In [None]:
# Row predicate: keep only rows whose 'Sources Type' is 'RE'
def filter_SourcesType(example):
    """Return True when the row's 'Sources Type' value equals 'RE'.

    Args:
        example: a mapping (one dataset row) that has a 'Sources Type' key.
    """
    source_type = example['Sources Type']
    return source_type == 'RE'


# Run the source-type predicate over the column-reduced dataset and report
# what is left.
filtered_dataset = dataset.filter(filter_SourcesType)
source_type_message = 'Filtered the dataset with Sources Type = RE'
print(BLUE, source_type_message, RESET, filtered_dataset)
print('==========================')


# Row predicate: keep rows whose 'KM100L6V2 Label' is either 'F' or 'NF'
def filter_KM100L6V2(example):
    """Return True when the row's 'KM100L6V2 Label' is 'F' or 'NF'.

    Args:
        example: a mapping (one dataset row) that has a 'KM100L6V2 Label' key.
    """
    return example['KM100L6V2 Label'] in ('F', 'NF')


# Narrow the RE-only rows further with the KM100L6V2 label predicate.
filtered_dataset = filtered_dataset.filter(filter_KM100L6V2)
# The predicate keeps both 'F' and 'NF' rows, so the status message names both.
print(BLUE, 'Filtered the dataset with KM100L6V2 Label = F or NF', RESET, filtered_dataset)
print('==========================')

# Show the second row as a sample. Indexing past the end raises IndexError,
# so print None instead when fewer than two rows survived the filters.
sample_row = filtered_dataset[1] if len(filtered_dataset) > 1 else None
print(BLUE, "Check a sample data: ", RESET, sample_row)
print('==========================')

# Save the filtered dataset to disk
# NOTE(review): absolute, user-specific output path; adjust when running elsewhere.
filtered_dataset.save_to_disk('/pfs/data5/home/st/st_us-051500/st_st180358/filtered_dataset_Step1')
print(GREEN, 'save the filtered data to disk', RESET)
print('==========================')

In [None]:
import transformers

# Report the transformers release available in this kernel.
transformers_version = transformers.__version__
print(transformers_version)

In [None]:

# Build the text-generation pipeline that is used further down to tag the
# filtered requirements. The keyword arguments are handed to
# transformers.pipeline unchanged.
pipe = transformers.pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print(GREEN, 'model downloaded: ', RESET, pipe)
print('==========================')

In [None]:
# System turn that is prepended to every prompt sent to the model. It asks for
# an answer that begins with Yes or No and is closed with the `</s>` marker.
# Each `\n` escape inside the triple-quoted literal adds a newline on top of
# the physical line break that follows it.
# NOTE(review): "must starts" is a typo in the prompt text; it is left as is
# because editing the literal changes what the model receives.
instruction = '''
<|system|> \n
You are a requirement engineer that helps me to classify requirements according to my needs,
and your answer must starts with Yes or No.</s>
'''

# prompt: Nominalization
# User turn for the first check: a definition, one worked example and the
# closing question. The requirement text and the `assistant` marker are
# appended by the code that builds the full model input.
# The answer to this prompt is stored under the key 'TE_0' by modify_TE_column.
prompt_TE_1 = ''' \n
<|user|> \n
Here is the definition of nominalization:
Nominalization turns processes into single events, losing detailed information. 
A nominalized term must not allow for any leeway in the interpretation of the processes and must
precisely depict the process, including any exceptions that may occur as well as all input and output parameters.
For example, "transmit" becomes "transmission". Other typical examples of nominalization are the terms input, booking, and acceptance.

And Here is the example of Nominalization:
“In case of a system crash, a restart of the system shall be performed.”
The terms system crash and restart each describe a process that thought to be
analyzed more precisely.\n

With the definition and example: Please tell me if the following requirement text has the problem of Nominalization:
'''

# prompt: Nouns without reference index
# User turn for the second check: a definition, one worked example (with an
# improved rewording of it) and the closing question. The requirement text and
# the `assistant` marker are appended by the code that builds the model input.
# The answer to this prompt is stored under the key 'TE_1' by modify_TE_column.
prompt_TE_2 = '''\n
<|user|> \n
Here is the definition of 'Nouns without reference index':
As with process verbs, nouns are frequently incompletely specified. Linguists
call this a missing or inadequate index of reference. Examples of
terms that contain incompletely specified nouns are the user, the controller,
the system, the message, the data, or the function.

And Here is the example of Nouns without reference index:
"The data shall be displayed to the user on the terminal"
The following questions arise: What data exactly? Which user exactly?
Which terminal exactly? If this information is amended, the requirement
might thus read as follows:
"The system shall display the billing data to the registered user on the terminal
she is logged in to."

With the definition and example: Please tell me if the following requirement text has the problem of Nouns without reference index:
'''


# prompt: Universal Quantifiers
# User turn for the third check: a definition (including trigger words), one
# worked example and the closing question. The requirement text and the
# `assistant` marker are appended by the code that builds the model input.
# The answer to this prompt is stored under the key 'TE_2' by modify_TE_column.
prompt_TE_3 = '''\n
<|user|> \n
Here is the definition of 'Universal Quantifiers':
Universal quantifiers specify amounts or frequencies. They group a set of
objects and make a statement about the behavior of this set. When using
universal quantifiers, there is the risk that the specified behavior or
property does not apply to all objects within the specified set.
It must be verified whether the specified behavior really applies to all
objects summarized through the quantifiers. Universal quantifiers can be
easily identified through trigger words such as never, always, no, none,
every, all, some, or nothing.


And Here is the example of Universal Quantifiers:
"The system shall show all data sets in every submenu."
In this case, the following question must be asked: Really in every submenu?
Really all data sets?

With the definition and example: Please tell me if the following requirement text has the problem of Universal Quantifiers:
'''


# prompt: Incompletely Specified Conditions
# User turn for the fourth check: a definition (including trigger words), one
# worked example and the closing question. The requirement text and the
# `assistant` marker are appended by the code that builds the model input.
# The answer to this prompt is stored under the key 'TE_3' by modify_TE_column.
prompt_TE_4 = '''\n
<|user|> \n
Here is the definition of 'Incompletely Specified Conditions':
Incompletely specified conditions are another indicator of a potential loss
of information. Requirements that contain conditions specify the behavior
that must occur when the condition is met. In addition, they must specify
what behavior must occur if the condition is not met (the part that is often
missing).Trigger words are, for instance, if … then, in case, whether, and depending
on.

And Here is the example of Incompletely Specified Conditions:
"The restaurant system shall offer all beverages to a registered guest over the age
of 20 years."
At least one aspect remains unspecified in the example above: Which beverages
shall be offered to a guest that is 20 years or younger?

With the definition and example: Please tell me if the following requirement text has the problem of Incompletely Specified Conditions:
'''


# prompt: Incompletely Specified Process Verbs
# User turn for the fifth check: a definition, one worked example and the
# closing question. The requirement text and the `assistant` marker are
# appended by the code that builds the model input.
# The answer to this prompt is stored under the key 'TE_4' by modify_TE_column.
# The explanation under the example used to read "...who enters the login Use
# active voice. / data. ..." (a stray note spliced into the sentence); the
# sentence order is repaired here so the model receives a coherent example.
prompt_TE_5 = '''\n
<|user|> \n
Here is the definition of 'Incompletely Specified Process Verbs':
Some process verbs require more than one noun to be considered completely
specified. The verb transmit, for instance, requires at least three
supplements to be considered complete: what is being transmitted, from
where it is being transmitted, and to where it is being transmitted.
Similarly, adjectives and adverbs may need to be supplemented
as well. It can mostly be avoided or kept to a minimum using the active voice.

And Here is the example of Incompletely Specified Process Verbs:
"To log a user in, the login data is entered."
It is unclear who enters the login data. It is also unclear where and how this is done.
Use active voice.

With the definition and example: Please tell me if the following requirement text has the problem of Incompletely Specified Process Verbs:
'''

# Suffix appended after the requirement text: it closes the user turn with
# `</s>` and opens the assistant turn. check_requirements_HF splits the
# generated text on '\n<|assistant|>\n', which this literal contains, so the
# exact characters here matter.
assistant = '''
</s>
\n<|assistant|>\n
'''

# Smoke test: send one hand-written requirement through the nominalization
# prompt and print everything the pipeline returns (prompt included).
sample_requirement = 'The service must be user friendly and easy to access by the user'
input_TE_1 = instruction + prompt_TE_1 + sample_requirement + assistant
generations = pipe(input_TE_1, max_length=350, truncation=True)
response = generations[0]
print(response['generated_text'])

In [None]:
RED = "\033[31m"  # ANSI escape sequence for red terminal text


def check_requirements_HF(instruction, prompt, text, model):
    """Ask ``model`` one question about ``text`` and return its answer.

    The model input is ``instruction + prompt + text`` followed by the
    module-level ``assistant`` marker.

    Args:
        instruction: system part of the prompt.
        prompt: user part of the prompt (definition, example, question).
        text: the requirement text to be judged.
        model: callable invoked as ``model(input, max_length=350,
            truncation=True)``; its first result must provide a
            ``'generated_text'`` entry.

    Returns:
        Whatever follows the last assistant marker in the generated text, or
        the whole generated text when the marker does not occur.
    """
    query = ''.join((instruction, prompt, text, assistant))
    generations = model(query, max_length=350, truncation=True)
    full_text = generations[0]['generated_text']
    # Keep only what comes after the last assistant marker.
    _, _, answer = full_text.rpartition('\n<|assistant|>\n')
    return answer


# map function: tag one row with the model's verdict for each of the five checks
def _parse_yes_no(answer):
    """Reduce a raw model answer to 'Yes', 'No' or 'Error'.

    The instruction asks the model to begin its answer with Yes or No, so the
    leading word decides (case-insensitively, ignoring leading whitespace and
    punctuation). When the answer does not begin with a verdict, the first
    whole-word 'Yes' or 'No' anywhere in the text is used. Whole-word matching
    matters: a plain substring test finds 'No' inside words such as
    'Nominalization', 'Nouns' or 'Not'.
    """
    import re  # local import: this notebook has no top-level `import re`

    leading = re.match(r'\W*(yes|no)\b', answer, flags=re.IGNORECASE)
    if leading:
        return 'Yes' if leading.group(1).lower() == 'yes' else 'No'
    anywhere = re.search(r'\b(Yes|No)\b', answer)
    if anywhere:
        return anywhere.group(1)
    return 'Error'


def modify_TE_column(item):
    """Query the model with all five prompts for ``item['text']`` and tag the row.

    Args:
        item: one dataset row (a mapping) with a 'text' entry.

    Returns:
        The same ``item`` with the keys 'TE_0' .. 'TE_4' set to 'Yes', 'No' or
        'Error'. The key index is zero-based: 'TE_0' holds the verdict for
        ``prompt_TE_1``, 'TE_4' the verdict for ``prompt_TE_5``.
    """
    print('checking text: ', item['text'])
    prompts = (prompt_TE_1, prompt_TE_2, prompt_TE_3, prompt_TE_4, prompt_TE_5)
    response = [check_requirements_HF(instruction, prompt, item['text'], pipe) for prompt in prompts]
    print('response: ', response)
    for index, r in enumerate(response):
        verdict = _parse_yes_no(r)
        # Unparseable answers get a '!' banner so they stand out in the log.
        banner = '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!' if verdict == 'Error' else '=========================='
        print(banner)
        print('result TE_', index, ': ' + verdict)
        print(banner)
        item[f'TE_{index}'] = verdict
    return item



In [None]:
# Bare expression: the notebook renders the filtered dataset's repr as the cell output.
filtered_dataset

In [None]:
import shutil  # used below to clear a stale temporary shard directory

print(BLUE, '============= Start Processing with model ============', RESET)

# To try the pipeline on a handful of rows first, use
# filtered_dataset.select(range(6)) instead of the full dataset.
testset = filtered_dataset

num_shards = 100  # shards num, adjust according to the available resources and the size of dataset
# indices of the shards whose result directory already exists
processed_shards = []

# check which shard has been processed
for i in range(num_shards):
    shard_path = f'test_processed_shard_{i}'
    if os.path.exists(shard_path):
        processed_shards.append(i)

for i in range(num_shards):
    if i not in processed_shards:
        shard_path = f'test_processed_shard_{i}'
        # Write to a temporary directory and rename it once the save has
        # finished. The resume check above only tests whether the final
        # directory exists, so an interrupted save must never leave a
        # half-written directory under the final name.
        tmp_path = shard_path + '.tmp'
        # shard the dataset
        shard = testset.shard(num_shards=num_shards, index=i)
        # start processing here...
        processed_shard = shard.map(modify_TE_column)
        shutil.rmtree(tmp_path, ignore_errors=True)  # leftover from an aborted run
        processed_shard.save_to_disk(tmp_path)
        os.replace(tmp_path, shard_path)
        processed_shards.append(i)

# combine the shards
final_dataset = concatenate_datasets([Dataset.load_from_disk(f'test_processed_shard_{i}') for i in range(num_shards)])

# Store the final dataset
# NOTE(review): absolute, user-specific output path; adjust when running elsewhere.
final_dataset.save_to_disk('/pfs/data5/home/st/st_us-051500/st_st180358/test_processed_dataset_Step1')


# TODO: to keep the model from appending further examples to its answer, add a
# stop marker such as '#' to the instruction and keep only the part of the
# response before it (split('#')[0]) when extracting Yes/No.

In [None]:
# Reload the tagged dataset written by the processing step and look at its
# first two rows.
# NOTE(review): from here on `raw_dataset` refers to the processed dataset,
# not to the raw one loaded at the top of the notebook.
processed_dataset_path = '/pfs/data5/home/st/st_us-051500/st_st180358/test_processed_dataset_Step1'
raw_dataset = datasets.load_from_disk(processed_dataset_path)
print(raw_dataset)
for row_index in (0, 1):
    print(raw_dataset[row_index])

In [None]:
# Spot-check a single row far into the processed dataset.
spot_check_index = 13300
print(raw_dataset[spot_check_index])

In [None]:
def filter_TE_0_Yes(example):
    """Return True when the row's 'TE_0' tag equals 'Yes'."""
    tag = example['TE_0']
    return tag == 'Yes'
def filter_TE_1_Yes(example):
    """Return True when the row's 'TE_1' tag equals 'Yes'."""
    tag = example['TE_1']
    return tag == 'Yes'
def filter_TE_2_Yes(example):
    """Return True when the row's 'TE_2' tag equals 'Yes'."""
    tag = example['TE_2']
    return tag == 'Yes'
def filter_TE_3_Yes(example):
    """Return True when the row's 'TE_3' tag equals 'Yes'."""
    tag = example['TE_3']
    return tag == 'Yes'
def filter_TE_4_Yes(example):
    """Return True when the row's 'TE_4' tag equals 'Yes'."""
    tag = example['TE_4']
    return tag == 'Yes'

# One subset per check: the rows the model tagged 'Yes' for that check.
TE_0_dataset = raw_dataset.filter(filter_TE_0_Yes)
TE_1_dataset = raw_dataset.filter(filter_TE_1_Yes)
TE_2_dataset = raw_dataset.filter(filter_TE_2_Yes)
TE_3_dataset = raw_dataset.filter(filter_TE_3_Yes)
TE_4_dataset = raw_dataset.filter(filter_TE_4_Yes)

# Tag key -> subset, so saving and reporting can share one loop each.
te_yes_datasets = {
    'TE_0': TE_0_dataset,
    'TE_1': TE_1_dataset,
    'TE_2': TE_2_dataset,
    'TE_3': TE_3_dataset,
    'TE_4': TE_4_dataset,
}

# NOTE(review): absolute, user-specific output directory; adjust when running elsewhere.
te_output_dir = '/pfs/data5/home/st/st_us-051500/st_st180358'
for te_key, te_dataset in te_yes_datasets.items():
    te_dataset.save_to_disk(f'{te_output_dir}/{te_key}_dataset')

# Row shown as a sample from every subset. A subset can be shorter than that
# (its size depends on the model's answers), and indexing past the end raises
# IndexError, so the sample is only printed when the row exists.
te_sample_index = 103
for te_key, te_dataset in te_yes_datasets.items():
    print(BLUE, f'================{te_key}_dataset=================', RESET)
    print(te_dataset)
    if len(te_dataset) > te_sample_index:
        print(te_dataset[te_sample_index])
    else:
        print(f'(no sample row: fewer than {te_sample_index + 1} rows)')
    print()



In [None]:
def filter_TE_All_No(example):
    """Return True when every tag 'TE_0' .. 'TE_4' of the row equals 'No'."""
    te_keys = ('TE_0', 'TE_1', 'TE_2', 'TE_3', 'TE_4')
    return all(example[key] == 'No' for key in te_keys)

# Rows for which the model answered 'No' on all five checks.
TE_All_No_dataset = raw_dataset.filter(filter_TE_All_No)

# NOTE(review): absolute, user-specific output path; adjust when running elsewhere.
TE_All_No_dataset.save_to_disk('/pfs/data5/home/st/st_us-051500/st_st180358/TE_All_No_dataset')

print(BLUE,'================TE_All_No_dataset=================' ,RESET)
print(TE_All_No_dataset)
# The subset's size depends on the model's answers; indexing past the end
# raises IndexError, so the sample row is only printed when it exists.
all_no_sample_index = 103
if len(TE_All_No_dataset) > all_no_sample_index:
    print(TE_All_No_dataset[all_no_sample_index])
else:
    print(f'(no sample row: fewer than {all_no_sample_index + 1} rows)')