# Hybrid Zero Shot Learning Approach

In [1]:
import pandas as pd
import numpy as np

<h2>Loading Pre-Trained Models</h2>

<h3>Loading BERT4RE</h3>

In [2]:
from transformers import AutoTokenizer, AutoModel

# Hugging Face Hub identifier of the BERT4RE checkpoint (a local path works as well)
model_name = "thearod5/bert4re"

# Resolve the tokenizer first and then the encoder weights, both from the same checkpoint
tokenizer, bert4re = (
    loader.from_pretrained(model_name) for loader in (AutoTokenizer, AutoModel)
)

print("BERT4RE loaded successfully.")

Some weights of RobertaModel were not initialized from the model checkpoint at thearod5/bert4re and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT4RE loaded successfully.


<h3>Testing BERT4RE</h3>

In [3]:
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert4re.to(device)

# Encode one sample sentence as PyTorch tensors placed on the model's device
input_text = "John works at OpenAI as a researcher."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    outputs = bert4re(**inputs)

# Token-level embeddings from the last layer
embeddings = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# Sentence-level embedding: mean over real tokens only. The attention mask zeroes
# out padded positions so they contribute neither to the sum nor to the count
# (a plain .mean(dim=1) would average padding in as soon as a batch is padded).
token_mask = inputs["attention_mask"].unsqueeze(-1).to(embeddings.dtype)  # (batch_size, sequence_length, 1)
sentence_embedding = (embeddings * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)  # (batch_size, hidden_size)

# Print the output shape and sentence embedding
print("Embedding shape:", sentence_embedding.shape)
print("Sentence embedding:", sentence_embedding)

Embedding shape: torch.Size([1, 768])
Sentence embedding: tensor([[-6.8532e-01, -2.0724e-01,  4.6072e-01,  3.8424e-01, -8.1794e-01,
          6.2162e-02,  1.6305e-01,  6.3627e-01, -6.2024e-01,  2.3580e-01,
         -4.1779e-01,  4.5084e-02,  1.0638e+00, -5.3630e-02, -6.2707e-01,
         -2.7198e-01,  5.0179e-01,  6.1894e-01, -5.1897e-01, -5.8201e-01,
          4.4262e-01, -2.5631e-01,  1.5594e-01, -1.6746e-01, -7.8109e-01,
         -2.5813e-01, -9.9017e-01, -8.3839e-01, -4.7253e-01, -7.3076e-01,
          1.2344e-01,  6.4781e-01, -3.2000e-01,  1.6285e-01, -9.6721e-01,
          8.0904e-01, -4.3488e-01, -5.2084e-01, -3.0309e-01, -2.9167e-01,
         -2.0697e-01,  2.6938e-01,  6.5628e-01, -6.7765e-01, -3.0446e-01,
          1.2397e+00,  4.9944e-01,  7.6995e-01, -2.0611e-01,  4.7138e-01,
         -1.2173e-01,  2.2105e-01, -1.7483e-01, -3.7982e-01, -4.4242e-01,
          2.8855e-01, -2.1307e-01,  2.2410e-01, -8.2249e-01, -1.9172e-01,
          6.2317e-02, -1.4458e-01, -6.6763e-01, -4.327

<h3>Loading SBERT</h3>

In [4]:
from sentence_transformers import SentenceTransformer, util

# Pretrained SBERT checkpoint with 768-dimensional output, the same width as BERT4RE.
# It gets its own variable so the BERT4RE checkpoint name stored in `model_name`
# by the earlier cell is not silently overwritten.
# NOTE(review): equal dimensionality does not by itself put SBERT and BERT4RE
# vectors in a shared embedding space - confirm that comparing them is intended.
sbert_model_name = "bert-base-nli-mean-tokens"
sbert = SentenceTransformer(sbert_model_name)

print("SBERT model loaded successfully.")

SBERT model loaded successfully.


<h3>Testing SBERT</h3>

In [5]:
# Two short probe sentences used as a smoke test for the SBERT encoder
sentences = [
    "This is a test sentence.",
    "Sentence embeddings are useful for NLP tasks.",
]

# Encode the whole list in one call; the result has one row per sentence
embeddings = sbert.encode(sentences)

# Report the array shape first, then the raw values
print("Embedding shape:", embeddings.shape)
print("Embeddings:", embeddings)

Embedding shape: (2, 768)
Embeddings: [[ 0.0507936  -0.02515819  1.4253851  ... -0.79881257 -0.81836474
   0.15679334]
 [-0.319343    0.04131207  1.000955   ... -0.6872918  -1.4411713
   0.02773192]]


<h2>Data Preprocessing</h2>

<h3>Data Retrieval</h3>

In [6]:
# Read the promise requirements dataset from the local datasets/ folder
df = pd.read_csv(filepath_or_buffer="datasets/promise.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,INPUT,TYPE
0,The system shall refresh the display every 60 ...,PE
1,The application shall match the color of the s...,LF
2,If projected the data must be readable. On a...,US
3,The product shall be available during normal b...,A
4,If projected the data must be understandable....,US


<h3>Re-Labeling</h3>

In [7]:
# Collapse the label set to two classes: rows labelled 'F' are kept as they are,
# every other TYPE value is replaced by 'NF'
df['TYPE'] = df['TYPE'].where(df['TYPE'] == 'F', 'NF')

df.head(n=20)

Unnamed: 0,INPUT,TYPE
0,The system shall refresh the display every 60 ...,NF
1,The application shall match the color of the s...,NF
2,If projected the data must be readable. On a...,NF
3,The product shall be available during normal b...,NF
4,If projected the data must be understandable....,NF
5,The product shall ensure that it can only be a...,NF
6,The product shall be intuitive and self-explan...,NF
7,The product shall respond fast to keep up-to-d...,NF
8,The system shall have a MDI form that allows f...,F
9,The system shall display Events in a vertical ...,F


<h3>Lowercasing</h3>

In [8]:
# Lowercase every requirement text through the vectorised string accessor
df = df.assign(INPUT=df['INPUT'].str.lower())

df.head(n=5)

Unnamed: 0,INPUT,TYPE
0,the system shall refresh the display every 60 ...,NF
1,the application shall match the color of the s...,NF
2,if projected the data must be readable. on a...,NF
3,the product shall be available during normal b...,NF
4,if projected the data must be understandable....,NF


<h3>Stopword Removal</h3>

In [9]:
import nltk
from nltk.corpus import stopwords

# Download the stop-word corpus only when it is not installed yet, so the cell
# also works after Restart & Run All on a fresh environment.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    """Return `text` without its stop words.

    The text is split on whitespace; a token is dropped when its lowercase
    form is a member of the module-level `stop_words` collection. The
    surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue  # stop word: leave it out
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stop words from every requirement text, element by element
df['INPUT'] = df['INPUT'].map(remove_stopwords)

df.head(n=5)

Unnamed: 0,INPUT,TYPE
0,system shall refresh display every 60 seconds.,NF
1,application shall match color schema set forth...,NF
2,projected data must readable. 10x10 projection...,NF
3,product shall available normal business hours....,NF
4,projected data must understandable. 10x10 proj...,NF


<h3>Punctuation Removal</h3>

In [10]:
import string

# Spell out the percent sign so the quantity survives the punctuation stripping below.
# regex=False makes this a literal replacement on every pandas version.
df['INPUT'] = df['INPUT'].str.replace('%', ' percent', regex=False)

# Build the deletion table once instead of once per row
punctuation_table = str.maketrans('', '', string.punctuation)

# Delete all ASCII punctuation from the INPUT column; non-string cells pass through unchanged
df['INPUT'] = df['INPUT'].apply(lambda x: x.translate(punctuation_table) if isinstance(x, str) else x)

df.head(20)

Unnamed: 0,INPUT,TYPE
0,system shall refresh display every 60 seconds,NF
1,application shall match color schema set forth...,NF
2,projected data must readable 10x10 projection ...,NF
3,product shall available normal business hours ...,NF
4,projected data must understandable 10x10 proje...,NF
5,product shall ensure accessed authorized users...,NF
6,product shall intuitive selfexplanatory 90 pe...,NF
7,product shall respond fast keep uptodate data ...,NF
8,system shall mdi form allows viewing graph dat...,F
9,system shall display events vertical table time,F


<h3>Tokenize Dataset</h3>

In [11]:
# Run the BERT4RE tokenizer over every preprocessed requirement in one call
tokenized_dataset = tokenizer(
    list(df['INPUT']),
    return_tensors="pt",    # PyTorch tensors rather than Python lists
    max_length=512,         # upper bound on the sequence length
    truncation=True,        # cut sequences that exceed max_length
    padding=True,           # pad so all sequences share one length
)

# Show the resulting input ids and attention masks (optional)
print(tokenized_dataset)

{'input_ids': tensor([[    0, 11873,  1199,  ...,     1,     1,     1],
        [    0, 31867,  1199,  ...,     1,     1,     1],
        [    0,  3887,  2331,  ...,     1,     1,     1],
        ...,
        [    0,   821,    86,  ...,     1,     1,     1],
        [    0, 16171,   754,  ...,     1,     1,     1],
        [    0, 17655,   382,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


<h2>Model Fitting</h2>

<h3>Label Descriptions</h3>

In [12]:
# Natural-language descriptions of the two requirement classes.
# Each class carries several alternative phrasings.
label_descriptions = {}

# Functional: what the system must do
label_descriptions["Functional"] = [
    "Specifies the tasks or actions the system must perform to meet user objectives, such as processing data or responding to inputs.",
    "Defines interactions between users and the system, focusing on activities like data input, processing, and system responses.",
    "Describes functions the system must execute to provide its core features, enabling users to complete their goals.",
    "Outlines precise operations the system must carry out, detailing steps to complete processes or respond to user commands.",
    "Lists functions that directly support user needs, such as generating reports, calculating results, or saving information.",
    "States requirements for actions the system must perform, including data validation, transaction handling, and workflow management.",
    "Defines specific behaviors or responses expected from the system based on various user inputs and interactions.",
]

# Non-Functional: quality attributes and constraints
label_descriptions["Non-Functional"] = [
    "Specifies criteria for system performance, including processing speed, load handling, and response time under different conditions.",
    "Defines security standards, such as data encryption, access control, and user authentication to ensure data protection.",
    "Describes usability requirements, including user interface consistency, accessibility, and ease of navigation for end-users.",
    "Outlines reliability standards, such as system uptime, fault tolerance, and recovery mechanisms to ensure consistent operation.",
    "Specifies scalability requirements, including the system's ability to handle increased data volume or users without performance loss.",
    "Sets efficiency benchmarks, such as memory usage, storage requirements, and energy consumption limits for optimal resource use.",
    "Establishes compliance standards, specifying adherence to industry regulations or corporate guidelines to meet legal requirements.",
]

<h3>Label Embeddings</h3>

In [13]:
# Encode every description with SBERT, one call per description, keeping the
# same two keys as label_descriptions: label -> list of embedding vectors
label_embeddings = {
    label: [sbert.encode(text) for text in label_descriptions[label]]
    for label in ("Functional", "Non-Functional")
}

<h3>Preparing Data Loader & Tensor Dataset</h3>

In [14]:
from torch.utils.data import DataLoader, TensorDataset

# Integer ids for the two classes
label_mapping = {"F": 0, "NF": 1}

# Map the string labels to ids and fail loudly on anything unmapped;
# Series.map would otherwise turn such values into NaN without any warning.
mapped_labels = df['TYPE'].map(label_mapping)
if mapped_labels.isna().any():
    unexpected = sorted(df.loc[mapped_labels.isna(), 'TYPE'].astype(str).unique())
    raise ValueError(f"Unmapped TYPE values: {unexpected}")
numerical_labels = mapped_labels.astype(int).tolist()

# Prepare a TensorDataset with input IDs, attention masks and labels
input_ids = tokenized_dataset["input_ids"]
attention_mask = tokenized_dataset["attention_mask"]
labels = torch.tensor(numerical_labels)
dataset_torch = TensorDataset(input_ids, attention_mask, labels)

# The loader is only used for inference, so shuffling is off: batches come out
# in DataFrame row order and predictions can be lined up with df for error analysis.
dataloader = DataLoader(dataset_torch, batch_size=8, shuffle=False)

<h3>CLS Pooling & Cosine Similarity</h3>

In [20]:
def classify_requirements(hidden_states, label_embeddings):
    """Label each sequence in a batch as functional ("F") or non-functional ("NF").

    The first-token (CLS) vector of every sequence is compared by cosine
    similarity with all description embeddings of both labels. A sequence is
    labelled "F" only when its best "Functional" similarity is strictly greater
    than its best "Non-Functional" similarity; ties resolve to "NF".

    Args:
        hidden_states: Float tensor indexed as (batch, token, hidden); only
            token position 0 is read.
        label_embeddings: Mapping with the keys "Functional" and
            "Non-Functional", each holding a non-empty sequence of 1-D
            embedding vectors (numpy arrays or tensors) whose length equals
            the hidden size.

    Returns:
        A list with one "F"/"NF" string per sequence, in batch order.

    Raises:
        ValueError: If a label has no description embeddings and the batch is
            not empty.

    NOTE(review): at the call site in this notebook the hidden states come from
    BERT4RE while the label embeddings come from SBERT. Those are separately
    trained encoders, so cosine similarity across them may carry little signal -
    confirm that this pairing is intended.
    """
    # CLS pooling: keep only the first token's vector -> (batch_size, hidden_size)
    cls_embeddings = hidden_states[:, 0, :]
    if cls_embeddings.shape[0] == 0:
        return []

    # Unit-normalise once so that a plain matrix product yields cosine similarities
    cls_unit = torch.nn.functional.normalize(cls_embeddings, p=2, dim=1)

    def _max_similarity(label):
        # Best cosine similarity of every CLS vector against one label's descriptions.
        vectors = label_embeddings[label]
        if len(vectors) == 0:
            raise ValueError(f"No description embeddings provided for label '{label}'")
        # Stack the descriptions into one (n_descriptions, hidden_size) matrix on the
        # same device/dtype as the batch, instead of one cos_sim call per pair.
        label_matrix = torch.stack(
            [torch.as_tensor(vec).reshape(-1) for vec in vectors]
        ).to(device=cls_unit.device, dtype=cls_unit.dtype)
        label_unit = torch.nn.functional.normalize(label_matrix, p=2, dim=1)
        return (cls_unit @ label_unit.T).max(dim=1).values

    # Strict ">" keeps the original tie-breaking: equal scores are classified as "NF"
    is_functional = _max_similarity("Functional") > _max_similarity("Non-Functional")
    return ["F" if flag else "NF" for flag in is_functional.tolist()]

<h2>Evaluation & Testing</h2>

In [21]:
from sklearn.metrics import classification_report, accuracy_score

# Same class ids as used for the ground-truth labels in the data-loader cell
prediction_mapping = {"F": 0, "NF": 1}

true_labels = []       # ground-truth ids, collected batch by batch
all_predictions = []   # predicted "F"/"NF" strings, in the same order

# Make sure dropout is disabled even if an earlier cell switched the model to train mode
bert4re.eval()

# Iterate over the DataLoader to get embeddings and classify
for batch in dataloader:
    # Batch-local names: the module-level input_ids / attention_mask / labels
    # tensors created for the TensorDataset are no longer overwritten here.
    batch_input_ids, batch_attention_mask, batch_labels = batch
    batch_input_ids = batch_input_ids.to(device)
    batch_attention_mask = batch_attention_mask.to(device)

    with torch.no_grad():
        outputs = bert4re(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        hidden_states = outputs.last_hidden_state

    # Classify requirements in the batch
    predictions = classify_requirements(hidden_states, label_embeddings)
    all_predictions.extend(predictions)
    true_labels.extend(batch_labels.tolist())

# Convert string predictions to numerical values
numerical_predictions = [prediction_mapping[pred] for pred in all_predictions]

# Sanity check: exactly one prediction per ground-truth label
if len(true_labels) != len(numerical_predictions):
    print(f"Error: Mismatch in true_labels ({len(true_labels)}) and predictions ({len(numerical_predictions)})")
else:
    # Evaluate accuracy
    accuracy = accuracy_score(true_labels, numerical_predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Per-class report. Passing labels/target_names explicitly names the rows
    # "F"/"NF" instead of 0/1 and keeps both rows even if one class is absent.
    print(classification_report(
        true_labels,
        numerical_predictions,
        labels=list(prediction_mapping.values()),
        target_names=list(prediction_mapping.keys()),
    ))

Accuracy: 54.88%
              precision    recall  f1-score   support

           0       0.45      0.53      0.49       255
           1       0.63      0.56      0.60       370

    accuracy                           0.55       625
   macro avg       0.54      0.55      0.54       625
weighted avg       0.56      0.55      0.55       625

