# Hybrid Zero-Shot Learning Approach

In [11]:
import pandas as pd

<h2>Data Retrieval</h2>

In [12]:
# Read the PROMISE requirements dataset into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV carries a saved index; consider index_col=0 if nothing downstream needs it.
df = pd.read_csv(filepath_or_buffer="datasets/promise.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,INPUT,TYPE
0,The system shall refresh the display every 60 ...,PE
1,The application shall match the color of the s...,LF
2,If projected the data must be readable. On a...,US
3,The product shall be available during normal b...,A
4,If projected the data must be understandable....,US


<h2>Data Preprocessing</h2>

<h3>Lowercasing</h3>

In [13]:
# Normalise casing so later text matching is case-insensitive
lowercased_input = df['INPUT'].str.lower()
df['INPUT'] = lowercased_input

# Preview the lowercased text
df.head(n=5)

Unnamed: 0,INPUT,TYPE
0,the system shall refresh the display every 60 ...,PE
1,the application shall match the color of the s...,LF
2,if projected the data must be readable. on a...,US
3,the product shall be available during normal b...,A
4,if projected the data must be understandable....,US


<h3>Stopword Removal</h3>

In [14]:
import nltk
from nltk.corpus import stopwords

# Define stop words.
# The NLTK corpus raises LookupError when the 'stopwords' resource has not been
# downloaded yet; fetch it on demand so the notebook runs on a fresh environment
# instead of relying on a manually un-commented download line.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated stopword dropped.

    Tokens are compared case-insensitively against the module-level
    ``stop_words`` set and the surviving tokens are re-joined with single
    spaces. Matching is done on whole whitespace-delimited tokens, so a token
    with punctuation attached (e.g. ``"the."``) is only removed if that exact
    token is in ``stop_words``.

    Non-string values (e.g. NaN from missing cells) are returned unchanged,
    mirroring the ``isinstance`` guard used by the punctuation-removal step.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Run every requirement text through remove_stopwords, element by element
df['INPUT'] = df['INPUT'].map(remove_stopwords)

# Preview the filtered text
df.head(n=5)

Unnamed: 0,INPUT,TYPE
0,system shall refresh display every 60 seconds.,PE
1,application shall match color schema set forth...,LF
2,projected data must readable. 10x10 projection...,US
3,product shall available normal business hours....,A
4,projected data must understandable. 10x10 proj...,US


<h3>Punctuation Removal</h3>

In [15]:
import string

# Spell out the percent symbol so the quantity survives punctuation removal.
# regex=False pins literal matching; the default of `regex` differs between
# pandas versions.
df['INPUT'] = df['INPUT'].str.replace('%', ' percent', regex=False)

# Build the translation table once instead of once per row; it deletes every
# character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

# Remove punctuation from the 'INPUT' column; non-string cells are left as-is
df['INPUT'] = df['INPUT'].apply(lambda x: x.translate(punctuation_table) if isinstance(x, str) else x)

df.head(20)

Unnamed: 0,INPUT,TYPE
0,system shall refresh display every 60 seconds,PE
1,application shall match color schema set forth...,LF
2,projected data must readable 10x10 projection ...,US
3,product shall available normal business hours ...,A
4,projected data must understandable 10x10 proje...,US
5,product shall ensure accessed authorized users...,SE
6,product shall intuitive selfexplanatory 90 pe...,US
7,product shall respond fast keep uptodate data ...,PE
8,system shall mdi form allows viewing graph dat...,F
9,system shall display events vertical table time,F


<h2>Loading Pre-Trained Models</h2>

<h3>Loading BERT4RE</h3>

In [16]:
from transformers import AutoTokenizer, AutoModel

# Hugging Face identifier of the BERT4RE checkpoint (swap in another name or a
# local path if needed)
model_name = "thearod5/bert4re"

# The encoder weights and the tokenizer are both loaded from the same checkpoint
bert4re = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Model and tokenizer loaded successfully.")

Some weights of RobertaModel were not initialized from the model checkpoint at thearod5/bert4re and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded successfully.


<h3>Testing BERT4RE</h3>

In [20]:
import torch

# Move model to CPU or GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert4re.to(device)

# Prepare a sample sentence and encode it
input_text = "John works at OpenAI as a researcher."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

# Pass the input through the model
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = bert4re(**inputs)

# Extract the embeddings (hidden states of the last layer)
embeddings = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# Sentence-level embedding via masked mean pooling: weight each token by its
# attention-mask value so padding positions contribute nothing. A plain
# .mean(dim=1) would average padding vectors in as soon as a batch contains
# sentences of different lengths.
attention_mask = inputs["attention_mask"].unsqueeze(-1).to(embeddings.dtype)  # (batch_size, sequence_length, 1)
token_counts = attention_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
sentence_embedding = (embeddings * attention_mask).sum(dim=1) / token_counts  # Shape: (batch_size, hidden_size)

# Print the output shape and sentence embedding
print("Embedding shape:", sentence_embedding.shape)
print("Sentence embedding:", sentence_embedding)

Embedding shape: torch.Size([1, 768])
Sentence embedding: tensor([[-6.8532e-01, -2.0724e-01,  4.6072e-01,  3.8424e-01, -8.1794e-01,
          6.2162e-02,  1.6305e-01,  6.3627e-01, -6.2024e-01,  2.3580e-01,
         -4.1779e-01,  4.5084e-02,  1.0638e+00, -5.3630e-02, -6.2707e-01,
         -2.7198e-01,  5.0179e-01,  6.1894e-01, -5.1897e-01, -5.8201e-01,
          4.4262e-01, -2.5631e-01,  1.5594e-01, -1.6746e-01, -7.8109e-01,
         -2.5813e-01, -9.9017e-01, -8.3839e-01, -4.7253e-01, -7.3076e-01,
          1.2344e-01,  6.4781e-01, -3.2000e-01,  1.6285e-01, -9.6721e-01,
          8.0904e-01, -4.3488e-01, -5.2084e-01, -3.0309e-01, -2.9167e-01,
         -2.0697e-01,  2.6938e-01,  6.5628e-01, -6.7765e-01, -3.0446e-01,
          1.2397e+00,  4.9944e-01,  7.6995e-01, -2.0611e-01,  4.7138e-01,
         -1.2173e-01,  2.2105e-01, -1.7483e-01, -3.7982e-01, -4.4242e-01,
          2.8855e-01, -2.1307e-01,  2.2410e-01, -8.2249e-01, -1.9172e-01,
          6.2317e-02, -1.4458e-01, -6.6763e-01, -4.327

<h3>Loading SBERT</h3>

In [18]:
from sentence_transformers import SentenceTransformer

# Checkpoint name of the pretrained SBERT encoder; other options include
# 'stsb-roberta-base' or 'paraphrase-MiniLM-L12-v2'.
# NOTE(review): this rebinds `model_name`, which the BERT4RE cell also assigns.
model_name = "all-MiniLM-L6-v2"

# Instantiate the sentence encoder from that checkpoint
sbert = SentenceTransformer(model_name)

print("SBERT model loaded successfully.")

SBERT model loaded successfully.


<h3>Testing SBERT</h3>

In [19]:
# Two sample sentences used as a smoke test for the encoder
sentences = [
    "This is a test sentence.",
    "Sentence embeddings are useful for NLP tasks.",
]

# Encode the sentences with SBERT.
# NOTE(review): this rebinds `embeddings`, which the BERT4RE test cell also assigns.
embeddings = sbert.encode(sentences)

# Report the shape of the result and the raw values
print("Embedding shape:", embeddings.shape)
print("Embeddings:", embeddings)

Embedding shape: (2, 384)
Embeddings: [[ 8.42964873e-02  5.79537377e-02  4.49330732e-03  1.05821118e-01
   7.08337268e-03 -1.78447124e-02 -1.68880336e-02 -1.52283423e-02
   4.04730998e-02  3.34225111e-02  1.04327604e-01 -4.70358431e-02
   6.88471505e-03  4.10179794e-02  1.87119059e-02 -4.14923243e-02
   2.36474331e-02 -5.65018207e-02 -3.36961895e-02  5.09910136e-02
   6.93032742e-02  5.47842607e-02 -9.78837349e-03  2.36972161e-02
   1.99964996e-02  9.71729029e-03 -5.88992164e-02  7.30741676e-03
   4.70264815e-02 -4.51004971e-03 -5.57997487e-02 -4.15946264e-03
   6.47570863e-02  4.80763353e-02  1.70207415e-02 -3.18335625e-03
   5.74023910e-02  3.52318846e-02 -5.88387996e-03  1.48329055e-02
   1.15763173e-02 -1.07480787e-01  1.91041678e-02  2.20856871e-02
   1.08645335e-02  3.78199364e-03 -3.19403633e-02  1.07277809e-02
  -4.84222732e-03 -2.83362102e-02 -5.25735617e-02 -7.05868378e-02
  -5.75558245e-02 -1.36329243e-02  5.68215596e-03  2.30746213e-02
   3.56977880e-02  1.49984248e-02  4.9