In [1]:
import math
import re
import spacy
from random import randrange, shuffle, random, randint
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from datasets import load_dataset

# --------------------------------------------
# Data Preparation
# --------------------------------------------
# Load BookCorpus dataset from Hugging Face and take a 100k-sample subset
dataset = load_dataset("bookcorpus", split="train")
subset_100k = dataset.select(range(100000))
texts = subset_100k['text']

# Save texts to a file
with open("bookcorpus_subset.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text + "\n")

# Read raw text and process with spaCy
with open("bookcorpus_subset.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

nlp = spacy.load("en_core_web_sm")
def process_text_in_chunks(raw_text, chunk_size=1000000):
    chunks = [raw_text[i:i+chunk_size] for i in range(0, len(raw_text), chunk_size)]
    all_sentences = []
    for chunk in chunks:
        doc = nlp(chunk)
        sentences = list(doc.sents)
        all_sentences.extend(sentences)
    return all_sentences

sentences = process_text_in_chunks(raw_text)
sampled_sentences = sentences[:100000]  # limit to 100k sentences

# Clean the sentences
text = [re.sub(r"[.,!?\\-]", '', sentence.text.lower()) for sentence in sampled_sentences]

# Build vocabulary
word_list = list(set(" ".join(text).split()))
word2id = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
for i, word in enumerate(word_list):
    word2id[word] = i + 4
id2word = {i: word for i, word in enumerate(word2id)}
vocab_size = len(word2id)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 23068


In [2]:
# Tokenize sentences (convert words to ids)
token_list = []
for sentence in text:
    token_list.append([word2id[word] for word in sentence.split()])


In [3]:
# --------------------------------------------
# Batch generation function
# --------------------------------------------
batch_size = 6
max_mask = 5
max_len = 1000

def make_batch(batch_size=6, max_mask=5, max_len=1000):
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2:
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token not in (word2id['[CLS]'], word2id['[SEP]'])]
        shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], []
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            r = random()
            if r < 0.1:  # random token
                index = randint(0, vocab_size - 1)
                input_ids[pos] = word2id[id2word[index]]
            elif r < 0.9:  # replace with [MASK]
                input_ids[pos] = word2id['[MASK]']
            else:
                pass
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
        # For NSP: use a simple rule (here positive if consecutive indices)
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])
            negative += 1
    return batch

batch = make_batch(batch_size, max_mask, max_len)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))
print(input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext.shape)
assert input_ids.shape == torch.Size([6, max_len])
assert segment_ids.shape == torch.Size([6, max_len])
assert masked_tokens.shape == torch.Size([6, max_mask])
assert masked_pos.shape == torch.Size([6, max_mask])
assert isNext.shape == torch.Size([6])

torch.Size([6, 1000]) torch.Size([6, 1000]) torch.Size([6, 5]) torch.Size([6, 5]) torch.Size([6])


In [4]:
# --------------------------------------------
# Model components
# --------------------------------------------
n_layers = 6
n_heads  = 8
d_model  = 768
d_ff = d_model * 4
d_k = d_v = 64
n_segments = 2

class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(max_len, d_model)       # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)    # segment embedding
        self.norm = nn.LayerNorm(d_model)
    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device).unsqueeze(0).expand_as(x)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    len_k = seq_k.size(1)
    pad_attn_mask = seq_k.eq(0).unsqueeze(1)  # 1 for PAD tokens
    return pad_attn_mask.expand(batch_size, len_q, len_k)

class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()
    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        scores.masked_fill_(attn_mask, -1e9)
        attn = F.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        return context, attn

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
    def forward(self, Q, K, V, attn_mask):
        residual = Q
        batch_size = Q.size(0)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        output = self.linear(context)
        output = self.layer_norm(output + residual)
        return output, attn

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        return self.fc2(F.gelu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()
    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs, attn


In [5]:
# --------------------------------------------
# BERT Model Definition
# --------------------------------------------
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        # For NSP and MLM
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # Share weights for MLM: tie decoder to token embedding weights
        embed_weight = self.embedding.tok_embed.weight
        self.decoder = nn.Linear(d_model, vocab_size, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(vocab_size))
    def forward(self, input_ids, segment_ids, masked_pos, attention_mask=None):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        if attention_mask is not None:
            enc_self_attn_mask = enc_self_attn_mask * attention_mask.unsqueeze(1).unsqueeze(2)
        for layer in self.layers:
            output, _ = layer(output, enc_self_attn_mask)
        # Next Sentence Prediction (use [CLS] token representation)
        h_pooled = self.activ(self.fc(output[:, 0]))  # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled)         # [batch_size, 2]
        # Masked Language Modeling: gather output for masked positions
        masked_pos_expanded = masked_pos[:, :, None].expand(-1, -1, output.size(-1))
        h_masked = torch.gather(output, 1, masked_pos_expanded)
        h_masked = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias
        return logits_lm, logits_nsp

    def get_sentence_embedding(self, input_ids, segment_ids):
        """
        Returns the pooled [CLS] representation that can be used as the sentence embedding.
        """
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, _ = layer(output, enc_self_attn_mask)
        h_pooled = self.activ(self.fc(output[:, 0]))
        return h_pooled

In [6]:
# --------------------------------------------
# Training Loop
# --------------------------------------------
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 300
best_loss = float('inf')
start_time = torch.cuda.Event(enable_timing=True)
end_time = torch.cuda.Event(enable_timing=True)

print("Starting training...")
for epoch in range(num_epochs):
    batch = make_batch(batch_size, max_mask, max_len)
    input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))
    optimizer.zero_grad()
    logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)
    loss_lm = loss_lm.mean()
    loss_nsp = criterion(logits_nsp, isNext)
    loss = loss_lm + loss_nsp
    if epoch % 100 == 0:
        print(f'Epoch: {epoch:03d} loss = {loss.item():.6f}')
    loss.backward()
    optimizer.step()
    if loss.item() < best_loss:
        best_loss = loss.item()
        torch.save(model.state_dict(), 'bert_model_for_sbert.pth')
        print(f'Best model saved at epoch {epoch} with loss {best_loss:.6f}')
print("Training completed.")

Starting training...
Epoch: 000 loss = 107.271065
Best model saved at epoch 0 with loss 107.271065
Best model saved at epoch 9 with loss 103.135223
Best model saved at epoch 10 with loss 101.163689
Best model saved at epoch 11 with loss 91.795013
Best model saved at epoch 13 with loss 83.215134
Best model saved at epoch 15 with loss 72.218376
Best model saved at epoch 18 with loss 67.562126
Best model saved at epoch 19 with loss 56.529549
Best model saved at epoch 26 with loss 53.223518
Best model saved at epoch 28 with loss 42.277283
Best model saved at epoch 36 with loss 42.113510
Best model saved at epoch 38 with loss 41.771416
Best model saved at epoch 40 with loss 39.090973
Best model saved at epoch 41 with loss 35.485992
Best model saved at epoch 52 with loss 31.839205
Best model saved at epoch 59 with loss 31.565279
Best model saved at epoch 60 with loss 30.824751
Best model saved at epoch 61 with loss 29.999926
Best model saved at epoch 72 with loss 29.092245
Best model saved a

In [7]:
# Example: Using the saved model to extract sentence embeddings
# (Task 2 can now load this state_dict into an identical BERT instance and call get_sentence_embedding)
loaded_model = BERT()
loaded_model.load_state_dict(torch.load('bert_model_for_sbert.pth'))
loaded_model.eval()
# Example dummy input (using first sentence from token_list)
dummy_input_ids = torch.LongTensor([ [word2id['[CLS]']] + token_list[0][:max_len-2] + [word2id['[SEP]']] + [0]*(max_len - (len(token_list[0]) + 2)) ])
dummy_segment_ids = torch.zeros_like(dummy_input_ids)
sentence_embedding = loaded_model.get_sentence_embedding(dummy_input_ids, dummy_segment_ids)
print("Extracted sentence embedding shape:", sentence_embedding.shape)


Extracted sentence embedding shape: torch.Size([1, 768])


In [1]:
##TASK 2

In [6]:
# Load the trained BERT model
model = BERT()  # Your BERT class from Task 1
model.load_state_dict(torch.load('bert_model_for_sbert.pth'))
model.eval()  # Set the model to evaluation mode

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(23068, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
        (linear): Linear(in_features=512, out_features=768, bias=True)
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bi

In [14]:
from datasets import load_dataset

# Load the SNLI dataset
dataset = load_dataset("snli")
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

In [15]:
def preprocess_snli(data):
    sentences1 = []
    sentences2 = []
    labels = []
    for example in data:
        if example["label"] != -1:  # Filter out invalid examples
            sentences1.append(example["premise"])
            sentences2.append(example["hypothesis"])
            labels.append(example["label"])
    return sentences1, sentences2, labels

train_sentences1, train_sentences2, train_labels = preprocess_snli(train_data)
val_sentences1, val_sentences2, val_labels = preprocess_snli(val_data)
test_sentences1, test_sentences2, test_labels = preprocess_snli(test_data)

In [16]:
def tokenize_sentences(sentences, word2id, max_len=100):
    tokenized = []
    for sentence in sentences:
        tokens = [word2id.get(word, word2id["[PAD]"]) for word in sentence.split()]
        tokens = tokens[:max_len] + [word2id["[PAD]"]] * (max_len - len(tokens))  # Pad to max_len
        tokenized.append(tokens)
    return torch.LongTensor(tokenized)

max_len = 100  # Maximum sequence length
train_tokens1 = tokenize_sentences(train_sentences1, word2id, max_len)
train_tokens2 = tokenize_sentences(train_sentences2, word2id, max_len)
val_tokens1 = tokenize_sentences(val_sentences1, word2id, max_len)
val_tokens2 = tokenize_sentences(val_sentences2, word2id, max_len)
test_tokens1 = tokenize_sentences(test_sentences1, word2id, max_len)
test_tokens2 = tokenize_sentences(test_sentences2, word2id, max_len)

In [17]:
class SentenceBERT(nn.Module):
    def __init__(self, bert_model):
        super(SentenceBERT, self).__init__()
        self.bert = bert_model
        self.classifier = nn.Linear(d_model * 3, 3)  # 3 classes for SNLI (entailment, neutral, contradiction)

    def forward(self, input_ids1, segment_ids1, input_ids2, segment_ids2):
        # Get sentence embeddings for both sentences
        u = self.bert.get_sentence_embedding(input_ids1, segment_ids1)  # [batch_size, d_model]
        v = self.bert.get_sentence_embedding(input_ids2, segment_ids2)  # [batch_size, d_model]

        # Compute the element-wise absolute difference
        diff = torch.abs(u - v)  # [batch_size, d_model]

        # Concatenate u, v, and |u - v|
        combined = torch.cat([u, v, diff], dim=1)  # [batch_size, d_model * 3]

        # Pass through the classifier
        logits = self.classifier(combined)  # [batch_size, 3]
        return logits

In [None]:
# Initialize the Sentence-BERT model
sbert_model = SentenceBERT(model).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(sbert_model.parameters(), lr=0.001)

# Training loop
num_epochs = 3
batch_size = 32
best_val_loss = float('inf')  # Track the best validation loss

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    sbert_model.train()
    total_loss = 0
    num_batches = 0

    # Training phase
    for i in range(0, len(train_tokens1), batch_size):
        # Get a batch of data
        input_ids1 = train_tokens1[i:i+batch_size].to(device)
        input_ids2 = train_tokens2[i:i+batch_size].to(device)
        labels = torch.LongTensor(train_labels[i:i+batch_size]).to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        logits = sbert_model(input_ids1, torch.zeros_like(input_ids1), input_ids2, torch.zeros_like(input_ids2))
        loss = criterion(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate loss
        total_loss += loss.item()
        num_batches += 1

    # Calculate average training loss for the epoch
    avg_train_loss = total_loss / num_batches
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    sbert_model.eval()
    val_loss = 0
    val_batches = 0

    with torch.no_grad():
        for i in range(0, len(val_tokens1), batch_size):
            input_ids1 = val_tokens1[i:i+batch_size].to(device)
            input_ids2 = val_tokens2[i:i+batch_size].to(device)
            labels = torch.LongTensor(val_labels[i:i+batch_size]).to(device)

            # Forward pass
            logits = sbert_model(input_ids1, torch.zeros_like(input_ids1), input_ids2, torch.zeros_like(input_ids2))
            loss = criterion(logits, labels)

            # Accumulate validation loss
            val_loss += loss.item()
            val_batches += 1

    # Calculate average validation loss for the epoch
    avg_val_loss = val_loss / val_batches
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Save the best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(sbert_model.state_dict(), 'best_sbert_model.pth')
        print(f"Best model saved with Validation Loss: {best_val_loss:.4f}")

    print("-" * 50)

print("Training completed.")

In [None]:
#I have completed the training for above task and saved the model but accidently i ran this again and the ouput dissapeared.

In [None]:
#TASK 3

In [18]:
# Load the best Sentence-BERT model
sbert_model = SentenceBERT(model).to(device)
sbert_model.load_state_dict(torch.load('best_sbert_model.pth'))
sbert_model.eval()  # Set the model to evaluation mode

SentenceBERT(
  (bert): BERT(
    (embedding): Embedding(
      (tok_embed): Embedding(23068, 768)
      (pos_embed): Embedding(1000, 768)
      (seg_embed): Embedding(2, 768)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (enc_self_attn): MultiHeadAttention(
          (W_Q): Linear(in_features=768, out_features=512, bias=True)
          (W_K): Linear(in_features=768, out_features=512, bias=True)
          (W_V): Linear(in_features=768, out_features=512, bias=True)
          (linear): Linear(in_features=512, out_features=768, bias=True)
          (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (pos_ffn): PoswiseFeedForwardNet(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (fc): Linear(in_features=768, out_features=768, bias=True)
    (a

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Function to evaluate the model
def evaluate_model(test_tokens1, test_tokens2, test_labels, batch_size=8):
    predictions = []
    true_labels = []

    with torch.no_grad():
        for i in range(0, len(test_tokens1), batch_size):
            # Get a batch of data
            input_ids1 = test_tokens1[i:i+batch_size].to(device)
            input_ids2 = test_tokens2[i:i+batch_size].to(device)
            labels = torch.LongTensor(test_labels[i:i+batch_size]).to(device)

            # Forward pass
            logits = sbert_model(input_ids1, torch.zeros_like(input_ids1), input_ids2, torch.zeros_like(input_ids2))
            _, preds = torch.max(logits, dim=1)

            # Store predictions and true labels
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')
    f1 = f1_score(true_labels, predictions, average='weighted')

    # Classification report
    print("Classification Report:")
    print(classification_report(true_labels, predictions, target_names=['entailment', 'neutral', 'contradiction']))

    return accuracy, precision, recall, f1

# Evaluate on the test set
accuracy, precision, recall, f1 = evaluate_model(test_tokens1, test_tokens2, test_labels)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Classification Report:
               precision    recall  f1-score   support

   entailment       0.00      0.00      0.00      3368
      neutral       0.00      0.00      0.00      3219
contradiction       0.33      1.00      0.50      3237

     accuracy                           0.33      9824
    macro avg       0.11      0.33      0.17      9824
 weighted avg       0.11      0.33      0.16      9824

Accuracy: 0.3295
Precision: 0.1086
Recall: 0.3295
F1-Score: 0.1633


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#The classification report shows that the model struggles with classifying "entailment" and "neutral" labels, 
#as both have zero precision, recall, and F1 scores, indicating poor performance on these classes. It performs well on 
#"contradiction" with a perfect recall (1.00) but low precision (0.33), resulting in a low F1 score of 0.50. Overall, 
#the model's accuracy is 32.95%, with poor performance on the minority classes, as seen in the macro and weighted averages. 
#This suggests the need for improvements, such as addressing class imbalance and model optimization.

In [None]:
#TASK 3 PART 2

In [None]:
#The Sentence-BERT model for NLI tasks faces challenges such as class imbalance, limited dataset size, and high computational costs.
#Performance can also be sensitive to hyperparameters, and overfitting is a risk with small datasets. To improve, techniques like class balancing, 
#using larger datasets, leveraging pre-trained models, and hyperparameter tuning can enhance results. Regularization methods, advanced architectures
#like RoBERTa, and data augmentation can help with generalization, while using additional metrics like MCC can better assess performance. 
#Utilizing cloud-based GPUs and conducting error analysis will optimize training and refine the model.