In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dt_pairs/pytorch/default/1/final_model.pth
/kaggle/input/test-review-pairs/review_pairs_dataset.csv
/kaggle/input/review-pairs-dataset/dnnlp_project_dataset.csv


In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [3]:
torch.cuda.empty_cache()

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [14]:
import torch
import torch.nn as nn
from transformers import AutoModel

class CrossEncoderCSR(nn.Module):
    def __init__(self, model_name):
        super(CrossEncoderCSR, self).__init__()
        # Initialize encoder
        self.encoder = AutoModel.from_pretrained(model_name)
        self.encoder_config = self.encoder.config

        # Check for layer access compatibility
        if hasattr(self.encoder, "encoder"):  # For BERT and similar models
            self.layers = self.encoder.encoder.layer
        elif hasattr(self.encoder, "transformer"):  # For DistilBERT
            self.layers = self.encoder.transformer.layer
        else:
            raise AttributeError("The model type is not supported for layer extraction.")
        
        self.total_layers = len(self.layers)
        self.d_k = self.encoder_config.hidden_size

    def forward(self, s1_input_ids, s1_attention_mask, s2_input_ids, s2_attention_mask, c_input_ids, c_attention_mask):
        device = s1_input_ids.device
        
        # Encode context
        c_output = self.encoder(c_input_ids, attention_mask=c_attention_mask, output_hidden_states=True)
        c_hidden_state = c_output.last_hidden_state  # Shape: (B, L_c, H)

        # Obtain query vector for the condition (last hidden state pooled)
        q_c = c_hidden_state[:, 0, :]  # Use [CLS] token as the query vector (B, H)

        # Start with token embeddings for s1 and s2
        s1_hidden_state = self.encoder.embeddings(s1_input_ids).to(device)
        s2_hidden_state = self.encoder.embeddings(s2_input_ids).to(device)

        # Preprocess attention masks
        s1_extended_mask = self._get_extended_attention_mask(s1_attention_mask, s1_input_ids.size(), device)
        s2_extended_mask = self._get_extended_attention_mask(s2_attention_mask, s2_input_ids.size(), device)

        # Process up to the t-th layer
        for i, layer in enumerate(self.layers):
            s1_hidden_state = layer(s1_hidden_state, s1_extended_mask)[0]
            s2_hidden_state = layer(s2_hidden_state, s2_extended_mask)[0]

            if i >= self.total_layers // 2:  # Introduce router from t-th layer onwards
                k_s1 = s1_hidden_state
                k_s2 = s2_hidden_state

                # Compute router weights
                s1_w_t = self.get_c_router_weights(q_c, k_s1)
                s2_w_t = self.get_c_router_weights(q_c, k_s2)

                # Adjust multi-head attention hidden state
                s1_hidden_state = (1 + s1_w_t.unsqueeze(-1)) * s1_hidden_state
                s2_hidden_state = (1 + s2_w_t.unsqueeze(-1)) * s2_hidden_state

        # Average pooling for final sentence representation
        s1_representation = s1_hidden_state.mean(dim=1)  # Shape: (B, H)
        s2_representation = s2_hidden_state.mean(dim=1)  # Shape: (B, H)

        return s1_representation, s2_representation

    def get_c_router_weights(self, q_c, k_s):
        q_c = q_c.unsqueeze(1)  # Shape: (B, 1, H)
        scores = torch.matmul(q_c, k_s.transpose(-2, -1))  # Shape: (B, 1, L_s)
        scores = scores / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32).to(q_c.device))  # Scale scores
        weights = nn.functional.softmax(scores, dim=-1)  # Shape: (B, 1, L_s)
        return weights.squeeze(1)  # Shape: (B, L_s)

    def _get_extended_attention_mask(self, attention_mask, input_shape, device):
        """
        Prepare the attention mask for use in transformer layers.
        """
        extended_attention_mask = attention_mask[:, None, None, :]  # (B, 1, 1, L)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0  # Invert and scale
        return extended_attention_mask.to(device)


In [15]:
# Tokenizer initialization
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Custom Dataset class
class ContradictionDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        s1, s2, aspect, label = row['sentence1'], row['sentence2'], row['aspect'], row['label']

        # Tokenize inputs
        s1_tokens = self.tokenizer(s1, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt").to("cuda")
        s2_tokens = self.tokenizer(s2, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt").to("cuda")
        aspect_tokens = self.tokenizer(aspect, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt").to("cuda")

        # Map label to binary value
        label = 1 if label == "c" else 0

        return {
            "s1_input_ids": s1_tokens["input_ids"].squeeze(0),
            "s1_attention_mask": s1_tokens["attention_mask"].squeeze(0),
            "s2_input_ids": s2_tokens["input_ids"].squeeze(0),
            "s2_attention_mask": s2_tokens["attention_mask"].squeeze(0),
            "aspect_input_ids": aspect_tokens["input_ids"].squeeze(0),
            "aspect_attention_mask": aspect_tokens["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }




In [16]:
df = pd.read_csv("/kaggle/input/review-pairs-dataset/dnnlp_project_dataset.csv")
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,paper_id,pair_id,hypothesis,premise,aspect,s1,s2,line_pair,label
0,0,0,ICLR_2019_1401,3,further the paper makes several misleading cla...,the paper is rather well written but it strong...,clarity,positive,negative,"(6, 2)",n
1,1,1,NIPS_2016_89,3,4 .i like the key idea and the speedup is very...,review scores reflect this reviewers impressio...,originality,negative,positive,"(5, 20)",n
2,2,2,NIPS_2016_89,4,the idea to use sampling is nice but the analy...,review scores reflect this reviewers impressio...,originality,negative,positive,"(5, 18)",n
3,3,3,NIPS_2016_89,5,to summarize i think this paper give some empi...,in my opinion the overall quality of the paper...,soundness,positive,negative,"(4, 10)",n
4,4,4,NIPS_2016_89,5,to summarize i think this paper give some empi...,the context and relevance as well as the contr...,soundness,positive,negative,"(5, 10)",n


In [17]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'paper_id', 'pair_id', 'hypothesis',
       'premise', 'aspect', 's1', 's2', 'line_pair', 'label'],
      dtype='object')

In [18]:
final_df = df[['hypothesis', 'premise', 'aspect', 'label']]
final_df.rename(columns={'hypothesis': 'sentence1', 'premise': 'sentence2'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'hypothesis': 'sentence1', 'premise': 'sentence2'}, inplace=True)


In [19]:
final_df.head()

Unnamed: 0,sentence1,sentence2,aspect,label
0,further the paper makes several misleading cla...,the paper is rather well written but it strong...,clarity,n
1,4 .i like the key idea and the speedup is very...,review scores reflect this reviewers impressio...,originality,n
2,the idea to use sampling is nice but the analy...,review scores reflect this reviewers impressio...,originality,n
3,to summarize i think this paper give some empi...,in my opinion the overall quality of the paper...,soundness,n
4,to summarize i think this paper give some empi...,the context and relevance as well as the contr...,soundness,n


In [20]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import AdamW
from sklearn.model_selection import train_test_split

# Hyperparameters
BATCH_SIZE = 3
EPOCHS = 3
LEARNING_RATE = 2e-5
MAX_LENGTH = 128

# Split data into train and validation sets
train_data, val_data = train_test_split(final_df, test_size=0.2, random_state=42)

# Create datasets and loaders
train_dataset = ContradictionDataset(train_data, tokenizer, max_length=MAX_LENGTH)
val_dataset = ContradictionDataset(val_data, tokenizer, max_length=MAX_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Model, loss function, and optimizer
model = CrossEncoderCSR(model_name)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)



In [24]:
import torch
import os
from tqdm import tqdm  # Import tqdm for progress tracking

def train_model(
    model, train_loader, val_loader, optimizer, criterion, epochs, device, 
    save_path="model_checkpoints", save_interval=1, checkpoint_path=None
):
    # Create directory for saving models
    os.makedirs(save_path, exist_ok=True)

    # Initialize epoch and loss for resuming training
    start_epoch = 0

    # Load checkpoint if provided
    if checkpoint_path and os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}...")
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print(f"Resumed training from epoch {start_epoch}")

    for epoch in range(start_epoch, epochs):
        model.train()
        total_loss = 0
        
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_progress = tqdm(train_loader, desc="Training", leave=False)  # Training progress bar
        
        for batch in train_progress:
            optimizer.zero_grad()
            
            # Move batch to device
            s1_input_ids = batch["s1_input_ids"].to(device)
            s1_attention_mask = batch["s1_attention_mask"].to(dtype=torch.float).to(device)
            s2_input_ids = batch["s2_input_ids"].to(device)
            s2_attention_mask = batch["s2_attention_mask"].to(dtype=torch.float).to(device)
            aspect_input_ids = batch["aspect_input_ids"].to(device)
            aspect_attention_mask = batch["aspect_attention_mask"].to(dtype=torch.float).to(device)
            labels = batch["label"].to(device)
            
            # Forward pass
            s1_rep, s2_rep = model(
                s1_input_ids, s1_attention_mask, 
                s2_input_ids, s2_attention_mask, 
                aspect_input_ids, aspect_attention_mask
            )
            
            # Compute similarity
            logits = torch.cosine_similarity(s1_rep, s2_rep, dim=1).unsqueeze(1)
            logits = torch.cat([1 - logits, logits], dim=1)  # Binary classification logits
            
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            train_progress.set_postfix(loss=loss.item())  # Update progress bar
        
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")
        
        # Save checkpoint at specified intervals
        if (epoch + 1) % save_interval == 0:
            checkpoint_file = os.path.join(save_path, f"model_epoch_{epoch + 1}.pth")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, checkpoint_file)
            print(f"Checkpoint saved at {checkpoint_file}")
        
        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        val_progress = tqdm(val_loader, desc="Validation", leave=False)  # Validation progress bar
        
        with torch.no_grad():
            for batch in val_progress:
                s1_input_ids = batch["s1_input_ids"].to(device)
                s1_attention_mask = batch["s1_attention_mask"].to(dtype=torch.float).to(device)
                s2_input_ids = batch["s2_input_ids"].to(device)
                s2_attention_mask = batch["s2_attention_mask"].to(dtype=torch.float).to(device)
                aspect_input_ids = batch["aspect_input_ids"].to(device)
                aspect_attention_mask = batch["aspect_attention_mask"].to(dtype=torch.float).to(device)
                labels = batch["label"].to(device)
                
                s1_rep, s2_rep = model(
                    s1_input_ids, s1_attention_mask, 
                    s2_input_ids, s2_attention_mask, 
                    aspect_input_ids, aspect_attention_mask
                )
                
                logits = torch.cosine_similarity(s1_rep, s2_rep, dim=1).unsqueeze(1)
                logits = torch.cat([1 - logits, logits], dim=1)
                
                loss = criterion(logits, labels)
                val_loss += loss.item()
                
                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                val_progress.set_postfix(val_loss=loss.item())  # Update progress bar
        
        accuracy = correct / len(val_loader.dataset)
        print(f"Validation Loss: {val_loss / len(val_loader)}, Accuracy: {accuracy}")
    
    # Save final model
    final_model_file = os.path.join(save_path, "final_model.pth")
    torch.save(model.state_dict(), final_model_file)
    print(f"Final model saved at {final_model_file}")

In [25]:
# Example usage
train_model(
    model, 
    train_loader, 
    val_loader, 
    optimizer, 
    criterion, 
    epochs=10, 
    device=device, 
    save_path="model_checkpoints", 
    save_interval=1, 
    checkpoint_path="model_epoch_1.pth"
)



Epoch 1/10


                                                                           

Epoch 1/10, Loss: 0.5199278740246389
Checkpoint saved at model_checkpoints/model_epoch_1.pth


                                                                               

Validation Loss: 0.5056030732573998, Accuracy: 0.7896821261073476

Epoch 2/10


                                                                           

Epoch 2/10, Loss: 0.5210800939279598
Checkpoint saved at model_checkpoints/model_epoch_2.pth


                                                                               

Validation Loss: 0.5452257129397196, Accuracy: 0.7304846274101094

Epoch 3/10


                                                                           

Epoch 3/10, Loss: 0.5242511757173556
Checkpoint saved at model_checkpoints/model_epoch_3.pth


                                                                               

Validation Loss: 0.5624324323940777, Accuracy: 0.7007816571130797

Epoch 4/10


                                                                           

Epoch 4/10, Loss: 0.5099867963358324
Checkpoint saved at model_checkpoints/model_epoch_4.pth


                                                                               

Validation Loss: 0.4847264990895269, Accuracy: 0.8215737363210005

Epoch 5/10


                                                                           

Epoch 5/10, Loss: 0.490827322200148
Checkpoint saved at model_checkpoints/model_epoch_5.pth


                                                                               

Validation Loss: 0.48199807186169935, Accuracy: 0.8205315268368942

Epoch 6/10


                                                                           

Epoch 6/10, Loss: 0.4961587170723343
Checkpoint saved at model_checkpoints/model_epoch_6.pth


                                                                               

Validation Loss: 0.5057416217155328, Accuracy: 0.8083376758728504

Epoch 7/10


                                                                           

Epoch 7/10, Loss: 0.5033764623889369
Checkpoint saved at model_checkpoints/model_epoch_7.pth


                                                                               

Validation Loss: 0.5102476445156249, Accuracy: 0.7933298593017196

Epoch 8/10


                                                                           

Epoch 8/10, Loss: 0.502159209053964
Checkpoint saved at model_checkpoints/model_epoch_8.pth


                                                                               

Validation Loss: 0.5005227898332841, Accuracy: 0.782803543512246

Epoch 9/10


                                                                           

Epoch 9/10, Loss: 0.49756612687748597
Checkpoint saved at model_checkpoints/model_epoch_9.pth


                                                                               

Validation Loss: 0.5270550387721391, Accuracy: 0.7750911933298593

Epoch 10/10


                                                                           

Epoch 10/10, Loss: 0.4977144192719629
Checkpoint saved at model_checkpoints/model_epoch_10.pth


                                                                               

Validation Loss: 0.6592679152864335, Accuracy: 0.6112558624283481
Final model saved at model_checkpoints/final_model.pth


In [15]:
test_df = pd.read_csv("/kaggle/input/test-review-pairs/review_pairs_dataset.csv")
test_df.head()

Unnamed: 0,Review1,Review2,Aspect,Contradictory
0,The usability of this Cloud Computing system i...,The usability of this Machine Learning system ...,usability,False
1,The usability of this AI system is remarkable.,The usability of this Cybersecurity system is ...,usability,True
2,The reliability of this AI system is excellent.,The reliability of this Cybersecurity system i...,reliability,False
3,The reliability of this Cloud Computing system...,The reliability of this Data Science system is...,reliability,True
4,The scalability of this Cybersecurity system i...,The scalability of this Cloud Computing system...,scalability,False


In [16]:
test_df.rename(columns={'Review1': 'sentence1', 'Review2': 'sentence2', 'Aspect': 'aspect', 'Contradictory': 'label'}, inplace=True)

In [17]:
def func(val):
    if val:
        return 'c'
    else :
        return 'n'

In [20]:
test_df['label'] = test_df['label'].apply(func)

In [22]:
# Create datasets and loaders
test_data = ContradictionDataset(test_df, tokenizer, max_length=MAX_LENGTH)

test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

In [23]:
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm

# Function to load the model
def load_model(model, checkpoint_path, device):
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)
    model.eval()
    print(f"Model loaded from {checkpoint_path}")

# Function to evaluate the model
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    test_progress = tqdm(test_loader, desc="Testing", leave=False)  # Testing progress bar

    with torch.no_grad():
        for batch in test_progress:
            # Move batch to device
            s1_input_ids = batch["s1_input_ids"].to(device)
            s1_attention_mask = batch["s1_attention_mask"].to(dtype=torch.float).to(device)
            s2_input_ids = batch["s2_input_ids"].to(device)
            s2_attention_mask = batch["s2_attention_mask"].to(dtype=torch.float).to(device)
            aspect_input_ids = batch["aspect_input_ids"].to(device)
            aspect_attention_mask = batch["aspect_attention_mask"].to(dtype=torch.float).to(device)
            labels = batch["label"].to(device)

            # Forward pass
            s1_rep, s2_rep = model(
                s1_input_ids, s1_attention_mask, 
                s2_input_ids, s2_attention_mask, 
                aspect_input_ids, aspect_attention_mask
            )
            
            # Compute similarity and logits
            logits = torch.cosine_similarity(s1_rep, s2_rep, dim=1).unsqueeze(1)
            logits = torch.cat([1 - logits, logits], dim=1)
            
            # Predictions
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return precision, recall, f1, accuracy

# Example Usage
# Replace `YourModelClass` with your actual model class
# Replace `test_loader` with your DataLoader for the test dataset
# Replace `model_checkpoint.pth` with your actual .pth file path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CrossEncoderCSR()  # Initialize your model architecture
checkpoint_path = "/kaggle/input/dt_pairs/pytorch/default/1/final_model.pth"

# Load model weights
load_model(model, checkpoint_path, device)

# Evaluate the model
precision, recall, f1, accuracy = evaluate_model(model, test_loader, device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Model loaded from /kaggle/input/dt_pairs/pytorch/default/1/final_model.pth


                                                          

Test Accuracy: 0.5000
Precision: 0.5000
Recall: 1.0000
F1 Score: 0.6667




In [22]:
def test_cross_encoder_csr():
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    
    # Create model
    model = CrossEncoderCSR(model_name="roberta-base")
    
    # Prepare test inputs
    sentences1 = ["The cat sit on mat"]
    sentences2 = ["The mat is covered by cat"]
    context = ["Test123"]
    
    # Tokenize inputs
    s1_inputs = tokenizer(sentences1, padding='max_length', truncation=True, return_tensors="pt", max_length=512)
    s2_inputs = tokenizer(sentences2, padding='max_length', truncation=True, return_tensors="pt", max_length=512)
    c_inputs = tokenizer(context, padding='max_length', truncation=True, return_tensors="pt", max_length=512)
    
    # Forward pass
    with torch.no_grad():
        s1_hidden, s2_hidden = model(
            s1_input_ids=s1_inputs['input_ids'], 
            s1_attention_mask=s1_inputs['attention_mask'].to(dtype=torch.float),
            s2_input_ids=s2_inputs['input_ids'], 
            s2_attention_mask=s2_inputs['attention_mask'].to(dtype=torch.float),
            c_input_ids=c_inputs['input_ids'], 
            c_attention_mask=c_inputs['attention_mask'].to(dtype=torch.float)
        )
    
    # print("S1 Hidden States Shape:", s1_hidden)
    # print("S2 Hidden States Shape:", s2_hidden)
    
    # Optional: Compute cosine similarity
    def cosine_similarity(a, b):
        return torch.nn.functional.cosine_similarity(a, b, dim=-1)
    
    # Compute and print similarity
    for i in range(len(sentences1)):
        sim = cosine_similarity(s1_hidden[i], s2_hidden[i])
        print(f"Similarity between '{sentences1[i]}' and '{sentences2[i]}': {sim.item()}")

if __name__ == "__main__":
    test_cross_encoder_csr()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Similarity between 'The cat sit on mat' and 'The mat is covered by cat': 0.9815711975097656


In [14]:
import torch

# Original tensor
x = torch.tensor([1, 2, 3])

# Add a new dimension at dim=0
x_unsqueezed = x.unsqueeze(0)

print(x_unsqueezed)

tensor([[1],
        [2],
        [3]])
