In [1]:
!pip install transformers
!pip install torch
!pip install datasets



In [2]:
# Path to the student-model checkpoint inside the Kaggle input directory.
# Later cells pass the file to torch.load() and feed the result to
# load_state_dict(); update the path if the uploaded dataset is named differently.
saved_model_path = "/kaggle/input/studentmodel2/student_model_state2.pth"


In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
from torch.utils.data import DataLoader

# Pick the compute device: CUDA when PyTorch reports one as available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Factory for the half-width student architecture
def create_half_size_roberta_base():
    """Build a RoBERTa sequence classifier that is half as wide as roberta-base.

    The roberta-base configuration is fetched, and its hidden size, number of
    attention heads and intermediate (feed-forward) size are each floor-divided
    by two. All other configuration fields are left as fetched. The model is
    constructed directly from that configuration (no ``from_pretrained`` call on
    the model), so no pretrained weights are loaded here.

    Returns:
        A ``RobertaForSequenceClassification`` instance built from the halved config.
    """
    config = RobertaConfig.from_pretrained("roberta-base")
    for field in ("hidden_size", "num_attention_heads", "intermediate_size"):
        setattr(config, field, getattr(config, field) // 2)
    return RobertaForSequenceClassification(config)

# Build the half-width student and place it on the selected device
student_model = create_half_size_roberta_base().to(device)

# Restore the saved weights from the Kaggle input directory, mapping the stored
# tensors onto the same device the model lives on
saved_model_path = "/kaggle/input/studentmodel2/student_model_state2.pth"
student_model.load_state_dict(
    torch.load(saved_model_path, map_location=device)
)

# roberta-base tokenizer and the GLUE SST-2 dataset it will encode
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
dataset = load_dataset("glue", "sst2")


def encode_dataset(example):
    """Tokenize the 'sentence' field of a batch of examples.

    Truncation is enabled and every sequence is padded to ``max_length=128``
    (``padding='max_length'``). Used as the callback for ``dataset.map`` with
    ``batched=True``.
    """
    sentences = example['sentence']
    return tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=128,
    )


# Run the tokenizer over the loaded dataset in batched mode
encoded_dataset = dataset.map(encode_dataset, batched=True)

# Batch assembly for the DataLoader
def collate_fn(batch):
    """Turn a list of encoded examples into a dict of stacked tensors.

    Args:
        batch: sequence of mappings, each providing 'input_ids',
            'attention_mask' and 'label'.

    Returns:
        dict with the keys 'input_ids', 'attention_mask' and 'label'; each value
        is the per-example values converted to tensors and stacked along a new
        leading dimension.
    """
    def stack_field(name):
        # One tensor per example, stacked into a single batch tensor
        return torch.stack([torch.tensor(example[name]) for example in batch])

    return {name: stack_field(name) for name in ('input_ids', 'attention_mask', 'label')}

# Validation batches of 32, served in dataset order
val_dataloader = DataLoader(
    encoded_dataset["validation"],
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

# Score the loaded student on the validation split: evaluation mode, autograd off
student_model.eval()
correct_predictions, total_predictions = 0, 0

with torch.no_grad():
    for batch in val_dataloader:
        # Move the batch onto the same device as the model
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = student_model(inputs, attention_mask=attention_mask)

        # Index of the largest logit per row is the predicted class
        _, predicted = outputs.logits.max(dim=1)
        correct_predictions += predicted.eq(labels).sum().item()
        total_predictions += labels.shape[0]

accuracy = 100 * correct_predictions / total_predictions
print(f"Accuracy of the loaded student model on the validation set: {accuracy:.2f}%")




Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Accuracy of the loaded student model on the validation set: 80.39%


In [4]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset, load_metric
from sklearn.metrics import matthews_corrcoef
import numpy as np

# Function to create half-sized Roberta
def create_half_size_roberta_base():
    """Build a RoBERTa sequence classifier that is half as wide as roberta-base.

    The roberta-base configuration is fetched, and its hidden size, number of
    attention heads and intermediate (feed-forward) size are each floor-divided
    by two. The model is constructed directly from that configuration, so no
    pretrained weights are loaded by this function.

    Returns:
        A ``RobertaForSequenceClassification`` instance built from the halved config.
    """
    # Imported here because this cell's own transformers import does not bring
    # in RobertaConfig; without it the function only works when an earlier cell
    # has already put the name into the kernel namespace.
    from transformers import RobertaConfig

    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Seed PyTorch so the stochastic parts of fine-tuning start from a fixed RNG
# state; the same value is used for the dataset shuffle below.
# NOTE(review): some GPU kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a separate model for fine-tuning, restore the saved student weights
# from the Kaggle input directory, then move the model to the selected device
model = create_half_size_roberta_base()
student_model_path = '/kaggle/input/studentmodel2/student_model_state2.pth'
model.load_state_dict(torch.load(student_model_path, map_location=device))
model = model.to(device)

# Load the tokenizer once for the whole cell
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Load the CoLA training split and shuffle it once with a fixed seed
dataset = load_dataset('glue', 'cola', split='train')
dataset = dataset.shuffle(seed=SEED)

# Tokenize the entire split in a single call (padding and truncation enabled),
# move the resulting tensors and the labels to the device, and wrap them in a
# DataLoader that yields (input_ids, attention_mask, label) batches of 8
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

num_epochs = 5  # Adjust the number of epochs

# Fine-tuning hyperparameters
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

# Learning rate scheduler: linear schedule whose warmup covers the first 10%
# of all optimizer steps (batches per epoch * number of epochs)
warmup_proportion = 0.1
num_training_steps = len(data_loader) * num_epochs
num_warmup_steps = int(warmup_proportion * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Fine-tuning loop: one pass over data_loader per epoch, then report the mean
# batch loss and the Matthews correlation of the predictions made while training

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_predictions, all_labels = [], []

    for input_ids, attention_mask, label in data_loader:
        optimizer.zero_grad()

        # Forward pass; supplying labels makes the model output carry the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss

        # Backward pass, clip the gradient norm at 1.0, then step the optimizer
        # and the learning-rate schedule
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Predicted class = index of the largest logit from this same forward pass
        _, predicted = outputs.logits.max(dim=1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

    average_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')

    # Matthews correlation over everything predicted during this epoch
    mcc = matthews_corrcoef(all_labels, all_predictions)
    print(f'Epoch {epoch + 1}, Matthews Correlation: {mcc:.4f}')


Downloading and preparing dataset glue/cola (download: 368.14 KiB, generated: 596.73 KiB, post-processed: Unknown size, total: 964.86 KiB) to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.




Epoch 1/5, Average Loss: 0.6520
Epoch 1, Matthews Correlation: -0.0102
Epoch 2/5, Average Loss: 0.6090
Epoch 2, Matthews Correlation: 0.0000
Epoch 3/5, Average Loss: 0.6075
Epoch 3, Matthews Correlation: 0.0000
Epoch 4/5, Average Loss: 0.6069
Epoch 4, Matthews Correlation: 0.0000
Epoch 5/5, Average Loss: 0.6073
Epoch 5, Matthews Correlation: 0.0000


In [5]:
# Take the `model` left in the kernel by the fine-tuning cell, make sure it is
# on the selected device, and switch it to evaluation mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device).eval()

# Running tallies for accuracy, plus the prediction/label lists used for MCC
correct, total = 0, 0
all_predictions, all_labels = [], []

# Load the CoLA validation split and tokenize it in a single call
validation_dataset = load_dataset('glue', 'cola', split='validation')
validation_inputs = tokenizer(
    [x['sentence'] for x in validation_dataset],
    padding=True,
    truncation=True,
    return_tensors='pt',
)
validation_inputs = {name: tensor.to(device) for name, tensor in validation_inputs.items()}
validation_labels = torch.tensor(validation_dataset['label']).to(device)

# Batches of 8 in the form (input_ids, attention_mask, label)
validation_data_loader = DataLoader(
    TensorDataset(
        validation_inputs['input_ids'],
        validation_inputs['attention_mask'],
        validation_labels,
    ),
    batch_size=8,
)

# Inference only: autograd is switched off while the validation set is scored
with torch.no_grad():
    for input_ids, attention_mask, label in validation_data_loader:
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit per row
        _, predicted = outputs.logits.max(dim=1)
        correct += predicted.eq(label).sum().item()
        total += label.shape[0]

        # Keep predictions and true labels for the MCC computation below
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

# Summary metrics over the whole validation split
accuracy = 100 * correct / total
mcc = matthews_corrcoef(all_labels, all_predictions)

print(f'Accuracy after fine-tuning: {accuracy:.2f}%')
print(f'Matthews Correlation: {mcc:.4f}')

Accuracy after fine-tuning: 69.13%
Matthews Correlation: 0.0000


In [6]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset, load_metric
from sklearn.metrics import matthews_corrcoef
import numpy as np

# Function to create half-sized RoBERTa
def create_half_size_roberta_base():
    """Build a RoBERTa sequence classifier that is half as wide as roberta-base.

    The roberta-base configuration is fetched, and its hidden size, number of
    attention heads and intermediate (feed-forward) size are each floor-divided
    by two. The model is constructed directly from that configuration, so no
    pretrained weights are loaded by this function.

    Returns:
        A ``RobertaForSequenceClassification`` instance built from the halved config.
    """
    # Imported here because this cell's own transformers import does not bring
    # in RobertaConfig; without it the function only works when an earlier cell
    # has already put the name into the kernel namespace.
    from transformers import RobertaConfig

    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Seed PyTorch so the stochastic parts of fine-tuning start from a fixed RNG
# state; the same value is used for the dataset shuffle below.
# NOTE(review): some GPU kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a separate model for fine-tuning, restore the saved student weights
# from the Kaggle input directory, then move the model to the selected device
model = create_half_size_roberta_base()
student_model_path = '/kaggle/input/studentmodel2/student_model_state2.pth'
model.load_state_dict(torch.load(student_model_path, map_location=device))
model = model.to(device)

# Load the tokenizer once for the whole cell
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Load the MRPC training split and shuffle it once with a fixed seed
mrpc_dataset = load_dataset('glue', 'mrpc', split='train')
mrpc_dataset = mrpc_dataset.shuffle(seed=SEED)

# Tokenize every (sentence1, sentence2) pair in a single call (padding and
# truncation enabled), move the tensors and labels to the device, and wrap them
# in a DataLoader that yields (input_ids, attention_mask, label) batches of 8
inputs = tokenizer(mrpc_dataset['sentence1'], mrpc_dataset['sentence2'], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(mrpc_dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

# Fine-tuning hyperparameters
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

# Learning rate scheduler: linear schedule whose warmup covers the first 10%
# of all optimizer steps (batches per epoch * number of epochs)
warmup_proportion = 0.1
num_epochs = 10  # Adjust the number of epochs
num_training_steps = len(data_loader) * num_epochs
num_warmup_steps = int(warmup_proportion * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Fine-tuning loop: one pass over data_loader per epoch, then report the mean
# batch loss and the Matthews correlation of the predictions made while training
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_predictions, all_labels = [], []

    for input_ids, attention_mask, label in data_loader:
        optimizer.zero_grad()

        # Forward pass; supplying labels makes the model output carry the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss

        # Backward pass, clip the gradient norm at 1.0, then step the optimizer
        # and the learning-rate schedule
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Predicted class = index of the largest logit from this same forward pass
        _, predicted = outputs.logits.max(dim=1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

    average_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')

    # Matthews correlation over everything predicted during this epoch
    mcc = matthews_corrcoef(all_labels, all_predictions)
    print(f'Epoch {epoch + 1}, Matthews Correlation: {mcc:.4f}')


Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.




Epoch 1/10, Average Loss: 0.6946
Epoch 1, Matthews Correlation: 0.0362
Epoch 2/10, Average Loss: 0.6284
Epoch 2, Matthews Correlation: 0.0208
Epoch 3/10, Average Loss: 0.6171
Epoch 3, Matthews Correlation: 0.1072
Epoch 4/10, Average Loss: 0.5870
Epoch 4, Matthews Correlation: 0.2646
Epoch 5/10, Average Loss: 0.5490
Epoch 5, Matthews Correlation: 0.3808
Epoch 6/10, Average Loss: 0.5126
Epoch 6, Matthews Correlation: 0.4710
Epoch 7/10, Average Loss: 0.4822
Epoch 7, Matthews Correlation: 0.5356
Epoch 8/10, Average Loss: 0.4552
Epoch 8, Matthews Correlation: 0.5790
Epoch 9/10, Average Loss: 0.4409
Epoch 9, Matthews Correlation: 0.6072
Epoch 10/10, Average Loss: 0.4299
Epoch 10, Matthews Correlation: 0.6228


In [7]:
# Switch the fine-tuned `model` (left in the kernel by the previous cell) to
# evaluation mode
model = model.eval()

# Running tallies for accuracy, plus the prediction/label lists used for the
# F1 and MCC computations
correct, total = 0, 0
all_predictions, all_labels = [], []

# Load the MRPC validation split and tokenize every sentence pair in one call
mrpc_validation_dataset = load_dataset('glue', 'mrpc', split='validation')
mrpc_validation_inputs = tokenizer(
    mrpc_validation_dataset['sentence1'],
    mrpc_validation_dataset['sentence2'],
    padding=True,
    truncation=True,
    return_tensors='pt',
)
mrpc_validation_inputs = {name: tensor.to(device) for name, tensor in mrpc_validation_inputs.items()}
mrpc_validation_labels = torch.tensor(mrpc_validation_dataset['label']).to(device)

# Batches of 8 in the form (input_ids, attention_mask, label)
mrpc_validation_data_loader = DataLoader(
    TensorDataset(
        mrpc_validation_inputs['input_ids'],
        mrpc_validation_inputs['attention_mask'],
        mrpc_validation_labels,
    ),
    batch_size=8,
)

from sklearn.metrics import f1_score

# Inference only: autograd is switched off while the validation set is scored
with torch.no_grad():
    for input_ids, attention_mask, label in mrpc_validation_data_loader:
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit per row
        _, predicted = outputs.logits.max(dim=1)
        correct += predicted.eq(label).sum().item()
        total += label.shape[0]

        # Keep predictions and true labels for the F1 / MCC computations below
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

# Summary metrics over the whole validation split: accuracy in percent,
# F1 (MRPC is a paraphrase classification task) and Matthews correlation
accuracy = 100 * correct / total
f1 = f1_score(all_labels, all_predictions)
mcc = matthews_corrcoef(all_labels, all_predictions)

print(f'Accuracy after fine-tuning: {accuracy:.2f}%')
print(f'F1 Score: {f1:.4f}')
print(f'Matthews Correlation: {mcc:.4f}')


Accuracy after fine-tuning: 66.18%
F1 Score: 0.7715
Matthews Correlation: 0.1409


In [8]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset, load_metric
from sklearn.metrics import matthews_corrcoef
import numpy as np

# Function to create half-sized RoBERTa
def create_half_size_roberta_base():
    """Build a RoBERTa sequence classifier that is half as wide as roberta-base.

    The roberta-base configuration is fetched, and its hidden size, number of
    attention heads and intermediate (feed-forward) size are each floor-divided
    by two. The model is constructed directly from that configuration, so no
    pretrained weights are loaded by this function.

    Returns:
        A ``RobertaForSequenceClassification`` instance built from the halved config.
    """
    # Imported here because this cell's own transformers import does not bring
    # in RobertaConfig; without it the function only works when an earlier cell
    # has already put the name into the kernel namespace.
    from transformers import RobertaConfig

    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Seed PyTorch so the stochastic parts of fine-tuning start from a fixed RNG
# state; the same value is used for the dataset shuffle below.
# NOTE(review): some GPU kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a separate model for fine-tuning, restore the saved student weights
# from the Kaggle input directory, then move the model to the selected device
model = create_half_size_roberta_base()
student_model_path = '/kaggle/input/studentmodel2/student_model_state2.pth'
model.load_state_dict(torch.load(student_model_path, map_location=device))
model = model.to(device)

# Load the tokenizer once for the whole cell
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Load the SST-2 training split and shuffle it once with a fixed seed
sst2_dataset = load_dataset('glue', 'sst2', split='train')
sst2_dataset = sst2_dataset.shuffle(seed=SEED)

# Tokenize the entire split in a single call (padding and truncation enabled),
# move the resulting tensors and the labels to the device, and wrap them in a
# DataLoader that yields (input_ids, attention_mask, label) batches of 8
inputs = tokenizer(sst2_dataset['sentence'], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(sst2_dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

# Fine-tuning hyperparameters
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

# Learning rate scheduler: linear schedule whose warmup covers the first 10%
# of all optimizer steps (batches per epoch * number of epochs)
warmup_proportion = 0.1
num_epochs = 10  # Adjust the number of epochs
num_training_steps = len(data_loader) * num_epochs
num_warmup_steps = int(warmup_proportion * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# Fine-tuning loop: one pass over data_loader per epoch, then report the mean
# batch loss and the accuracy of the predictions made while training
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_predictions, all_labels = [], []

    for input_ids, attention_mask, label in data_loader:
        optimizer.zero_grad()

        # Forward pass; supplying labels makes the model output carry the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss

        # Backward pass, clip the gradient norm at 1.0, then step the optimizer
        # and the learning-rate schedule
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

        # Predicted class = index of the largest logit from this same forward pass
        _, predicted = outputs.logits.max(dim=1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

    average_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')

    # Percentage of this epoch's predictions that match their labels
    matches = np.array(all_predictions) == np.array(all_labels)
    accuracy = (100 * matches.sum()) / len(all_labels)
    print(f'Epoch {epoch + 1}, Accuracy: {accuracy:.2f}%')




Epoch 1/10, Average Loss: 0.2350
Epoch 1, Accuracy: 93.34%
Epoch 2/10, Average Loss: 0.2265
Epoch 2, Accuracy: 93.82%
Epoch 3/10, Average Loss: 0.2071
Epoch 3, Accuracy: 94.52%
Epoch 4/10, Average Loss: 0.1898
Epoch 4, Accuracy: 95.12%
Epoch 5/10, Average Loss: 0.1736
Epoch 5, Accuracy: 95.62%
Epoch 6/10, Average Loss: 0.1578
Epoch 6, Accuracy: 96.17%
Epoch 7/10, Average Loss: 0.1459
Epoch 7, Accuracy: 96.57%
Epoch 8/10, Average Loss: 0.1358
Epoch 8, Accuracy: 96.84%
Epoch 9/10, Average Loss: 0.1253
Epoch 9, Accuracy: 97.14%
Epoch 10/10, Average Loss: 0.1189
Epoch 10, Accuracy: 97.31%


In [9]:
# Switch the fine-tuned `model` (left in the kernel by the previous cell) to
# evaluation mode
model = model.eval()

# Running tallies for accuracy, plus the prediction/label lists used for MCC
correct, total = 0, 0
all_predictions, all_labels = [], []

# Load the SST-2 validation split and tokenize it in a single call
sst2_validation_dataset = load_dataset('glue', 'sst2', split='validation')
sst2_validation_inputs = tokenizer(
    sst2_validation_dataset['sentence'],
    padding=True,
    truncation=True,
    return_tensors='pt',
)
sst2_validation_inputs = {name: tensor.to(device) for name, tensor in sst2_validation_inputs.items()}
sst2_validation_labels = torch.tensor(sst2_validation_dataset['label']).to(device)

# Batches of 8 in the form (input_ids, attention_mask, label)
sst2_validation_data_loader = DataLoader(
    TensorDataset(
        sst2_validation_inputs['input_ids'],
        sst2_validation_inputs['attention_mask'],
        sst2_validation_labels,
    ),
    batch_size=8,
)

# Inference only: autograd is switched off while the validation set is scored
with torch.no_grad():
    for input_ids, attention_mask, label in sst2_validation_data_loader:
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predicted class = index of the largest logit per row
        _, predicted = outputs.logits.max(dim=1)
        correct += predicted.eq(label).sum().item()
        total += label.shape[0]

        # Keep predictions and true labels for the MCC computation below
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

# Summary metrics over the whole validation split: accuracy in percent and
# Matthews correlation
accuracy = 100 * correct / total
mcc = matthews_corrcoef(all_labels, all_predictions)

print(f'Accuracy after fine-tuning: {accuracy:.2f}%')
print(f'Matthews Correlation: {mcc:.4f}')


Accuracy after fine-tuning: 79.13%
Matthews Correlation: 0.5828
