<a href="https://colab.research.google.com/github/24-FYP-Automated-Feedback-Generation/Multiple-inputs-gpt2/blob/main/Multiple_Inputs_gpt2_initial_weight_with_scheduler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas openpyxl transformers datasets peft

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import pandas as pd

In [3]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `fyp2` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate 

# Data Preprocessing

In [4]:
from torch.utils.data import DataLoader, Dataset, random_split
from datasets import Dataset as HF_Dataset
import ast
from transformers import RobertaTokenizer, BertTokenizer, GPT2Tokenizer

In [5]:
# Load your Excel file
file_path = "/content/code_parrot_annotated_dataset.csv"
df = pd.read_csv(file_path)

In [6]:
df.drop([
    "difficulty",
    "random_col_1",
    "random_col_2",
    "random_col_3",
    "random_col_4",
    "random_col_5",
    "random_col_6",
    "random_col_7",
    "random_col_8",
    "random_col_9",
    "random_col_10",
    "random_col_11",
    "random_col_12",
    "random_col_13",
    "random_col_14",
    "random_col_15",
    "random_col_16"
], axis=1, inplace =True)

In [7]:
df.columns

Index(['question', 'prefer_solution', 'flaw_solution', 'metacognitive_vector',
       'metacognitive_feedback'],
      dtype='object')

In [8]:
hf_dataset = HF_Dataset.from_pandas(df)

In [9]:
hf_dataset

Dataset({
    features: ['question', 'prefer_solution', 'flaw_solution', 'metacognitive_vector', 'metacognitive_feedback'],
    num_rows: 2000
})

In [10]:
# Custom Dataset Class for Your Dataset
class MetacognitiveDataset(Dataset):
    def __init__(self, hf_dataset, text_tokenizer_input, text_tokenizer_target, code_tokenizer, max_length=768):
        self.text_tokenizer_input = text_tokenizer_input
        self.text_tokenizer_target = text_tokenizer_target
        self.code_tokenizer = code_tokenizer
        self.data = hf_dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        metacognition = self.data['metacognitive_vector'][idx]
        problem = self.data['question'][idx]
        expected_answer = self.data['prefer_solution'][idx]
        student_answer = self.data['flaw_solution'][idx]
        target = self.data['metacognitive_feedback'][idx]

        self.text_tokenizer_target.pad_token_id = self.text_tokenizer_target.eos_token_id

        # Tokenize inputs and truncate/pad
        metacognition_ids = torch.tensor(
            ast.literal_eval(metacognition), dtype=torch.float
        )
        problem_ids = torch.tensor(
            self.text_tokenizer_input.encode(problem, max_length=512, truncation=True, padding="max_length")
        )
        expected_answer_ids = torch.tensor(
            self.code_tokenizer.encode(expected_answer, max_length=512, truncation=True, padding="max_length")
        )
        student_answer_ids = torch.tensor(
            self.code_tokenizer.encode(student_answer, max_length=512, truncation=True, padding="max_length")
        )
        target_ids = torch.tensor(
            self.text_tokenizer_target.encode(target, max_length=512, truncation=True, padding="max_length")
        )

        return metacognition_ids, problem_ids, expected_answer_ids, student_answer_ids, target_ids

In [11]:
# Pad Collate Function for Variable-Length Batches
def pad_collate(batch):
    metacognitions, problems, expected_answers, student_answers, targets = zip(*batch)
    metacognitions = torch.stack(metacognitions)
    problems = torch.stack(problems)
    expected_answers = torch.stack(expected_answers)
    student_answers = torch.stack(student_answers)
    targets = torch.stack(targets)
    return metacognitions, problems, expected_answers, student_answers, targets

In [12]:
text_tokenizer_input = BertTokenizer.from_pretrained("bert-base-uncased")
text_tokenizer_target = GPT2Tokenizer.from_pretrained("gpt2")
code_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

In [13]:
dataset = MetacognitiveDataset(hf_dataset, text_tokenizer_input,text_tokenizer_target, code_tokenizer)

In [14]:
train_size = int(0.9 * len(dataset))  # 80% for training
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [15]:
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=pad_collate)

In [16]:
del df, hf_dataset, dataset, train_dataset

In [17]:
len(test_dataset)

200

In [18]:
del text_tokenizer_input, code_tokenizer

# Model

In [19]:
import torch
import torch.nn as nn
from transformers import RobertaModel, BertModel
from transformers import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
from peft import LoraConfig, LoraModel

In [20]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
class Encoder:
    """
    Implements encoder to creare latent representation for python code segments.
    """
    def __init__(self, model_name, max_length=768):
        super(Encoder, self).__init__()
        self.model_name = model_name
        if model_name == 'bert-base-uncased':
            self.model = BertModel.from_pretrained(model_name)
        elif model_name == 'microsoft/codebert-base':
            self.model = RobertaModel.from_pretrained(model_name)

    def get_encoded(self, input):
        # Ensure we're in evaluation mode
        self.model.to(device).eval()
        # Encode using the model
        with torch.no_grad():
            outputs = self.model(input)

        # Extract the last hidden state or a pooled representation
        # Last hidden state: token-level embeddings
        token_embeddings = outputs.last_hidden_state
        return token_embeddings

In [22]:

class MultiHeadBiAttention(nn.Module):
    """
    Implements Multi-Head Bi-Attention mechanism.
    """
    def __init__(self,
                 hidden_dim,
                 num_heads,
                 in_proj_weight,
                 in_proj_bias,
                 out_proj_weight,
                 out_proj_bias,
                 dropout_rate=0.1):
        super(MultiHeadBiAttention, self).__init__()

        q_proj, k_proj, v_proj = in_proj_weight.split(hidden_dim, dim=1)

        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads)
        self.self_attention.in_proj_weight.data = torch.cat([q_proj, k_proj, v_proj], dim=0)
        self.self_attention.in_proj_bias.data = in_proj_bias
        self.self_attention.out_proj.weight.data = out_proj_weight
        self.self_attention.out_proj.bias.data = out_proj_bias

        self.bi_attention_expected = nn.MultiheadAttention(hidden_dim, num_heads)
        self.bi_attention_expected.in_proj_weight.data = torch.cat([q_proj, k_proj, v_proj], dim=0)
        self.bi_attention_expected.in_proj_bias.data = in_proj_bias
        self.bi_attention_expected.out_proj.weight.data = out_proj_weight
        self.bi_attention_expected.out_proj.bias.data = out_proj_bias

        self.bi_attention_problem = nn.MultiheadAttention(hidden_dim, num_heads)
        self.bi_attention_problem.in_proj_weight.data = torch.cat([q_proj, k_proj, v_proj], dim=0)
        self.bi_attention_problem.in_proj_bias.data = in_proj_bias
        self.bi_attention_problem.out_proj.weight.data = out_proj_weight
        self.bi_attention_problem.out_proj.bias.data = out_proj_bias

        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.drop_out = nn.Dropout(dropout_rate)


        # Trainable weights for combining attention outputs
        self.weight_self = nn.Parameter(torch.tensor(0.33))  # Initialized to 1/3
        self.weight_bi_expected = nn.Parameter(torch.tensor(0.33))
        self.weight_bi_problem = nn.Parameter(torch.tensor(0.33))
        self.weight_metacognition = nn.Parameter(torch.tensor(0.33))



    def forward(self, metacognition, problem, expected_answer, student_answer):
        # MH Self-Attention on Current State
        student_answer = self.layer_norm(student_answer)
        attn_output, _= self.self_attention(student_answer, student_answer, student_answer)

        # MH Bi-Attention: Cross attention with Personality and Dialog History
        bi_attn_output_expected, _ = self.bi_attention_expected(student_answer, expected_answer, expected_answer)
        bi_attn_output_problem, _ = self.bi_attention_problem(student_answer, problem, problem)

        attn_output = self.drop_out(attn_output)
        bi_attn_output_expected = self.drop_out(bi_attn_output_expected)
        bi_attn_output_problem = self.drop_out(bi_attn_output_problem)

        fusion_output = (
            self.weight_self * attn_output +
            self.weight_bi_expected * bi_attn_output_expected +
            self.weight_bi_problem * bi_attn_output_problem +
            self.weight_metacognition * metacognition
        )
        return fusion_output

In [23]:
class CustomTransformerBlock(nn.Module):
    """
    A custom transformer block with Attention Fusion and Layer Normalization.
    """
    def __init__(self,
                 hidden_dim,
                 num_heads,
                 ff_dim,
                 in_proj_weight,
                 in_proj_bias,
                 out_proj_weight,
                 out_proj_bias,
                 dropout_rate=0.1):
        super(CustomTransformerBlock, self).__init__()
        self.attention_fusion = MultiHeadBiAttention(hidden_dim,
                                                     num_heads,
                                                     in_proj_weight,
                                                     in_proj_bias,
                                                     out_proj_weight,
                                                     out_proj_bias)
        self.layer_norm1 = nn.LayerNorm(hidden_dim)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, ff_dim, bias=False),
            nn.Linear(ff_dim, hidden_dim, bias=False),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )
        self.layer_norm2 = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, metacognition, problem, expected_answer, student_answer):
        # Attention Fusion
        attn_output = self.attention_fusion(metacognition, problem, expected_answer, student_answer)
        attn_output = self.layer_norm1(student_answer + attn_output)  # Add & Norm

        # MLP and Layer Norm
        mlp_output = self.mlp(attn_output)
        output = self.layer_norm2(attn_output + mlp_output)
        output = self.dropout(output)
        return output

In [24]:
from transformers import GPT2Model
gpt2_model = GPT2Model.from_pretrained("gpt2")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [25]:
gpt2_model.h[0].attn.c_proj.weight.shape

torch.Size([768, 768])

In [26]:
from transformers import GPT2Model

class ModifiedModel(nn.Module):
    """
    Encoder-Decoder model using GPT-2 with Attention Fusion.
    """
    def __init__(self, encoder_model_dict, hidden_dim=768, num_heads=12, ff_dim=2048, num_layers=12, dropout_rate=0.1):
        super(ModifiedModel, self).__init__()

        self.text_encoder = Encoder(encoder_model_dict['text_encoder'])
        self.code_encoder = Encoder(encoder_model_dict['code_encoder'])

        # Embedding layer for Current Reply
        self.metacognition_expansion = nn.Sequential(
            nn.Linear(16, hidden_dim, bias=False),
            nn.ReLU()
        )

        gpt2_model = GPT2Model.from_pretrained("gpt2")

        # Transformer blocks with Attention Fusion
        self.transformer_blocks = nn.ModuleList([
            CustomTransformerBlock(hidden_dim,
                                   num_heads,
                                   ff_dim,
                                   gpt2_model.h[i].attn.c_attn.weight,
                                   gpt2_model.h[i].attn.c_attn.bias,
                                   gpt2_model.h[i].attn.c_proj.weight,
                                   gpt2_model.h[i].attn.c_proj.bias,
                                   ) for i in range(num_layers)
        ])

        self.final_layer_norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.lm_head = nn.Linear(hidden_dim, 50257, bias=False)

    def forward(self, metacognition, problem, expected_answer, student_answer):
        # Encode Personality and Dialog History
        problem = self.text_encoder.get_encoded(problem)
        expected_answer = self.code_encoder.get_encoded(expected_answer)
        current_state = self.code_encoder.get_encoded(student_answer)

        # Embed Current Reply/State
        metacognition = self.metacognition_expansion(metacognition)  # Shape: (batch_size,hidden_dim)
        # Expand the vector to match sequence dimensions
        metacognition = metacognition.unsqueeze(1).repeat(1, 512, 1) # Shape: (batch_size, seq_len, hidden_dim)

        # Pass through custom transformer blocks
        for block in self.transformer_blocks:
            current_state = block(metacognition, problem, expected_answer, current_state)

        # Final linear layer
        current_state = self.final_layer_norm(current_state)
        current_state = self.dropout(current_state)
        logits = self.lm_head(current_state)
        return logits

In [27]:
encoder_model_dict ={'text_encoder':'bert-base-uncased', 'code_encoder':'microsoft/codebert-base'}

In [28]:
# Model, Optimizer, and Training
model = ModifiedModel(encoder_model_dict)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [29]:
print(f"Number of trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")


Number of trainable parameters: 161460528


In [30]:
model

ModifiedModel(
  (metacognition_expansion): Sequential(
    (0): Linear(in_features=16, out_features=768, bias=False)
    (1): ReLU()
  )
  (transformer_blocks): ModuleList(
    (0-11): 12 x CustomTransformerBlock(
      (attention_fusion): MultiHeadBiAttention(
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (bi_attention_expected): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (bi_attention_problem): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (drop_out): Dropout(p=0.1, inplace=False)
      )
      (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in

# Training

In [31]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

In [32]:
model.to(device)

ModifiedModel(
  (metacognition_expansion): Sequential(
    (0): Linear(in_features=16, out_features=768, bias=False)
    (1): ReLU()
  )
  (transformer_blocks): ModuleList(
    (0-11): 12 x CustomTransformerBlock(
      (attention_fusion): MultiHeadBiAttention(
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (bi_attention_expected): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (bi_attention_problem): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (drop_out): Dropout(p=0.1, inplace=False)
      )
      (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in

In [33]:
# Separate parameters for attention layers
self_attention_params = []
bi_attention_params = []
other_params = []

# Group parameters by layer
for name, param in model.named_parameters():
    if "self_attention" in name and param.requires_grad:
        self_attention_params.append(param)
    elif "bi_attention" in name and param.requires_grad:
        bi_attention_params.append(param)
    elif param.requires_grad:  # All other trainable parameters
        other_params.append(param)


In [34]:
# Define parameter groups for the optimizer
param_groups = [
    {'params': self_attention_params, 'lr': 8e-6, 'weight_decay': 0.0},
    {'params': bi_attention_params, 'lr': 8e-6, 'weight_decay': 0.0},
    {'params': other_params, 'lr': 3e-4, 'weight_decay': 0.0}
]

In [35]:
optimizer = torch.optim.AdamW(param_groups, lr=3e-4, betas=(0.9, 0.999))

In [36]:
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

In [37]:
text_tokenizer_target.pad_token_id = text_tokenizer_target.eos_token_id
loss_fn = nn.CrossEntropyLoss(ignore_index=text_tokenizer_target.pad_token_id)

In [38]:
def train_model(model, train_loader, optimizer, num_epochs, device):

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for step, (metacognition, problem, expected_answer, student_answer, metacognitive_feedback) in enumerate(train_loader):
            # Move data to device
            metacognition_ids = metacognition.to(device)
            problem_ids = problem.to(device)
            expected_answer_ids = expected_answer.to(device)
            student_answer_ids = student_answer.to(device)
            target_ids = metacognitive_feedback.to(device)

            # Forward pass
            optimizer.zero_grad()
            logits = model(metacognition_ids, problem_ids, expected_answer_ids, student_answer_ids)


            # Flatten the logits and target_ids to match the required dimensions
            logits = logits.view(-1, logits.size(-1))  # Flatten to shape [batch_size * sequence_length, vocab_size]
            target_ids = target_ids.view(-1)  # Flatten to shape [batch_size * sequence_length]

            # Compute loss
            loss = loss_fn(logits,target_ids)

            # Backward pass and optimization
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Scheduler step (CosineAnnealingWarmRestarts is called per batch)
            scheduler.step(epoch + step / len(train_loader))

            total_loss += loss.item()
            if step % 50 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step}], Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] Average Loss: {avg_loss:.4f}")


In [None]:
# Train the model
train_model(model, train_loader, optimizer, num_epochs=100, device=device)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch [1/100], Step [0], Loss: 11.0946
Epoch [1/100], Step [50], Loss: 5.9949
Epoch [1/100], Step [100], Loss: 5.4534
Epoch [1/100], Step [150], Loss: 5.6250
Epoch [1/100], Step [200], Loss: 5.6050
Epoch [1/100], Step [250], Loss: 5.5249
Epoch [1/100], Step [300], Loss: 5.5351
Epoch [1/100], Step [350], Loss: 5.4369
Epoch [1/100], Step [400], Loss: 5.5113
Epoch [1/100] Average Loss: 5.7008
Epoch [2/100], Step [0], Loss: 5.3337
Epoch [2/100], Step [50], Loss: 5.4181
Epoch [2/100], Step [100], Loss: 5.5435
Epoch [2/100], Step [150], Loss: 5.3949
Epoch [2/100], Step [200], Loss: 5.5028
Epoch [2/100], Step [250], Loss: 5.5186
Epoch [2/100], Step [300], Loss: 5.5811
Epoch [2/100], Step [350], Loss: 5.4028
Epoch [2/100], Step [400], Loss: 5.4645
Epoch [2/100] Average Loss: 5.4598
Epoch [3/100], Step [0], Loss: 5.6387
Epoch [3/100], Step [50], Loss: 5.3781
Epoch [3/100], Step [100], Loss: 5.3611
Epoch [3/100], Step [150], Loss: 5.3023
Epoch [3/100], Step [200], Loss: 5.1743
Epoch [3/100], Ste

In [None]:
# Save the state dictionary
torch.save(model.state_dict(), "model_weights.pkl")

In [None]:
# Save the entire model
torch.save(model, "model.pkl")

# Testing

# Inferencing

In [None]:
def generate_text(model, tokenizer, metacognition_ids, problem_ids, expected_answer_ids, student_answer_ids, max_length=512):

    model.eval()

    # Generate the output logits (the model will predict the next tokens)
    with torch.no_grad():
        output = model(metacognition_ids.to(device), problem_ids.to(device), expected_answer_ids.to(device), student_answer_ids.to(device))
    print(output[0])
    # Apply argmax to get the predicted token IDs
    predicted_ids = output[0].argmax(dim=-1)  # Get token IDs by taking the max logits
    print(predicted_ids)
    # Decode the generated output back into text
    generated_text = tokenizer.decode(predicted_ids, skip_special_tokens=True)

    return generated_text

In [None]:
x=100

In [None]:
generate_text(model, text_tokenizer_target, test_dataset.data[x]['metacognitive_vector'], test_dataset.data[x]['question'], test_dataset.data[x]['prefer_solution'], test_dataset.data[x]['flaw_solution'])