In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/modified-dataset/modified_dataset.csv


# Import libraries.

In [3]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModel, AutoTokenizer, GPT2Model

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cuda')

# Models initialization and tokenizations

In [7]:
model_name_encoder = "bert-base-uncased"

In [8]:
context_encoder = AutoModel.from_pretrained(model_name_encoder)
#this same encoder will be used as the persona encoder but with a linear projection of 16->768

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
decoder = GPT2Model.from_pretrained("gpt2")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [10]:
decoder.resize_token_embeddings(decoder.config.vocab_size)

Embedding(50257, 768)

In [11]:
# Tokenizer
tokenizer_encoder = AutoTokenizer.from_pretrained(model_name_encoder)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
tokenizer_decoder = AutoTokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
# Set padding token to eos_token for GPT-2
tokenizer_decoder.pad_token = tokenizer_decoder.eos_token

In [14]:
# Learnable projection layer for metacognitive profile
profile_projection = nn.Linear(16, context_encoder.config.hidden_size)

# Defining the PAA layers and Model

In [15]:
class PAALayer(nn.Module):
    def __init__(self, hidden_size):
        super(PAALayer, self).__init__()
        self.cross_attn = nn.MultiheadAttention(hidden_size, num_heads=8)
        self.sigmoid = nn.Sigmoid()
        self.linear = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, persona_hidden, context_hidden, decoder_hidden, tau):
        c1, _ = self.cross_attn(decoder_hidden, persona_hidden, persona_hidden)
        c2, _ = self.cross_attn(decoder_hidden, context_hidden, context_hidden)
        
        # Adaptive weight calculation
        w1 = self.sigmoid(self.linear(torch.cat((c1, decoder_hidden), dim=-1)))
        w2 = 1 - w1

        # Mask creation
        m1 = torch.where(w1 > tau, 0, 1)
        m2 = torch.where(w1 < 1 - tau, 0, 1)

        # Weighted summation with masks
        paa_output = w1 * m1 * c1 + w2 * m2 * c2 + decoder_hidden
        return paa_output

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PAA_Model(nn.Module):
    def __init__(self, context_encoder, decoder, profile_projection, paa_layer):
        super(PAA_Model, self).__init__()
        self.context_encoder = context_encoder
        self.decoder = decoder
        self.profile_projection = profile_projection
        self.paa_layer = paa_layer
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, context_tokens, target_tokens, profile_vector, tau):
        # Encode context
        context_hidden = self.context_encoder(**context_tokens).last_hidden_state
        print(f"context_hidden shape: {context_hidden.shape}")

        # Project metacognitive profile
        projected_profile = self.profile_projection(profile_vector).unsqueeze(1)
        print(f"projected profile shape: {projected_profile.shape}")

        # Expand persona representation
        persona_hidden = projected_profile.expand(-1, context_hidden.size(1), -1)
        print(f"persona_hidden shape: {persona_hidden.shape}")

        # Resize or pad persona_hidden and context_hidden to match decoder_hidden length
        target_length = target_tokens['input_ids'].shape[1]  # Match target sequence length (e.g., decoder_hidden length)
        context_hidden_resized = self.resize_sequence(context_hidden, target_length)
        persona_hidden_resized = self.resize_sequence(persona_hidden, target_length)

        print(f"Resized context_hidden shape: {context_hidden_resized.shape}")
        print(f"Resized persona_hidden shape: {persona_hidden_resized.shape}")

        # Decoder's output
        decoder_hidden = self.decoder(**target_tokens).last_hidden_state
        print(f"decoder_hidden shape: {decoder_hidden.shape}")

        # Apply PAA
        paa_output = self.paa_layer(persona_hidden_resized, context_hidden_resized, decoder_hidden, tau)

        # No softmax here, CrossEntropyLoss expects raw logits
        logits = paa_output
        target = target_tokens['input_ids'][:, 1:].contiguous().view(-1)  # Flatten target tokens
        print(f"logits shape: {logits.shape}")
        print(f"target shape: {target.shape}")

        # Ensure logits and target have matching batch size
        logits = logits.view(-1, logits.size(-1))  # Shape: (batch_size * seq_len, vocab_size)

        assert logits.size(0) == target.size(0), f"Batch size mismatch: logits batch size {logits.size(0)} vs target batch size {target.size(0)}"

        # Calculate loss
        loss = self.loss_fn(logits, target)
        return loss

    def resize_sequence(self, tensor, target_length):
        """
        Resize tensor sequence to the target length using padding or interpolation.
        This method can be adjusted to use either padding or interpolation as per the requirement.
        """
        current_length = tensor.size(1)
        if current_length < target_length:
            # Padding case
            padding_length = target_length - current_length
            return F.pad(tensor, (0, 0, 0, padding_length), "constant", 0)
        elif current_length > target_length:
            # Truncation case
            return tensor[:, :target_length, :]
        else:
            return tensor  # No change if lengths match


# Model and optimizer

In [17]:
paa_layer = PAALayer(context_encoder.config.hidden_size)

In [28]:
model = PAA_Model(context_encoder, decoder, profile_projection, paa_layer).to(device)

In [19]:
optimizer = Adam(model.parameters(), lr=5e-5)

# Dataset preparation

In [20]:
file_path = "/kaggle/input/modified-dataset/modified_dataset.csv"
df = pd.read_csv(file_path)

In [21]:
df.head(10)

Unnamed: 0,description,student_code,feedback,metacognitive_feedback,metacognitive_profile
0,Create a Python program that performs the foll...,""""""" store the final answer in a variable named...","[\n {\n 'line_number': 2,\n 'feedback...",It appears that you are almost on the right tr...,"[2, 1, 3, 3, 2, 3, 2, 1, 3, 1, 1, 3, 2, 1, 2, 1]"
1,Create a Python program that accomplishes the ...,""""""" store the final answer in a variable named...","[\n {\n 'line_number': 4,\n '...",To improve your solution and better align with...,"[3, 1, 2, 2, 3, 1, 3, 3, 2, 3, 3, 2, 1, 2, 2, 1]"
2,Create a Python program that accomplishes the ...,""""""" store the final answer in a variable named...","[\n {\n 'line_number': 2,\n '...","Based on your approach to the problem, it seem...","[2, 1, 1, 2, 2, 3, 3, 1, 3, 3, 2, 1, 2, 1, 2, 2]"
3,Create a Python program that accomplishes the ...,"x=eval(input(""Enter your age:""))\ny=str(input(...","[\n {\n 'line_number': 1,\n '...","Based on your approach, it seems like you ofte...","[1, 3, 1, 3, 1, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2]"
4,Create a Python program that accomplishes the ...,"n = str(input(""Enter your name:""))\na = str(in...","[\n {\n 'line_number': 3,\n '...",**Metacognitive Feedback**:\n\nYou've made a g...,"[3, 1, 3, 3, 2, 1, 3, 3, 3, 1, 2, 3, 2, 1, 1, 3]"
5,Create a Python program that accomplishes the ...,""""""" store your answer in a variable named resu...","[\n {\n 'line_number': 4,\n 'feedbac...",### Metacognitive Feedback:\n\nYou have demons...,"[3, 2, 3, 3, 2, 3, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2]"
6,Create a Python program that accomplishes the ...,"name = input(""Enter your name: "")\nage = eval(...","[\n {\n 'line_number': 2,\n '...",### Metacognitive Feedback:\n\nYou have a stro...,"[3, 2, 1, 1, 3, 3, 1, 1, 3, 2, 2, 2, 1, 1, 2, 3]"
7,Create a Python program that accomplishes the ...,""""""" store your answer in a variable named resu...","[\n {\n ""line_number"": 2,\n ""feedback...",To improve your approach to solving this progr...,"[2, 1, 1, 2, 1, 3, 3, 1, 3, 2, 1, 3, 2, 1, 2, 3]"
8,Create a program that carries out the followin...,""""""" store your answer in a variable named resu...","[\n {\n 'line_number': 1,\n '...","You have made a good start, but there are seve...","[2, 1, 3, 1, 3, 1, 1, 2, 3, 1, 3, 1, 2, 2, 2, 3]"
9,Your task is to create a function named 'count...,def count_substring(string):\n ans = 0\n ...,"[\n {\n ""line_number"": 4,\n ""feedbac...",Certainly! Let's break down the feedback to en...,"[1, 1, 3, 2, 2, 1, 3, 2, 2, 3, 1, 1, 1, 2, 2, 2]"


In [22]:
import ast
def tokenize_data(df, tokenizer_encoder,tokenizer_decoder):
    context_tokens = tokenizer_encoder(list(df['description']), padding=True, truncation=True, return_tensors="pt")
    target_tokens = tokenizer_decoder(list(df['metacognitive_feedback']), padding=True, truncation=True, return_tensors="pt")
    profile_vectors = torch.tensor([ast.literal_eval(profile) for profile in df['metacognitive_profile']], dtype=torch.float)
    return context_tokens, target_tokens, profile_vectors

In [23]:
context_tokens, target_tokens, profile_vectors = tokenize_data(df, tokenizer_encoder,tokenizer_decoder)

In [24]:
context_tokens = {key: value.to(device) for key, value in context_tokens.items()}
target_tokens = {key: value.to(device) for key, value in target_tokens.items()}
profile_vectors = profile_vectors.to(device)

In [30]:
context_tokens[0]

KeyError: 0

In [25]:
# Create DataLoader
train_data = TensorDataset(context_tokens['input_ids'], target_tokens['input_ids'], profile_vectors)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)

# Training loop

In [29]:
model.train()
num_epochs = 5
tau = 0.5

for epoch in range(num_epochs):
    total_loss = 0
    for context_ids, target_ids, profile_vector in train_loader:
        # Move tensors to the GPU (if available)
        context_ids = context_ids.to(device)
        target_ids = target_ids.to(device)
        profile_vector = profile_vector.to(device)

        optimizer.zero_grad()

        # Prepare input tensors
        context_tokens = {'input_ids': context_ids, 'attention_mask': context_ids != tokenizer_encoder.pad_token_id}
        target_tokens = {'input_ids': target_ids, 'attention_mask': target_ids != tokenizer_decoder.pad_token_id}

        # Forward pass through the model
        loss = model(context_tokens, target_tokens, profile_vector, tau)

        # Backpropagate and update the model
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

context_hidden shape: torch.Size([8, 351, 768])
projected profile shape: torch.Size([8, 1, 768])
persona_hidden shape: torch.Size([8, 351, 768])
Resized context_hidden shape: torch.Size([8, 743, 768])
Resized persona_hidden shape: torch.Size([8, 743, 768])


OutOfMemoryError: CUDA out of memory. Tried to allocate 70.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 47.12 MiB is free. Process 2135 has 15.84 GiB memory in use. Of the allocated memory 15.09 GiB is allocated by PyTorch, and 474.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
torch.cuda.empty_cache()

In [None]:
print(torch.cuda.memory_summary())