<a href="https://colab.research.google.com/github/AkibCoding/KD_ROBERTA/blob/main/Retry%26evaluateKDroberta2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch
!pip install datasets

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
from datasets import load_dataset
import torch
from torch.nn import CrossEntropyLoss, CosineEmbeddingLoss
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os
from google.colab import drive
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig

# Setting up device for GPU usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Tokenize the datasets with padding and truncation
def tokenize_batch(batch):
    return tokenizer(batch['sentence'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')

encoded_dataset = dataset.map(tokenize_batch, batched=True)

# Load the pre-trained RoBERTa-large model (teacher)
teacher_model = RobertaForSequenceClassification.from_pretrained("roberta-large")
teacher_model = teacher_model.to(device)
teacher_model.eval()  # Set the teacher model to evaluation mode

# Create a half-sized RoBERTa-base model (student)
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

student_model = create_half_size_roberta_base()
student_model = student_model.to(device)

# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encoded_dataset, mode='train'):
        self.input_ids = torch.tensor(encoded_dataset[mode]['input_ids'])
        self.attention_mask = torch.tensor(encoded_dataset[mode]['attention_mask'])
        self.labels = torch.tensor(encoded_dataset[mode]['label'])  # Change 'labels' to 'label'

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]  # Change 'labels' to 'label'
        }

train_dataset = CustomDataset(encoded_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Distillation loss function
def distillation_loss(teacher_logits, student_logits, labels, temperature=1.0, alpha=0.5):
    kl_loss = torch.nn.KLDivLoss(reduction="batchmean")(torch.nn.functional.log_softmax(student_logits / temperature, dim=-1),
                            torch.nn.functional.softmax(teacher_logits / temperature, dim=-1))
    ce_loss = CrossEntropyLoss()(student_logits, labels)
    return alpha * kl_loss + (1. - alpha) * ce_loss

# Mount Google Drive
drive.mount('/content/drive')

# Define Saving and Loading Functions for the model
SAVE_DIR = "/content/drive/My Drive/Colab Notebooks/"
SAVE_PATH = os.path.join(SAVE_DIR, "student_model_state2.pth")

def save_model_to_drive(model, save_path=SAVE_PATH):
    torch.save(model.state_dict(), save_path)

def load_model_from_drive(model, save_path=SAVE_PATH):
    if os.path.exists(save_path):
        model.load_state_dict(torch.load(save_path))
        print("Model loaded from", save_path)
    else:
        print("No previous model state found!")

# Training loop with backup functionality
def train_student_custom_loop(teacher_model, student_model, train_dataloader, temperature=2.0):
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)

    epochs = 3
    for epoch in range(epochs):
        for i, batch in tqdm(enumerate(train_dataloader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
            student_logits = student_outputs.logits

            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
                teacher_logits = teacher_outputs.logits

            loss = distillation_loss(teacher_logits, student_logits, labels, temperature)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            # Save the student model after every 100 iterations
            if i % 100 == 0:
                save_model_to_drive(student_model)
                print(f"Epoch {epoch}, Iteration {i}, Loss: {loss.item()}")

# Load back the student model's state if available
load_model_from_drive(student_model)

# Train the student model
train_student_custom_loop(teacher_model, student_model, train_dataloader)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
No previous model state found!


2it [00:04,  1.68s/it]

Epoch 0, Iteration 0, Loss: 0.3527374565601349


102it [00:29,  2.78it/s]

Epoch 0, Iteration 100, Loss: 0.3721655309200287


202it [00:54,  2.61it/s]

Epoch 0, Iteration 200, Loss: 0.3595923185348511


302it [01:19,  2.89it/s]

Epoch 0, Iteration 300, Loss: 0.29188257455825806


402it [01:45,  2.58it/s]

Epoch 0, Iteration 400, Loss: 0.33063197135925293


502it [02:10,  2.90it/s]

Epoch 0, Iteration 500, Loss: 0.3465023636817932


602it [02:35,  2.98it/s]

Epoch 0, Iteration 600, Loss: 0.33287250995635986


702it [03:00,  2.58it/s]

Epoch 0, Iteration 700, Loss: 0.2907215654850006


802it [03:24,  3.00it/s]

Epoch 0, Iteration 800, Loss: 0.3404220938682556


902it [03:49,  2.69it/s]

Epoch 0, Iteration 900, Loss: 0.4014955163002014


1002it [04:14,  2.84it/s]

Epoch 0, Iteration 1000, Loss: 0.2858242690563202


1102it [04:39,  2.45it/s]

Epoch 0, Iteration 1100, Loss: 0.32098981738090515


1202it [05:04,  2.68it/s]

Epoch 0, Iteration 1200, Loss: 0.3969229459762573


1302it [05:29,  2.83it/s]

Epoch 0, Iteration 1300, Loss: 0.42759382724761963


1402it [05:54,  2.97it/s]

Epoch 0, Iteration 1400, Loss: 0.28638988733291626


1502it [06:19,  2.98it/s]

Epoch 0, Iteration 1500, Loss: 0.2937457859516144


1602it [06:43,  2.99it/s]

Epoch 0, Iteration 1600, Loss: 0.4203629195690155


1702it [07:08,  2.71it/s]

Epoch 0, Iteration 1700, Loss: 0.24944336712360382


1802it [07:33,  3.00it/s]

Epoch 0, Iteration 1800, Loss: 0.26644736528396606


1902it [07:58,  2.82it/s]

Epoch 0, Iteration 1900, Loss: 0.16812895238399506


2002it [08:23,  2.98it/s]

Epoch 0, Iteration 2000, Loss: 0.2830246090888977


2102it [08:47,  2.95it/s]

Epoch 0, Iteration 2100, Loss: 0.255389541387558


2202it [09:12,  2.66it/s]

Epoch 0, Iteration 2200, Loss: 0.18387337028980255


2302it [09:37,  2.77it/s]

Epoch 0, Iteration 2300, Loss: 0.35449206829071045


2402it [10:02,  2.78it/s]

Epoch 0, Iteration 2400, Loss: 0.2585107088088989


2502it [10:27,  3.02it/s]

Epoch 0, Iteration 2500, Loss: 0.26369303464889526


2602it [10:52,  3.01it/s]

Epoch 0, Iteration 2600, Loss: 0.22709496319293976


2702it [11:17,  2.75it/s]

Epoch 0, Iteration 2700, Loss: 0.27908751368522644


2802it [11:41,  3.04it/s]

Epoch 0, Iteration 2800, Loss: 0.20441345870494843


2902it [12:06,  2.75it/s]

Epoch 0, Iteration 2900, Loss: 0.2383068948984146


3002it [12:31,  3.00it/s]

Epoch 0, Iteration 3000, Loss: 0.226631760597229


3102it [12:55,  2.99it/s]

Epoch 0, Iteration 3100, Loss: 0.34840407967567444


3202it [13:20,  2.73it/s]

Epoch 0, Iteration 3200, Loss: 0.2491903454065323


3302it [13:45,  3.01it/s]

Epoch 0, Iteration 3300, Loss: 0.18108060956001282


3402it [14:10,  2.70it/s]

Epoch 0, Iteration 3400, Loss: 0.28952741622924805


3502it [14:35,  3.01it/s]

Epoch 0, Iteration 3500, Loss: 0.25025931000709534


3602it [15:00,  2.99it/s]

Epoch 0, Iteration 3600, Loss: 0.2601875364780426


3702it [15:25,  2.70it/s]

Epoch 0, Iteration 3700, Loss: 0.19648981094360352


3802it [15:49,  2.98it/s]

Epoch 0, Iteration 3800, Loss: 0.15548501908779144


3902it [16:14,  2.72it/s]

Epoch 0, Iteration 3900, Loss: 0.21261826157569885


4002it [16:39,  2.98it/s]

Epoch 0, Iteration 4000, Loss: 0.2861689329147339


4102it [17:04,  2.97it/s]

Epoch 0, Iteration 4100, Loss: 0.16176940500736237


4202it [17:29,  2.76it/s]

Epoch 0, Iteration 4200, Loss: 0.33682647347450256


4302it [17:53,  2.96it/s]

Epoch 0, Iteration 4300, Loss: 0.2044256180524826


4402it [18:18,  2.76it/s]

Epoch 0, Iteration 4400, Loss: 0.23601314425468445


4502it [18:43,  2.94it/s]

Epoch 0, Iteration 4500, Loss: 0.24408994615077972


4602it [19:08,  3.06it/s]

Epoch 0, Iteration 4600, Loss: 0.1919027864933014


4702it [19:33,  2.75it/s]

Epoch 0, Iteration 4700, Loss: 0.3703460991382599


4802it [19:58,  3.02it/s]

Epoch 0, Iteration 4800, Loss: 0.158024862408638


4902it [20:22,  2.92it/s]

Epoch 0, Iteration 4900, Loss: 0.20144739747047424


5002it [20:47,  2.98it/s]

Epoch 0, Iteration 5000, Loss: 0.1536223292350769


5102it [21:12,  2.94it/s]

Epoch 0, Iteration 5100, Loss: 0.306274950504303


5202it [21:37,  2.96it/s]

Epoch 0, Iteration 5200, Loss: 0.38682639598846436


5302it [22:02,  2.99it/s]

Epoch 0, Iteration 5300, Loss: 0.126993790268898


5402it [22:26,  2.66it/s]

Epoch 0, Iteration 5400, Loss: 0.19354470074176788


5502it [22:51,  3.02it/s]

Epoch 0, Iteration 5500, Loss: 0.20193763077259064


5602it [23:16,  2.91it/s]

Epoch 0, Iteration 5600, Loss: 0.26111048460006714


5702it [23:41,  2.95it/s]

Epoch 0, Iteration 5700, Loss: 0.224186509847641


5802it [24:05,  3.03it/s]

Epoch 0, Iteration 5800, Loss: 0.2367124706506729


5902it [24:30,  2.68it/s]

Epoch 0, Iteration 5900, Loss: 0.17732736468315125


6002it [24:55,  3.06it/s]

Epoch 0, Iteration 6000, Loss: 0.23512093722820282


6102it [25:20,  2.98it/s]

Epoch 0, Iteration 6100, Loss: 0.23378221690654755


6202it [25:45,  2.69it/s]

Epoch 0, Iteration 6200, Loss: 0.23327840864658356


6302it [26:09,  3.03it/s]

Epoch 0, Iteration 6300, Loss: 0.17915409803390503


6402it [26:34,  2.80it/s]

Epoch 0, Iteration 6400, Loss: 0.41939178109169006


6502it [26:59,  3.04it/s]

Epoch 0, Iteration 6500, Loss: 0.3592933118343353


6602it [27:24,  3.01it/s]

Epoch 0, Iteration 6600, Loss: 0.2002132385969162


6702it [27:48,  2.81it/s]

Epoch 0, Iteration 6700, Loss: 0.1861669272184372


6802it [28:13,  3.08it/s]

Epoch 0, Iteration 6800, Loss: 0.2596372365951538


6902it [28:38,  2.79it/s]

Epoch 0, Iteration 6900, Loss: 0.21133284270763397


7002it [29:03,  2.99it/s]

Epoch 0, Iteration 7000, Loss: 0.2435215711593628


7102it [29:27,  3.04it/s]

Epoch 0, Iteration 7100, Loss: 0.13608291745185852


7202it [29:52,  2.71it/s]

Epoch 0, Iteration 7200, Loss: 0.179485484957695


7302it [30:17,  3.00it/s]

Epoch 0, Iteration 7300, Loss: 0.1465263068675995


7402it [30:42,  2.79it/s]

Epoch 0, Iteration 7400, Loss: 0.3477191627025604


7502it [31:07,  2.92it/s]

Epoch 0, Iteration 7500, Loss: 0.1834578961133957


7602it [31:31,  3.01it/s]

Epoch 0, Iteration 7600, Loss: 0.12786759436130524


7702it [31:56,  2.78it/s]

Epoch 0, Iteration 7700, Loss: 0.26743361353874207


7802it [32:21,  3.01it/s]

Epoch 0, Iteration 7800, Loss: 0.19947655498981476


7902it [32:46,  2.80it/s]

Epoch 0, Iteration 7900, Loss: 0.33648160099983215


8002it [33:11,  2.98it/s]

Epoch 0, Iteration 8000, Loss: 0.37137100100517273


8102it [33:35,  2.99it/s]

Epoch 0, Iteration 8100, Loss: 0.1887405514717102


8202it [34:00,  2.79it/s]

Epoch 0, Iteration 8200, Loss: 0.23179368674755096


8302it [34:25,  3.02it/s]

Epoch 0, Iteration 8300, Loss: 0.26974937319755554


8402it [34:50,  2.75it/s]

Epoch 0, Iteration 8400, Loss: 0.16159942746162415


8419it [34:54,  4.02it/s]
2it [00:00,  2.56it/s]

Epoch 1, Iteration 0, Loss: 0.13177348673343658


102it [00:25,  2.82it/s]

Epoch 1, Iteration 100, Loss: 0.23384016752243042


202it [00:50,  2.96it/s]

Epoch 1, Iteration 200, Loss: 0.2629777491092682


302it [01:15,  3.00it/s]

Epoch 1, Iteration 300, Loss: 0.15822553634643555


402it [01:39,  2.83it/s]

Epoch 1, Iteration 400, Loss: 0.24949055910110474


502it [02:04,  3.01it/s]

Epoch 1, Iteration 500, Loss: 0.29487934708595276


602it [02:29,  2.80it/s]

Epoch 1, Iteration 600, Loss: 0.14903786778450012


702it [02:54,  2.98it/s]

Epoch 1, Iteration 700, Loss: 0.13958242535591125


802it [03:18,  2.97it/s]

Epoch 1, Iteration 800, Loss: 0.15301863849163055


902it [03:43,  2.74it/s]

Epoch 1, Iteration 900, Loss: 0.14512687921524048


1002it [04:08,  3.02it/s]

Epoch 1, Iteration 1000, Loss: 0.13355129957199097


1102it [04:33,  2.80it/s]

Epoch 1, Iteration 1100, Loss: 0.1269700676202774


1202it [04:57,  3.02it/s]

Epoch 1, Iteration 1200, Loss: 0.23162972927093506


1302it [05:22,  3.05it/s]

Epoch 1, Iteration 1300, Loss: 0.14527317881584167


1402it [05:47,  2.72it/s]

Epoch 1, Iteration 1400, Loss: 0.20808088779449463


1502it [06:12,  3.05it/s]

Epoch 1, Iteration 1500, Loss: 0.1987520456314087


1602it [06:36,  3.00it/s]

Epoch 1, Iteration 1600, Loss: 0.23113444447517395


1702it [07:01,  2.96it/s]

Epoch 1, Iteration 1700, Loss: 0.12942300736904144


1802it [07:26,  2.99it/s]

Epoch 1, Iteration 1800, Loss: 0.17297418415546417


1902it [07:51,  2.79it/s]

Epoch 1, Iteration 1900, Loss: 0.29385122656822205


2002it [08:16,  2.99it/s]

Epoch 1, Iteration 2000, Loss: 0.2846750020980835


2102it [08:41,  2.74it/s]

Epoch 1, Iteration 2100, Loss: 0.36795172095298767


2202it [09:05,  3.04it/s]

Epoch 1, Iteration 2200, Loss: 0.19625668227672577


2302it [09:30,  2.48it/s]

Epoch 1, Iteration 2300, Loss: 0.12877219915390015


2402it [09:55,  2.77it/s]

Epoch 1, Iteration 2400, Loss: 0.3457031548023224


2502it [10:20,  3.02it/s]

Epoch 1, Iteration 2500, Loss: 0.18403387069702148


2602it [10:45,  2.77it/s]

Epoch 1, Iteration 2600, Loss: 0.17197592556476593


2702it [11:09,  3.15it/s]

Epoch 1, Iteration 2700, Loss: 0.15558558702468872


2802it [11:34,  2.98it/s]

Epoch 1, Iteration 2800, Loss: 0.20306159555912018


2902it [11:59,  2.80it/s]

Epoch 1, Iteration 2900, Loss: 0.18828324973583221


3002it [12:24,  2.82it/s]

Epoch 1, Iteration 3000, Loss: 0.17602390050888062


3102it [12:48,  2.95it/s]

Epoch 1, Iteration 3100, Loss: 0.37430500984191895


3202it [13:13,  3.00it/s]

Epoch 1, Iteration 3200, Loss: 0.16320955753326416


3302it [13:38,  2.96it/s]

Epoch 1, Iteration 3300, Loss: 0.42265403270721436


3402it [14:03,  2.79it/s]

Epoch 1, Iteration 3400, Loss: 0.14193719625473022


3502it [14:27,  2.98it/s]

Epoch 1, Iteration 3500, Loss: 0.2467348426580429


3602it [14:52,  2.98it/s]

Epoch 1, Iteration 3600, Loss: 0.1922120451927185


3702it [15:17,  3.02it/s]

Epoch 1, Iteration 3700, Loss: 0.15877503156661987


3802it [15:42,  2.99it/s]

Epoch 1, Iteration 3800, Loss: 0.1803305298089981


3902it [16:06,  2.78it/s]

Epoch 1, Iteration 3900, Loss: 0.1887652724981308


4002it [16:31,  3.02it/s]

Epoch 1, Iteration 4000, Loss: 0.1450217366218567


4102it [16:56,  2.96it/s]

Epoch 1, Iteration 4100, Loss: 0.27774375677108765


4202it [17:21,  2.88it/s]

Epoch 1, Iteration 4200, Loss: 0.15415509045124054


4302it [17:45,  3.04it/s]

Epoch 1, Iteration 4300, Loss: 0.3105478286743164


4402it [18:10,  2.80it/s]

Epoch 1, Iteration 4400, Loss: 0.22460554540157318


4502it [18:35,  3.02it/s]

Epoch 1, Iteration 4500, Loss: 0.1377263367176056


4602it [19:00,  2.78it/s]

Epoch 1, Iteration 4600, Loss: 0.2273716777563095


4702it [19:25,  3.01it/s]

Epoch 1, Iteration 4700, Loss: 0.24627916514873505


4802it [19:49,  3.01it/s]

Epoch 1, Iteration 4800, Loss: 0.17509377002716064


4902it [20:14,  2.77it/s]

Epoch 1, Iteration 4900, Loss: 0.27152442932128906


5002it [20:39,  3.02it/s]

Epoch 1, Iteration 5000, Loss: 0.13180309534072876


5102it [21:04,  2.80it/s]

Epoch 1, Iteration 5100, Loss: 0.1491188257932663


5202it [21:28,  3.00it/s]

Epoch 1, Iteration 5200, Loss: 0.2541119456291199


5302it [21:53,  3.02it/s]

Epoch 1, Iteration 5300, Loss: 0.2587721049785614


5402it [22:18,  2.74it/s]

Epoch 1, Iteration 5400, Loss: 0.16289512813091278


5502it [22:43,  2.99it/s]

Epoch 1, Iteration 5500, Loss: 0.1864842027425766


5602it [23:08,  2.90it/s]

Epoch 1, Iteration 5600, Loss: 0.15702268481254578


5702it [23:32,  3.01it/s]

Epoch 1, Iteration 5700, Loss: 0.13318562507629395


5802it [23:57,  3.02it/s]

Epoch 1, Iteration 5800, Loss: 0.14033783972263336


5902it [24:22,  2.80it/s]

Epoch 1, Iteration 5900, Loss: 0.158950537443161


6002it [24:46,  3.00it/s]

Epoch 1, Iteration 6000, Loss: 0.1345636546611786


6102it [25:11,  2.97it/s]

Epoch 1, Iteration 6100, Loss: 0.19372068345546722


6202it [25:36,  2.98it/s]

Epoch 1, Iteration 6200, Loss: 0.1337970793247223


6302it [26:01,  3.04it/s]

Epoch 1, Iteration 6300, Loss: 0.18425874412059784


6402it [26:25,  2.69it/s]

Epoch 1, Iteration 6400, Loss: 0.15135367214679718


6502it [26:50,  3.04it/s]

Epoch 1, Iteration 6500, Loss: 0.33714377880096436


6602it [27:15,  2.99it/s]

Epoch 1, Iteration 6600, Loss: 0.132162943482399


6702it [27:40,  3.00it/s]

Epoch 1, Iteration 6700, Loss: 0.3959878385066986


6802it [28:04,  2.97it/s]

Epoch 1, Iteration 6800, Loss: 0.2657777667045593


6902it [28:29,  2.78it/s]

Epoch 1, Iteration 6900, Loss: 0.1691020131111145


7002it [28:54,  2.97it/s]

Epoch 1, Iteration 7000, Loss: 0.2186107486486435


7102it [29:19,  2.97it/s]

Epoch 1, Iteration 7100, Loss: 0.2339245080947876


7202it [29:43,  3.00it/s]

Epoch 1, Iteration 7200, Loss: 0.15972650051116943


7302it [30:08,  2.96it/s]

Epoch 1, Iteration 7300, Loss: 0.1534183919429779


7402it [30:33,  2.40it/s]

Epoch 1, Iteration 7400, Loss: 0.30329588055610657


7502it [30:58,  3.00it/s]

Epoch 1, Iteration 7500, Loss: 0.17738232016563416


7602it [31:23,  2.96it/s]

Epoch 1, Iteration 7600, Loss: 0.13833048939704895


7702it [31:48,  2.96it/s]

Epoch 1, Iteration 7700, Loss: 0.12590113282203674


7802it [32:12,  2.96it/s]

Epoch 1, Iteration 7800, Loss: 0.1447656750679016


7902it [32:37,  2.70it/s]

Epoch 1, Iteration 7900, Loss: 0.24303698539733887


8002it [33:02,  2.99it/s]

Epoch 1, Iteration 8000, Loss: 0.16282224655151367


8102it [33:27,  2.86it/s]

Epoch 1, Iteration 8100, Loss: 0.16737176477909088


8202it [33:52,  3.02it/s]

Epoch 1, Iteration 8200, Loss: 0.13798370957374573


8302it [34:16,  2.99it/s]

Epoch 1, Iteration 8300, Loss: 0.2184154987335205


8402it [34:41,  2.76it/s]

Epoch 1, Iteration 8400, Loss: 0.2963826358318329


8419it [34:45,  4.04it/s]
2it [00:00,  2.55it/s]

Epoch 2, Iteration 0, Loss: 0.12657427787780762


102it [00:25,  2.78it/s]

Epoch 2, Iteration 100, Loss: 0.1815633624792099


202it [00:50,  2.97it/s]

Epoch 2, Iteration 200, Loss: 0.15238165855407715


302it [01:15,  2.71it/s]

Epoch 2, Iteration 300, Loss: 0.2738197147846222


402it [01:40,  2.94it/s]

Epoch 2, Iteration 400, Loss: 0.1925182342529297


502it [02:04,  3.05it/s]

Epoch 2, Iteration 500, Loss: 0.1294146627187729


602it [02:29,  2.75it/s]

Epoch 2, Iteration 600, Loss: 0.24569429457187653


702it [02:54,  2.96it/s]

Epoch 2, Iteration 700, Loss: 0.25434190034866333


802it [03:19,  2.78it/s]

Epoch 2, Iteration 800, Loss: 0.23575370013713837


902it [03:43,  3.01it/s]

Epoch 2, Iteration 900, Loss: 0.13019946217536926


1002it [04:08,  3.03it/s]

Epoch 2, Iteration 1000, Loss: 0.12552566826343536


1102it [04:33,  2.77it/s]

Epoch 2, Iteration 1100, Loss: 0.24834899604320526


1202it [04:58,  2.74it/s]

Epoch 2, Iteration 1200, Loss: 0.1840897500514984


1302it [05:22,  2.82it/s]

Epoch 2, Iteration 1300, Loss: 0.12588807940483093


1402it [05:47,  3.11it/s]

Epoch 2, Iteration 1400, Loss: 0.1408168375492096


1502it [06:12,  3.13it/s]

Epoch 2, Iteration 1500, Loss: 0.1448582410812378


1602it [06:37,  2.83it/s]

Epoch 2, Iteration 1600, Loss: 0.2078070044517517


1702it [07:01,  2.99it/s]

Epoch 2, Iteration 1700, Loss: 0.1636095643043518


1802it [07:26,  2.80it/s]

Epoch 2, Iteration 1800, Loss: 0.20665507018566132


1902it [07:51,  3.02it/s]

Epoch 2, Iteration 1900, Loss: 0.334430456161499


2002it [08:15,  2.98it/s]

Epoch 2, Iteration 2000, Loss: 0.25035572052001953


2102it [08:41,  2.34it/s]

Epoch 2, Iteration 2100, Loss: 0.13968908786773682


2202it [09:05,  3.02it/s]

Epoch 2, Iteration 2200, Loss: 0.25594615936279297


2302it [09:30,  2.98it/s]

Epoch 2, Iteration 2300, Loss: 0.1690807193517685


2402it [09:55,  2.83it/s]

Epoch 2, Iteration 2400, Loss: 0.17612908780574799


2502it [10:20,  2.97it/s]

Epoch 2, Iteration 2500, Loss: 0.2764716148376465


2602it [10:44,  2.76it/s]

Epoch 2, Iteration 2600, Loss: 0.1471419483423233


2702it [11:09,  2.92it/s]

Epoch 2, Iteration 2700, Loss: 0.12584859132766724


2802it [11:34,  3.04it/s]

Epoch 2, Iteration 2800, Loss: 0.17269471287727356


2885it [11:54,  4.13it/s]

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, Dataset
import os

# Setting up device for GPU usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Tokenize the datasets with padding and truncation
def tokenize_batch(batch):
    return tokenizer(batch['sentence'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')

encoded_dataset = dataset.map(tokenize_batch, batched=True)
# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encoded_dataset, mode='train'):
        self.input_ids = torch.tensor(encoded_dataset[mode]['input_ids'])
        self.attention_mask = torch.tensor(encoded_dataset[mode]['attention_mask'])
        self.labels = torch.tensor(encoded_dataset[mode]['label'])  # Change 'labels' to 'label'

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]  # Change 'labels' to 'label'
        }

train_dataset = CustomDataset(encoded_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Define the student model creation function
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

student_model = create_half_size_roberta_base()
student_model = student_model.to(device)

# Load the student model from Google Drive
SAVE_DIR = "/content/drive/My Drive/Colab Notebooks/student_model_state2.pth"
SAVE_PATH = os.path.join(SAVE_DIR, "student_model_state3.pth")

def load_model_from_drive(model, save_path=SAVE_PATH):
    if os.path.exists(save_path):
        model.load_state_dict(torch.load(save_path))
        print("Model loaded from", save_path)
    else:
        print("No previous model state found!")

load_model_from_drive(student_model)

# Evaluation
student_model.eval()
correct = 0
total = 0

val_dataloader = DataLoader(CustomDataset(encoded_dataset, mode='validation'), batch_size=8)

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy of the student model on the validation set: {accuracy:.2f}%")


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

No previous model state found!
Accuracy of the student model on the validation set: 49.08%


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
from torch.utils.data import DataLoader

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to create half-sized Roberta
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Load the student model
student_model = create_half_size_roberta_base()
student_model.to(device)

# Load weights from saved model
saved_model_path = "/content/drive/MyDrive/Colab Notebooks/student_model_state2.pth"
student_model.load_state_dict(torch.load(saved_model_path, map_location=device))

# Load and preprocess the SST-2 dataset
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
dataset = load_dataset("glue", "sst2")

# Encoding the dataset
def encode_dataset(example):
    return tokenizer(example['sentence'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = dataset.map(encode_dataset, batched=True)

# Collation function
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'label': torch.stack([torch.tensor(item['label']) for item in batch])
    }

# DataLoader
val_dataloader = DataLoader(encoded_dataset["validation"], shuffle=False, batch_size=32, collate_fn=collate_fn)

# Evaluate the model
student_model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in val_dataloader:
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = student_model(inputs, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

accuracy = 100 * correct_predictions / total_predictions
print(f"Accuracy of the loaded student model on the validation set: {accuracy:.2f}%")


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Accuracy of the loaded student model on the validation set: 80.39%


model performance on train set. seems the model is overfitted

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
from torch.utils.data import DataLoader

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to create half-sized Roberta
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")  # Use the same architecture as the knowledge distillation
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Load the student model
student_model = create_half_size_roberta_base()
student_model.to(device)

# Load weights from saved model
saved_model_path = "/content/drive/MyDrive/Colab Notebooks/student_model_state2.pth"
student_model.load_state_dict(torch.load(saved_model_path, map_location=device))

# Load and preprocess the SST-2 dataset
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")  # Use the same tokenizer as the knowledge distillation

dataset = load_dataset("glue", "sst2")

# Encoding the dataset
def encode_dataset(example):
    return tokenizer(example['sentence'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = dataset.map(encode_dataset, batched=True)

# Collation function
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'label': torch.stack([torch.tensor(item['label']) for item in batch])
    }

# DataLoader
val_dataloader = DataLoader(encoded_dataset["train"], shuffle=False, batch_size=32, collate_fn=collate_fn)

# Evaluate the model
student_model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in val_dataloader:
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = student_model(inputs, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

accuracy = 100 * correct_predictions / total_predictions
print(f"Accuracy of the loaded student model on the validation set: {accuracy:.2f}%")


Accuracy of the loaded student model on the validation set: 92.88%


performance of same configuration but no KD

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
from torch.utils.data import DataLoader

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the RoBERTa Base tokenizer and configuration
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
config = RobertaConfig.from_pretrained("roberta-base")
config.hidden_size //= 2
config.num_attention_heads //= 2
config.intermediate_size //= 2

# Load the RoBERTa Base model based on the configuration
student_model = RobertaForSequenceClassification(config)
student_model.to(device)

# Load and preprocess the SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Encoding the validation dataset
def encode_dataset(example):
    return tokenizer(example['sentence'], truncation=True, padding='max_length', max_length=128)

encoded_dataset = dataset["validation"].map(encode_dataset, batched=True)

# Collation function
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'label': torch.stack([torch.tensor(item['label']) for item in batch])
    }

# DataLoader
val_dataloader = DataLoader(encoded_dataset, shuffle=False, batch_size=32, collate_fn=collate_fn)

# Evaluate the model on the validation set
student_model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in val_dataloader:
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = student_model(inputs, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

accuracy = 100 * correct_predictions / total_predictions
print(f"Accuracy of the RoBERTa Base model on the validation set: {accuracy:.2f}%")


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Accuracy of the RoBERTa Base model on the validation set: 50.46%


In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig

def print_total_parameters_in_millions(model, model_name):
    print(f"Total Parameters for {model_name}:")
    total_params = sum(p.numel() for p in model.parameters())
    total_params_millions = total_params / 1_000_000  # Convert to millions
    print(f"{total_params_millions:.2f}M\n")  # Display 2 decimal points
    print("-" * 50 + "\n")

# Example usage:

# Initialize the teacher model
teacher_model = RobertaForSequenceClassification.from_pretrained("roberta-large")

# Initialize the student model
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

student_model = create_half_size_roberta_base()

print_total_parameters_in_millions(teacher_model, "Teacher Model")
print_total_parameters_in_millions(student_model, "Student Model")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Parameters for Teacher Model:
355.36M

--------------------------------------------------

Total Parameters for Student Model:
40.94M

--------------------------------------------------



In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

# Load the model and tokenizer
model = RobertaForSequenceClassification.from_pretrained("roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset
dataset = load_dataset('glue', 'sst2', split='train')

# Shuffle the dataset to ensure randomness
dataset = dataset.shuffle(seed=42)

# Tokenize and prepare DataLoader
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

# Fine-tuning hyperparameters
optimizer = AdamW(model.parameters(), lr=2e-5)  # Adjust learning rate as needed
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(data_loader)*10)  # Adjust warm-up steps

model = model.to(device)

# Fine-tune the model
num_epochs = 10  # Adjust the number of epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for input_ids, attention_mask, label in data_loader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')

# Evaluate the fine-tuned model
model = model.eval()
correct = 0
total = 0

dataset = load_dataset('glue', 'sst2', split='validation')
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

with torch.no_grad():
    for input_ids, attention_mask, label in data_loader:
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy after fine-tuning: {accuracy:.2f}%')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: ignored

In [None]:
# Evaluate the fine-tuned model
model = model.eval()
correct = 0
total = 0

dataset = load_dataset('glue', 'sst2', split='validation')
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

with torch.no_grad():
    for input_ids, attention_mask, label in data_loader:
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy after fine-tuning: {accuracy:.2f}%')

Accuracy after fine-tuning: 92.66%


Fine tuning part 2 to save the fine tune teacher model

In [None]:
import os
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

save_dir = '/content/drive/MyDrive/saved_models'
os.makedirs(save_dir, exist_ok=True)  # Ensure directory exists
# Load the model and tokenizer
model = RobertaForSequenceClassification.from_pretrained("roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset
dataset = load_dataset('glue', 'sst2', split='train')

# Shuffle the dataset to ensure randomness
dataset = dataset.shuffle(seed=42)

# Tokenize and prepare DataLoader
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

# Fine-tuning hyperparameters
optimizer = AdamW(model.parameters(), lr=2e-5)  # Adjust learning rate as needed
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(data_loader)*10)  # Adjust warm-up steps

model = model.to(device)
# Fine-tune the model
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    for input_ids, attention_mask, label in data_loader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

        _, predicted = torch.max(outputs.logits, 1)
        total_train += label.size(0)
        correct_train += (predicted == label).sum().item()

    average_loss = total_loss / len(data_loader)
    train_accuracy = 100 * correct_train / total_train
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

    # Optionally, add a validation phase to check model's performance on a validation set

    # Save the model after each epoch
    save_path = os.path.join(save_dir, f'teacher_epoch{epoch + 1}.pth')
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to {save_path}')

# ... [Your evaluation code] ...


Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Average Loss: 0.6937, Training Accuracy: 54.49%
Model saved to /content/drive/MyDrive/saved_models/teacher_epoch1.pth


In [None]:
import os
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from google.colab import drive


save_dir = '/content/drive/MyDrive/saved_models'
os.makedirs(save_dir, exist_ok=True)  # Ensure directory exists

# Mount Google Drive
drive.mount('/content/drive')

# Specify the path to the uploaded model file on Google Drive
uploaded_model_path = '/content/drive/MyDrive/saved_models/teacher_epoch1.pth'  # Update with your actual path

# Load the model and tokenizer
model = RobertaForSequenceClassification.from_pretrained("roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained weights from your uploaded checkpoint
if os.path.exists(uploaded_model_path):
    model.load_state_dict(torch.load(uploaded_model_path, map_location=device))
    print("Model loaded from", uploaded_model_path)
else:
    print("No pre-trained model checkpoint found at", uploaded_model_path)

# Load dataset
dataset = load_dataset('glue', 'sst2', split='train')

# Shuffle the dataset to ensure randomness
dataset = dataset.shuffle(seed=42)

# Tokenize and prepare DataLoader
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

# Fine-tuning hyperparameters
optimizer = AdamW(model.parameters(), lr=2e-5)  # Adjust learning rate as needed
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(data_loader)*10)  # Adjust warm-up steps

model = model.to(device)
# Fine-tune the model
num_epochs = 4
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    for input_ids, attention_mask, label in data_loader:
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

        _, predicted = torch.max(outputs.logits, 1)
        total_train += label.size(0)
        correct_train += (predicted == label).sum().item()

    average_loss = total_loss / len(data_loader)
    train_accuracy = 100 * correct_train / total_train
    print(f'Epoch {epoch + 2}/{num_epochs}, Average Loss: {average_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

    # Optionally, add a validation phase to check the model's performance on a validation set

    # Save the model after each epoch
    save_path = os.path.join(save_dir, f'teacher_epoch{epoch + 2}.pth')
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to {save_path}')

# ... [Your evaluation code] ...


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded from /content/drive/MyDrive/saved_models/teacher_epoch1.pth
Epoch 2/4, Average Loss: 0.6940, Training Accuracy: 54.26%
Model saved to /content/drive/MyDrive/saved_models/teacher_epoch2.pth
Epoch 3/4, Average Loss: 0.6927, Training Accuracy: 54.43%
Model saved to /content/drive/MyDrive/saved_models/teacher_epoch3.pth


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

# Load the model and tokenizer
model = RobertaForSequenceClassification.from_pretrained("roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Load the trained model weights
model_path = '/content/drive/MyDrive/saved_models/teacher_epoch3.pth'
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.eval()  # Set the model to evaluation mode


# Check if the model file exists
if os.path.exists(model_path):
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.eval()  # Set the model to evaluation mode
    print(f'Model loaded from {model_path}. Proceeding to evaluation...')
else:
    print(f'No model found at {model_path}. Cannot proceed to evaluation.')

# Load dataset
dataset = load_dataset('glue', 'sst2', split='validation')

# Tokenize and prepare DataLoader
inputs = tokenizer([x['sentence'] for x in dataset], padding=True, return_tensors='pt', truncation=True)
inputs = {key: val.to(device) for key, val in inputs.items()}
labels = torch.tensor(dataset['label']).to(device)
data_loader = DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=8)

# Evaluate the fine-tuned model
correct = 0
total = 0

with torch.no_grad():
    for input_ids, attention_mask, label in data_loader:
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy after fine-tuning: {accuracy:.2f}%')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded from /content/drive/MyDrive/saved_models/teacher_epoch3.pth. Proceeding to evaluation...


Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Accuracy after fine-tuning: 50.92%


Checking Teacher and Student Model parameters and Configurations

In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig

def print_total_parameters_in_millions(model, model_name):
    print(f"Total Parameters for {model_name}:")
    total_params = sum(p.numel() for p in model.parameters())
    total_params_millions = total_params / 1_000_000  # Convert to millions
    print(f"{total_params_millions:.2f}M\n")  # Display 2 decimal points
    print("-" * 50 + "\n")

def print_model_config(model, model_name):
    print(f"Configuration for {model_name}:")
    print(model.config)  # Print the config of the model
    print("-" * 50 + "\n")

# Example usage:

# Initialize the teacher model
teacher_model = RobertaForSequenceClassification.from_pretrained("roberta-large")

# Initialize the student model
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

student_model = create_half_size_roberta_base()

print_total_parameters_in_millions(teacher_model, "Teacher Model")
print_total_parameters_in_millions(student_model, "Student Model")

# Printing the config of teacher and student model
print_model_config(teacher_model, "Teacher Model")
print_model_config(student_model, "Student Model")


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Total Parameters for Teacher Model:
355.36M

--------------------------------------------------

Total Parameters for Student Model:
40.94M

--------------------------------------------------

Configuration for Teacher Model:
RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

--------------------------------------------------

Configuration for Student Model:
RobertaConfig {
  "architect

Glue benchmarking on student model

In [None]:
single_batch = next(iter(eval_dataloader))
print(single_batch['input_ids'])


[tensor([0, 0, 0, 0, 0, 0, 0, 0]), tensor([  405,   879, 37984,   627,   405, 24648,   102,   368]), tensor([  128,  4825,   201,  3501,   128, 31923,  2128,   608]), tensor([   29,  3796,     7,  2156,    29,    19, 35138,    94]), tensor([   10,  7790,  1034, 14690,  2635, 12073,   822,    76]), tensor([18452, 23530,    14,  2156,   480,     8,   479,   128]), tensor([   8,    8,  295,  930,  182,   10, 1437,   29]), tensor([  747,  7764, 13887,  2156,  2156,   367,     2,  2556]), tensor([ 7920,  1437,    16, 18535,   182, 33639,     1,    19]), tensor([ 3251,     2, 10137, 34226,  2635, 16170,     1,   110]), tensor([  479,     1,     7,     8,   479, 12325,     1,  1931]), tensor([ 1437,     1, 14976,  2369,  1437,  2156,     1,    12]), tensor([    2,     1,    10,    32,     2,     5,     1, 12295]), tensor([  1,   1, 538,  70,   1, 822,   1, 479]), tensor([    1,     1,   756, 29932,     1,    16,     1,  1437]), tensor([  1,   1,  25, 576,   1,  10,   1,   2]), tensor([    1, 

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset, load_metric
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to create half-sized Roberta
def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Load datasets
tasks = ["sst2", "mrpc", "cola"]
glue_data = {task: load_dataset("glue", task) for task in tasks}

# Load tokenizer and create student model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = create_half_size_roberta_base()
state_dict = torch.load("/content/drive/MyDrive/Colab Notebooks/student_model_state2.pth")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

# Preprocess the datasets
def tokenize_function(examples, task):
    if task == "sst2" or task == "cola":
        text = examples['sentence']
    elif task == "mrpc":
        text = [s1 + " " + s2 for s1, s2 in zip(examples['sentence1'], examples['sentence2'])]
    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

tokenized_datasets = {task: glue_data[task].map(lambda x: tokenize_function(x, task), batched=True) for task in tasks}

# Evaluation
metrics = {
    "sst2": load_metric("accuracy"),
    "mrpc": load_metric("f1"),
    "cola": load_metric("matthews_correlation")
}

results = {}

with torch.no_grad():
    for task in tasks:
        eval_dataloader = torch.utils.data.DataLoader(tokenized_datasets[task]["validation"], batch_size=8)

        all_predictions = []
        all_labels = []

        for batch in eval_dataloader:
            try:
                # Print type and content for debugging
                print(f"Type of batch['input_ids']: {type(batch['input_ids'])}")
                print(f"Content of batch['input_ids']: {batch['input_ids']}")

                # Convert to tensor (if needed) and move to device
                if not isinstance(batch['input_ids'], torch.Tensor):
                    input_ids = torch.tensor(batch['input_ids'], dtype=torch.long).to(device)
                else:
                    input_ids = batch['input_ids'].to(device)

                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                _, predictions = torch.max(outputs.logits, dim=1)

                all_predictions.append(predictions.cpu().numpy())
                all_labels.append(labels.cpu().numpy())
            except Exception as e:
                print(f"Exception encountered: {e}")
                break  # Break out of the loop if we encounter an exception

        all_predictions = np.concatenate([np.array(preds) for preds in all_predictions], axis=0)
        all_labels = np.concatenate([np.array(labels) for labels in all_labels], axis=0)


        if task == "sst2":
            results[task] = {"accuracy": metrics[task].compute(predictions=all_predictions, references=all_labels)}
        elif task == "mrpc":
            results[task] = {"f1": metrics[task].compute(predictions=all_predictions, references=all_labels)["f1"]}
        elif task == "cola":
            results[task] = {"matthews_correlation": metrics[task].compute(predictions=all_predictions, references=all_labels)["matthews_correlation"]}

print(results)


Type of batch['input_ids']: <class 'list'>
Content of batch['input_ids']: [tensor([0, 0, 0, 0, 0, 0, 0, 0]), tensor([  405,   879, 37984,   627,   405, 24648,   102,   368]), tensor([  128,  4825,   201,  3501,   128, 31923,  2128,   608]), tensor([   29,  3796,     7,  2156,    29,    19, 35138,    94]), tensor([   10,  7790,  1034, 14690,  2635, 12073,   822,    76]), tensor([18452, 23530,    14,  2156,   480,     8,   479,   128]), tensor([   8,    8,  295,  930,  182,   10, 1437,   29]), tensor([  747,  7764, 13887,  2156,  2156,   367,     2,  2556]), tensor([ 7920,  1437,    16, 18535,   182, 33639,     1,    19]), tensor([ 3251,     2, 10137, 34226,  2635, 16170,     1,   110]), tensor([  479,     1,     7,     8,   479, 12325,     1,  1931]), tensor([ 1437,     1, 14976,  2369,  1437,  2156,     1,    12]), tensor([    2,     1,    10,    32,     2,     5,     1, 12295]), tensor([  1,   1, 538,  70,   1, 822,   1, 479]), tensor([    1,     1,   756, 29932,     1,    16,     1, 

ValueError: ignored

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Load datasets
tasks = ["sst2"]
glue_data = {task: load_dataset("glue", task) for task in tasks}

# Load tokenizer and create student model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = create_half_size_roberta_base()
state_dict = torch.load("/content/drive/MyDrive/Colab Notebooks/student_model_state2.pth")
model.load_state_dict(state_dict)
model.to(device)

# Preprocess the data
def encode(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=256, return_tensors="pt")

encoded_data = {split: glue_data["sst2"][split].map(encode, batched=True) for split in ["train", "validation", "test"]}

# Collation function
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'label': torch.tensor([item['label'] for item in batch])
    }

# Prediction function
def predict(model, data_loader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(inputs, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch["label"].numpy())

    return predictions, true_labels

data_loader = torch.utils.data.DataLoader(encoded_data["validation"], batch_size=32, collate_fn=collate_fn)
predictions, true_labels = predict(model, data_loader)

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.8039
F1 Score: 0.8029


In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_half_size_roberta_base():
    student_config = RobertaConfig.from_pretrained("roberta-base")

    student_config.hidden_size //= 2
    student_config.num_attention_heads //= 2
    student_config.intermediate_size //= 2
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

# Load datasets
tasks = ["sst2", "cola", "mrpc"]
glue_data = {task: load_dataset("glue", task) for task in tasks}

# Load tokenizer and create student model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = create_half_size_roberta_base()
state_dict = torch.load("/content/drive/MyDrive/Colab Notebooks/student_model_state2.pth")
model.load_state_dict(state_dict)
model.to(device)

# Preprocess the data
def encode(task, examples):
    if task == "sst2":
        return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=256, return_tensors="pt")
    elif task == "cola":
        return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=256, return_tensors="pt")
    elif task == "mrpc":
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=256, return_tensors="pt")

encoded_data = {task: {split: glue_data[task][split].map(lambda examples: encode(task, examples), batched=True) for split in ["train", "validation", "test"]} for task in tasks}

# Collation function
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'label': torch.tensor([item['label'] for item in batch])
    }

# Prediction function
def predict(model, data_loader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(inputs, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch["label"].numpy())

    return predictions, true_labels

for task in tasks:
    print(f"Evaluating on {task.upper()}")
    data_loader = torch.utils.data.DataLoader(encoded_data[task]["validation"], batch_size=32, collate_fn=collate_fn)
    predictions, true_labels = predict(model, data_loader)

    # Calculate accuracy, F1 score, and Matthews correlation coefficient
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    mcc = matthews_corrcoef(true_labels, predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Matthews correlation coefficient: {mcc:.4f}\n")


Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Evaluating on SST2
Accuracy: 0.8039
F1 Score: 0.8029
Matthews correlation coefficient: 0.6170

Evaluating on COLA
Accuracy: 0.3969
F1 Score: 0.3937
Matthews correlation coefficient: -0.0587

Evaluating on MRPC
Accuracy: 0.3995
F1 Score: 0.3879
Matthews correlation coefficient: -0.0383



In [None]:
from transformers import RobertaForSequenceClassification, RobertaConfig

def print_total_parameters_in_millions(model, model_name):
    print(f"Total Parameters for {model_name}:")
    total_params = sum(p.numel() for p in model.parameters())
    total_params_millions = total_params / 1_000_000  # Convert to millions
    print(f"{total_params_millions:.2f}M\n")  # Display 2 decimal points
    print("-" * 50 + "\n")

def print_model_config(model, model_name):
    print(f"Configuration for {model_name}:")
    print(model.config)
    print("-" * 50 + "\n")

# Initialize the teacher model (roberta-large)
teacher_model = RobertaForSequenceClassification.from_pretrained("roberta-large")

# Initialize the student model from roberta-base with custom configuration
def create_custom_student_model():
    student_config = RobertaConfig.from_pretrained("roberta-base")
    student_config.hidden_size = 512
    student_config.num_attention_heads = 8
    student_config.num_hidden_layers = 12
    student_config.intermediate_size = 1536
    student_model = RobertaForSequenceClassification(student_config)
    return student_model

student_model = create_custom_student_model()

# Initialize the actual roberta-base model
roberta_base_model = RobertaForSequenceClassification.from_pretrained("roberta-base")

print_total_parameters_in_millions(teacher_model, "Teacher Model (roberta-large)")
print_model_config(teacher_model, "Teacher Model (roberta-large)")

print_total_parameters_in_millions(student_model, "Student Model (roberta-base with custom configuration)")
print_model_config(student_model, "Student Model (roberta-base with custom configuration)")

print_total_parameters_in_millions(roberta_base_model, "Actual roberta-base Model")
print_model_config(roberta_base_model, "Actual roberta-base Model")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Parameters for Teacher Model (roberta-large):
355.36M

--------------------------------------------------

Configuration for Teacher Model (roberta-large):
RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

--------------------------------------------------

Total Parameters for Student Model (roberta-base with custom configuration):
57.80M

----------------------------------------