In [2]:
from datasets import load_dataset
import copy

# Load Hugging Face Emotion dataset
dataset = load_dataset("dair-ai/emotion")
# Shuffle each split
dataset["train"] = dataset["train"].shuffle(seed=42)
dataset["test"] = dataset["test"].shuffle(seed=42)
dataset["validation"] = dataset["validation"].shuffle(seed=42)

# Define labels
num_labels = len(set(dataset['train']['label']))

dataset1 = copy.deepcopy(dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

train_size = int(1 * len(dataset1["train"]))
test_size = int(1 * len(dataset1["test"]))
val_size = int(1 * len(dataset1["validation"]))

dataset["train"] = dataset1["train"].select(range(train_size))
dataset["test"] = dataset1["test"].select(range(test_size))
dataset["validation"] = dataset1["validation"].select(range(val_size))
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [4]:
import collections

# Function to count labels in each split
def get_label_distribution(dataset_split):
    labels = dataset_split["label"]
    label_counts = collections.Counter(labels)
    total_samples = len(labels)
    
    print(f"Total Samples: {total_samples}")
    for label, count in sorted(label_counts.items()):
        print(f"Class {label}: {count} samples ({(count / total_samples) * 100:.2f}%)")

# Check class distribution in Train, Validation, and Test sets
print("Train Set:")
get_label_distribution(dataset["train"])

print("\nValidation Set:")
get_label_distribution(dataset["validation"])

print("\nTest Set:")
get_label_distribution(dataset["test"])

# Split train dataset into 3 sets as per your partitioning plan
bert_finetuning_dataset = dataset["train"].select(range(0, 8000))  # Samples 1 - 8000
roberta_finetuning_dataset = dataset["train"].select(range(2000, 10000))  # Samples 2001 - 10000
xlnet_finetuning_dataset = dataset["train"].select(range(4000, 12000))  # Samples 4001 - 12000

print("bert_finetuning_dataset:")
get_label_distribution(bert_finetuning_dataset)

print("\nroberta_finetuning_dataset:")
get_label_distribution(roberta_finetuning_dataset)

print("\nxlnet_finetuning_dataset:")
get_label_distribution(xlnet_finetuning_dataset)

Train Set:
Total Samples: 16000
Class 0: 4666 samples (29.16%)
Class 1: 5362 samples (33.51%)
Class 2: 1304 samples (8.15%)
Class 3: 2159 samples (13.49%)
Class 4: 1937 samples (12.11%)
Class 5: 572 samples (3.57%)

Validation Set:
Total Samples: 2000
Class 0: 550 samples (27.50%)
Class 1: 704 samples (35.20%)
Class 2: 178 samples (8.90%)
Class 3: 275 samples (13.75%)
Class 4: 212 samples (10.60%)
Class 5: 81 samples (4.05%)

Test Set:
Total Samples: 2000
Class 0: 581 samples (29.05%)
Class 1: 695 samples (34.75%)
Class 2: 159 samples (7.95%)
Class 3: 275 samples (13.75%)
Class 4: 224 samples (11.20%)
Class 5: 66 samples (3.30%)
bert_finetuning_dataset:
Total Samples: 8000
Class 0: 2354 samples (29.43%)
Class 1: 2697 samples (33.71%)
Class 2: 650 samples (8.12%)
Class 3: 1071 samples (13.39%)
Class 4: 956 samples (11.95%)
Class 5: 272 samples (3.40%)

roberta_finetuning_dataset:
Total Samples: 8000
Class 0: 2343 samples (29.29%)
Class 1: 2716 samples (33.95%)
Class 2: 649 samples (8.11

 Define Base Models (BERT, RoBERTa, XLNet)

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import matplotlib.pyplot as plt
from transformers import get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define Transformer Models
MODEL_NAMES = {
    "bert": "bert-base-uncased",
    "roberta": "roberta-base",
    "xlnet": "xlnet-base-cased"
}

tokenizers = {name: AutoTokenizer.from_pretrained(model) for name, model in MODEL_NAMES.items()}
models = {name: AutoModel.from_pretrained(model) for name, model in MODEL_NAMES.items()}

bert_cl_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAMES["bert"], num_labels=num_labels)
roberta_cl_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAMES["roberta"], num_labels=num_labels)
xlnet_cl_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAMES["xlnet"], num_labels=num_labels)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForSequenceClassification were not

Load tokenizers and models

In [7]:
bert_tokenizer = tokenizers["bert"]
# Tokenize dataset
def tokenize_function(examples):
    return bert_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = bert_finetuning_dataset.map(tokenize_function, batched=True)

# Remove text column, rename label column
tokenized_datasets = tokenized_datasets.remove_columns(["text"]).rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=bert_tokenizer, return_tensors="pt")

# Define batch size
batch_size = 32

# Create DataLoader for bert_finetuning_dataset
train_dataloader = DataLoader(
    tokenized_datasets,  # Use the tokenized version
    batch_size=batch_size,
    shuffle=True,  # Shuffle data for better training
    collate_fn=data_collator  # Ensures padding works dynamically
)



roberta_tokenizer = tokenizers["roberta"]
# Tokenize dataset
def tokenize_function1(examples):
    return roberta_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets1 = roberta_finetuning_dataset.map(tokenize_function1, batched=True)

# Remove text column, rename label column
tokenized_datasets1 = tokenized_datasets1.remove_columns(["text"]).rename_column("label", "labels")
tokenized_datasets1.set_format("torch")

# Data collator for dynamic padding
data_collator1 = DataCollatorWithPadding(tokenizer=roberta_tokenizer, return_tensors="pt")

# Create DataLoader for roberta_finetuning_dataset
train_dataloader1 = DataLoader(
    tokenized_datasets1,  # Use the tokenized version
    batch_size=batch_size,
    shuffle=True,  # Shuffle data for better training
    collate_fn=data_collator1  # Ensures padding works dynamically
)



xlnet_tokenizer = tokenizers["xlnet"]
# Tokenize dataset
def tokenize_function2(examples):
    return xlnet_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets2 = xlnet_finetuning_dataset.map(tokenize_function2, batched=True)

# Remove text column, rename label column
tokenized_datasets2 = tokenized_datasets2.remove_columns(["text"]).rename_column("label", "labels")
tokenized_datasets2.set_format("torch")

# Data collator for dynamic padding
data_collator2 = DataCollatorWithPadding(tokenizer=xlnet_tokenizer, return_tensors="pt")

# Create DataLoader for xlnet_finetuning_dataset
train_dataloader2 = DataLoader(
    tokenized_datasets2,  # Use the tokenized version
    batch_size=batch_size,
    shuffle=True,  # Shuffle data for better training
    collate_fn=data_collator2  # Ensures padding works dynamically
)

In [None]:
# fine tuning of bert model

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_cl_model.to(device)

# Freeze all layers except the last 3
for name, param in bert_cl_model.named_parameters():
    if "encoder.layer" in name and name.split(".")[2].isdigit():  # Ensure it's a number
        layer_num = int(name.split(".")[2])  # Extract layer index
        if layer_num < 9:  # Freeze first 9 layers, train last 3
            param.requires_grad = False

# Ensure classification head is trainable
for param in bert_cl_model.classifier.parameters():
    param.requires_grad = True


# Use AdamW optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, bert_cl_model.parameters()), lr=5e-5)

# Learning rate scheduler
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 3)

# Loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    bert_cl_model.train()
    total_loss, total_correct = 0, 0
    loop = tqdm(train_dataloader, leave=True)
    
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        # Forward pass
        outputs = bert_cl_model(**batch)
        loss = loss_fn(outputs.logits, batch["labels"])
        loss.backward()

        # Optimization step
        optimizer.step()
        lr_scheduler.step()

        # Metrics tracking
        total_loss += loss.item()
        total_correct += (outputs.logits.argmax(dim=1) == batch["labels"]).sum().item()
        
        loop.set_description(f"Epoch {epoch+1}/{epochs}")
        loop.set_postfix(loss=loss.item(), acc=total_correct / len(train_dataloader.dataset))

print("Training Complete!")



Epoch 1/3: 100%|██████████| 250/250 [36:23<00:00,  8.73s/it, acc=0.759, loss=0.409] 
Epoch 2/3: 100%|██████████| 250/250 [34:33<00:00,  8.29s/it, acc=0.939, loss=0.201] 
Epoch 3/3: 100%|██████████| 250/250 [44:52<00:00, 10.77s/it, acc=0.96, loss=0.0373]  

Training Complete!





In [9]:
bert_cl_model.save_pretrained("bert_finetuned_emotion")
bert_tokenizer.save_pretrained("bert_finetuned_emotion")

('bert_finetuned_emotion\\tokenizer_config.json',
 'bert_finetuned_emotion\\special_tokens_map.json',
 'bert_finetuned_emotion\\vocab.txt',
 'bert_finetuned_emotion\\added_tokens.json',
 'bert_finetuned_emotion\\tokenizer.json')

In [10]:
# Freeze all layers except the last 3
for name, param in roberta_cl_model.named_parameters():
    if "encoder.layer" in name and name.split(".")[2].isdigit():  # Ensure it's a number
        layer_num = int(name.split(".")[2])  # Extract layer index
        if layer_num < 9:  # Freeze first 9 layers, train last 3
            param.requires_grad = False

# Ensure classification head is trainable
for param in roberta_cl_model.classifier.parameters():
    param.requires_grad = True

# Use AdamW optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, roberta_cl_model.parameters()), lr=5e-5)

# Learning rate scheduler
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader1) * 3
)

# Loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 5
for epoch in range(epochs):
    roberta_cl_model.train()
    total_loss, total_correct = 0, 0
    loop = tqdm(train_dataloader1, leave=True)
    
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        # Forward pass
        outputs = roberta_cl_model(**batch)
        loss = loss_fn(outputs.logits, batch["labels"])
        loss.backward()

        # Optimization step
        optimizer.step()
        lr_scheduler.step()

        # Metrics tracking
        total_loss += loss.item()
        total_correct += (outputs.logits.argmax(dim=1) == batch["labels"]).sum().item()
        
        loop.set_description(f"Epoch {epoch+1}/{epochs}")
        loop.set_postfix(loss=loss.item(), acc=total_correct / len(train_dataloader1.dataset))

print("Training Complete!")

Epoch 1/5: 100%|██████████| 250/250 [34:45<00:00,  8.34s/it, acc=0.733, loss=0.2]  
Epoch 2/5: 100%|██████████| 250/250 [35:03<00:00,  8.41s/it, acc=0.918, loss=0.29]  
Epoch 3/5: 100%|██████████| 250/250 [35:05<00:00,  8.42s/it, acc=0.945, loss=0.0225]
Epoch 4/5: 100%|██████████| 250/250 [34:55<00:00,  8.38s/it, acc=0.957, loss=0.382] 
Epoch 5/5: 100%|██████████| 250/250 [35:29<00:00,  8.52s/it, acc=0.955, loss=0.0307] 

Training Complete!





In [11]:
# Save RoBERTa model & tokenizer
roberta_cl_model.save_pretrained("roberta_finetuned_emotion")
roberta_tokenizer.save_pretrained("roberta_finetuned_emotion")

('roberta_finetuned_emotion\\tokenizer_config.json',
 'roberta_finetuned_emotion\\special_tokens_map.json',
 'roberta_finetuned_emotion\\vocab.json',
 'roberta_finetuned_emotion\\merges.txt',
 'roberta_finetuned_emotion\\added_tokens.json',
 'roberta_finetuned_emotion\\tokenizer.json')

In [12]:

# Freeze all layers except the last 3
for name, param in xlnet_cl_model.named_parameters():
    if "transformer.layer" in name and name.split(".")[2].isdigit():  # Ensure it's a number
        layer_num = int(name.split(".")[2])  # Extract layer index
        if layer_num < 9:  # Freeze first 9 layers, train last 3
            param.requires_grad = False

# Ensure classification head is trainable
for param in xlnet_cl_model.logits_proj.parameters():  # XLNet uses `logits_proj`
    param.requires_grad = True

# Use AdamW optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, xlnet_cl_model.parameters()), lr=5e-5)

# Learning rate scheduler
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader2) * 3
)

# Loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 4
for epoch in range(epochs):
    xlnet_cl_model.train()
    total_loss, total_correct = 0, 0
    loop = tqdm(train_dataloader2, leave=True)
    
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        # Forward pass
        outputs = xlnet_cl_model(**batch)
        loss = loss_fn(outputs.logits, batch["labels"])
        loss.backward()

        # Optimization step
        optimizer.step()
        lr_scheduler.step()

        # Metrics tracking
        total_loss += loss.item()
        total_correct += (outputs.logits.argmax(dim=1) == batch["labels"]).sum().item()
        
        loop.set_description(f"Epoch {epoch+1}/{epochs}")
        loop.set_postfix(loss=loss.item(), acc=total_correct / len(train_dataloader2.dataset))

print("Training Complete!")

Epoch 1/4: 100%|██████████| 250/250 [48:38<00:00, 11.68s/it, acc=0.555, loss=0.873] 
Epoch 2/4: 100%|██████████| 250/250 [38:12<00:00,  9.17s/it, acc=0.741, loss=0.857]
Epoch 3/4: 100%|██████████| 250/250 [37:27<00:00,  8.99s/it, acc=0.786, loss=0.616]
Epoch 4/4: 100%|██████████| 250/250 [37:40<00:00,  9.04s/it, acc=0.796, loss=0.802]

Training Complete!





In [13]:
# Save XLNet model & tokenizer
xlnet_cl_model.save_pretrained("xlnet_finetuned_emotion")
xlnet_tokenizer.save_pretrained("xlnet_finetuned_emotion")

('xlnet_finetuned_emotion\\tokenizer_config.json',
 'xlnet_finetuned_emotion\\special_tokens_map.json',
 'xlnet_finetuned_emotion\\tokenizer.json')

In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

test_data = dataset["test"]

model_paths = {
    "bert": "bert_finetuned_emotion",   
    "roberta": "roberta_finetuned_emotion",  
    "xlnet": "xlnet_finetuned_emotion" 
}

# Define function for testing
def test_model(model_name, model_path, test_data):
    print(f"\nTesting {model_name.upper()} Model...")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()  # Set model to evaluation mode

    # Convert dataset to PyTorch DataLoader
    test_dataloader = DataLoader(test_data, batch_size=16)

    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            # Tokenize input texts properly
            text = batch["text"]  # Extract text from batch
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

            # Move tensors to model's device (CPU/GPU)
            inputs = {key: val.to(model.device) for key, val in inputs.items()}
            labels = batch["label"].to(model.device)

            # Get model outputs
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)

            # Store predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Calculate overall metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f"{model_name.upper()} Test Metrics:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")

    # Per-class metrics
    print("\nPer-Class Metrics:")
    report = classification_report(all_labels, all_preds, digits=4)
    print(classification_report(all_labels, all_preds, digits=4))

    with open("classification_report_" + model_name + ".txt", "w") as f:
        f.write(report)
    
    return all_preds, all_labels

# Run testing for each model
results = {}
for model_name, model_path in model_paths.items():
    preds, labels = test_model(model_name, model_path, test_data)
    results[model_name] = (preds, labels)



Testing BERT Model...


100%|██████████| 125/125 [00:51<00:00,  2.44it/s]


BERT Test Metrics:
  Accuracy: 0.9295
  Precision: 0.9304
  Recall: 0.9295
  F1-score: 0.9298

Per-Class Metrics:
              precision    recall  f1-score   support

           0     0.9672    0.9639    0.9655       581
           1     0.9481    0.9468    0.9474       695
           2     0.8397    0.8239    0.8317       159
           3     0.9273    0.9273    0.9273       275
           4     0.9091    0.8929    0.9009       224
           5     0.7237    0.8333    0.7746        66

    accuracy                         0.9295      2000
   macro avg     0.8859    0.8980    0.8913      2000
weighted avg     0.9304    0.9295    0.9298      2000


Testing ROBERTA Model...


100%|██████████| 125/125 [00:49<00:00,  2.51it/s]


ROBERTA Test Metrics:
  Accuracy: 0.9245
  Precision: 0.9267
  Recall: 0.9245
  F1-score: 0.9246

Per-Class Metrics:
              precision    recall  f1-score   support

           0     0.9651    0.9518    0.9584       581
           1     0.9610    0.9223    0.9413       695
           2     0.7889    0.8931    0.8378       159
           3     0.9176    0.9309    0.9242       275
           4     0.8594    0.9554    0.9049       224
           5     0.8269    0.6515    0.7288        66

    accuracy                         0.9245      2000
   macro avg     0.8865    0.8842    0.8825      2000
weighted avg     0.9267    0.9245    0.9246      2000


Testing XLNET Model...


100%|██████████| 125/125 [01:11<00:00,  1.75it/s]

XLNET Test Metrics:
  Accuracy: 0.8055
  Precision: 0.8061
  Recall: 0.8055
  F1-score: 0.8051

Per-Class Metrics:
              precision    recall  f1-score   support

           0     0.8487    0.8399    0.8443       581
           1     0.8799    0.8647    0.8723       695
           2     0.6053    0.5786    0.5916       159
           3     0.7897    0.7782    0.7839       275
           4     0.7034    0.8259    0.7598       224
           5     0.5536    0.4697    0.5082        66

    accuracy                         0.8055      2000
   macro avg     0.7301    0.7262    0.7267      2000
weighted avg     0.8061    0.8055    0.8051      2000






In [15]:
def get_embedding(text, model_name):
    
    tokenizer = AutoTokenizer.from_pretrained(model_name + "_finetuned_emotion")
    model = AutoModelForSequenceClassification.from_pretrained(model_name + "_finetuned_emotion")
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs,  output_hidden_states=True)

    # Extract hidden states
    hidden_states = outputs.hidden_states

    if model_name in ["bert", "roberta"]:
    # Get CLS token embedding (pooled output)
        cls_embedding = hidden_states[-1][:, 0, :]  # (batch_size, hidden_dim)

    elif model_name == "xlnet":
    # XLNet doesn't have CLS token, so take last token's hidden state
        cls_embedding = hidden_states[-1][:, -1, :] 
    
    return cls_embedding.squeeze(0).numpy()

# explanation : https://chatgpt.com/share/67dd43dc-27d4-8002-bae8-fc18bc074e0d


In [16]:

# Prepare dataset embeddings
def prepare_data(dataset, model_name):
    texts = dataset['text']
    labels = dataset['label']
    
    # embeddings = np.array([get_embedding(text, model_name) for text in texts])
    embeddings_list = []
    tem=0
    for text in texts:
        tem=tem+1
        print(f"{model_name} - sentence {tem}")  # Print message
        embedding = get_embedding(text, model_name)
        embeddings_list.append(embedding)

    embeddings = np.array(embeddings_list)

    # dimensions for embeddings ---> (no.of samples * 768)
    return embeddings, np.array(labels)

# explanation : https://chatgpt.com/share/67dd486b-28c8-8002-83a0-3412887e8924


In [17]:
# Get embeddings for all three datasets

bert_train, y_train = prepare_data(dataset["train"], "bert")
bert_val, y_val  = prepare_data(dataset["validation"], "bert")
bert_test, y_test = prepare_data(dataset["test"], "bert")


bert - sentence 1
bert - sentence 2
bert - sentence 3
bert - sentence 4
bert - sentence 5
bert - sentence 6
bert - sentence 7
bert - sentence 8
bert - sentence 9
bert - sentence 10
bert - sentence 11
bert - sentence 12
bert - sentence 13
bert - sentence 14
bert - sentence 15
bert - sentence 16
bert - sentence 17
bert - sentence 18
bert - sentence 19
bert - sentence 20
bert - sentence 21
bert - sentence 22
bert - sentence 23
bert - sentence 24
bert - sentence 25
bert - sentence 26
bert - sentence 27
bert - sentence 28
bert - sentence 29
bert - sentence 30
bert - sentence 31
bert - sentence 32
bert - sentence 33
bert - sentence 34
bert - sentence 35
bert - sentence 36
bert - sentence 37
bert - sentence 38
bert - sentence 39
bert - sentence 40
bert - sentence 41
bert - sentence 42
bert - sentence 43
bert - sentence 44
bert - sentence 45
bert - sentence 46
bert - sentence 47
bert - sentence 48
bert - sentence 49
bert - sentence 50
bert - sentence 51
bert - sentence 52
bert - sentence 53
be

In [18]:

roberta_train, _ = prepare_data(dataset["train"], "roberta")
roberta_val, _ = prepare_data(dataset["validation"], "roberta")
roberta_test, _ = prepare_data(dataset["test"], "roberta")

roberta - sentence 1
roberta - sentence 2
roberta - sentence 3
roberta - sentence 4
roberta - sentence 5
roberta - sentence 6
roberta - sentence 7
roberta - sentence 8
roberta - sentence 9
roberta - sentence 10
roberta - sentence 11
roberta - sentence 12
roberta - sentence 13
roberta - sentence 14
roberta - sentence 15
roberta - sentence 16
roberta - sentence 17
roberta - sentence 18
roberta - sentence 19
roberta - sentence 20
roberta - sentence 21
roberta - sentence 22
roberta - sentence 23
roberta - sentence 24
roberta - sentence 25
roberta - sentence 26
roberta - sentence 27
roberta - sentence 28
roberta - sentence 29
roberta - sentence 30
roberta - sentence 31
roberta - sentence 32
roberta - sentence 33
roberta - sentence 34
roberta - sentence 35
roberta - sentence 36
roberta - sentence 37
roberta - sentence 38
roberta - sentence 39
roberta - sentence 40
roberta - sentence 41
roberta - sentence 42
roberta - sentence 43
roberta - sentence 44
roberta - sentence 45
roberta - sentence 

In [20]:

xlnet_train, _ = prepare_data(dataset["train"], "xlnet")
xlnet_val, _ = prepare_data(dataset["validation"], "xlnet")
xlnet_test, _ = prepare_data(dataset["test"], "xlnet")

xlnet - sentence 1
xlnet - sentence 2
xlnet - sentence 3
xlnet - sentence 4
xlnet - sentence 5
xlnet - sentence 6
xlnet - sentence 7
xlnet - sentence 8
xlnet - sentence 9
xlnet - sentence 10
xlnet - sentence 11
xlnet - sentence 12
xlnet - sentence 13
xlnet - sentence 14
xlnet - sentence 15
xlnet - sentence 16
xlnet - sentence 17
xlnet - sentence 18
xlnet - sentence 19
xlnet - sentence 20
xlnet - sentence 21
xlnet - sentence 22
xlnet - sentence 23
xlnet - sentence 24
xlnet - sentence 25
xlnet - sentence 26
xlnet - sentence 27
xlnet - sentence 28
xlnet - sentence 29
xlnet - sentence 30
xlnet - sentence 31
xlnet - sentence 32
xlnet - sentence 33
xlnet - sentence 34
xlnet - sentence 35
xlnet - sentence 36
xlnet - sentence 37
xlnet - sentence 38
xlnet - sentence 39
xlnet - sentence 40
xlnet - sentence 41
xlnet - sentence 42
xlnet - sentence 43
xlnet - sentence 44
xlnet - sentence 45
xlnet - sentence 46
xlnet - sentence 47
xlnet - sentence 48
xlnet - sentence 49
xlnet - sentence 50
xlnet - s

In [27]:
# Save embeddings and labels

np.save("bert_train.npy", bert_train)
np.save("bert_val.npy", bert_val)
np.save("bert_test.npy", bert_test)

np.save("roberta_train.npy", roberta_train)
np.save("roberta_val.npy", roberta_val)
np.save("roberta_test.npy", roberta_test)

np.save("xlnet_train.npy", xlnet_train)
np.save("xlnet_val.npy", xlnet_val)
np.save("xlnet_test.npy", xlnet_test)

np.save("y_train.npy", y_train)
np.save("y_val.npy", y_val)
np.save("y_test.npy", y_test)
