In [119]:
import torch
from tqdm import tqdm  # Import tqdm for the progress bar

from transformers import BertTokenizer, BertModel
from transformers import LongformerTokenizer, LongformerModel
from transformers import MPNetTokenizer, MPNetModel
from transformers import ElectraTokenizer, ElectraModel

# Load pre-trained BERT model and tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name).to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

# model_name = 'allenai/longformer-base-4096'
# model = LongformerModel.from_pretrained(model_name).to(device)
# tokenizer = LongformerTokenizer.from_pretrained(model_name)

# model_name = 'google/electra-base-discriminator'
# model = ElectraModel.from_pretrained(model_name).to(device)
# tokenizer = ElectraTokenizer.from_pretrained(model_name)

In [116]:
import pandas as pd
# data = pd.read_csv('/home/woody/iwso/iwso092h/student_summaries/commonlit-evaluate-student-summaries/summaries_train.csv')
data = {'text': ["This is the first sentence.", "Another example sentence.", "Longformer can handle long documents efficiently."]}
df = pd.DataFrame(data)

max_length = 512  # Maximum token length for the ELECTRA model
df['tokens'] = df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_length, truncation=True))
df['tokens_tensor'] = df['tokens'].apply(lambda x: torch.tensor(x).to(device))  # Convert to tensor

In [120]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [122]:
embeddings_list = []

with tqdm(total=len(df)) as pbar:
    # Get ELECTRA embeddings for each row in the DataFrame
    with torch.no_grad():
        for index, row in df.iterrows():
            tokens_tensor = torch.tensor(row['tokens']).unsqueeze(0).to(device)  # Add batch dimension
            embeddings = model(tokens_tensor).last_hidden_state
            embeddings_list.append(embeddings.cpu().numpy())
            pbar.update(1)  # Update progress bar
    
# df.to_csv('/home/woody/iwso/iwso092h/student_summaries/long_former_embeddings.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 75.11it/s]


In [65]:
# import numpy as np
# embeddings = np.load('embeddings.npy')

In [None]:
["This is the first sentence.", 
 "Another example sentence.", 
 "Longformer can handle long documents efficiently."]

In [130]:
embeddings_list[0].shape

(1, 8, 768)

In [115]:
sub_emb = embeddings_list[0][0]
print(sub_emb.shape)
print(sub_emb)


(71, 768)
[[ 0.20459935  0.0734762  -0.27840707 ...  0.13848153 -0.31278545
   0.3603602 ]
 [ 0.17001134 -0.09808327  0.25172633 ...  0.02908664 -0.2305985
  -0.01166437]
 [ 0.24359564 -0.07064409  0.28362384 ... -0.18333077 -0.34113634
  -0.07045442]
 ...
 [ 0.1271995  -0.19659182 -0.42378888 ... -0.6986803   0.24811108
   0.10199846]
 [ 0.6065582   0.11888039  0.4191618  ...  0.05457393 -0.52178663
   1.1179227 ]
 [ 0.2045996   0.07347596 -0.27840698 ...  0.13848154 -0.31278557
   0.36036035]]


In [100]:
for i in range(0,20):
    print(embeddings_list[i].shape)

(1, 71, 768)
(1, 58, 768)
(1, 289, 768)
(1, 41, 768)
(1, 263, 768)
(1, 50, 768)
(1, 91, 768)
(1, 57, 768)
(1, 61, 768)
(1, 39, 768)
(1, 33, 768)
(1, 129, 768)
(1, 91, 768)
(1, 86, 768)
(1, 76, 768)
(1, 53, 768)
(1, 72, 768)
(1, 200, 768)
(1, 96, 768)
(1, 68, 768)


In [101]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Define the neural network model
class RegressionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RegressionModel, self).__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        _, (h_n, _) = self.rnn(x)
        output = self.fc(h_n[-1])
        return output

# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, embeddings, targets):
        self.embeddings = embeddings
        self.targets = targets
    
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        return self.embeddings[idx], self.targets[idx]

# Define hyperparameters
input_dim = 768  # Dimensionality of the embeddings
hidden_dim = 128
output_dim = 2  # Number of target values
learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Create model, loss function, and optimizer
model = RegressionModel(input_dim, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create dataset and data loader
train_dataset = CustomDataset(embeddings_list, data.content)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    
    for embeddings, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")

RuntimeError: stack expects each tensor to be equal size, but got [1, 174, 768] at entry 0 and [1, 164, 768] at entry 1