<a href="https://colab.research.google.com/github/BuyiseloMonne/nucleusbot/blob/NLP/preparing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pandas as pd
from google.colab import files

# Upload CSV file to Colab
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load dataset
df = pd.read_csv(filename)
print(df.columns)

# Tokenizer definition
class BertTokenizerWrapper:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.pad_id = self.tokenizer.pad_token_id
        self.unk_id = self.tokenizer.unk_token_id

    def encode(self, text, max_length=None):
        encodings = self.tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding='max_length' if max_length else False,
            truncation=True,
            return_tensors='pt'
        )
        return encodings['input_ids'].squeeze(0).tolist()

    def decode(self, ids):
        return self.tokenizer.decode(ids, skip_special_tokens=True)

# Dataset definition
class CustomDataset(Dataset):
    def __init__(self, inputs, outputs, tokenizer, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]

        input_encodings = self.tokenizer.encode(input_text, max_length=self.max_length)
        output_encodings = self.tokenizer.encode(output_text, max_length=self.max_length)

        input_ids = torch.tensor(input_encodings, dtype=torch.long)
        target_ids = torch.tensor(output_encodings, dtype=torch.long)

        input_attention_mask = [1 if id != self.tokenizer.pad_id else 0 for id in input_encodings]
        output_attention_mask = [1 if id != self.tokenizer.pad_id else 0 for id in output_encodings]

        input_attention_mask = torch.tensor(input_attention_mask, dtype=torch.bool)
        output_attention_mask = torch.tensor(output_attention_mask, dtype=torch.bool)

        return input_ids, target_ids, input_attention_mask, output_attention_mask

# Model definition
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src = self.embedding(src)
        tgt = self.embedding(tgt)

        memory = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)
        output = self.transformer_decoder(tgt, memory, memory_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        return self.fc(output)

# Training setup
batch_size = 32
sequence_length = 128
tokenizer = BertTokenizerWrapper()

# Initialize dataset and dataloader
dataset = CustomDataset(inputs=df['Input'].tolist(), outputs=df['Output'].tolist(), tokenizer=tokenizer, max_length=sequence_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize model
model = TransformerModel(
    vocab_size=len(tokenizer.tokenizer),  # BERT vocab size
    embed_dim=256,
    num_heads=4,
    ff_dim=1024,
    num_layers=4
)

optimizer = AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()
    for batch_idx, (input_ids, output_ids, input_attention_mask, output_attention_mask) in enumerate(dataloader):
        # Prepare target input for the model
        tgt_input = torch.roll(output_ids, shifts=-1, dims=1)
        tgt_input[:, -1] = tokenizer.pad_id

        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            src=input_ids,
            tgt=tgt_input,
            src_key_padding_mask=input_attention_mask,
            tgt_key_padding_mask=output_attention_mask
        )

        # Reshape outputs and calculate loss
        outputs = outputs.view(-1, len(tokenizer.tokenizer))
        loss = criterion(outputs, output_ids.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    scheduler.step()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')


Saving sheet.csv to sheet (2).csv
Index(['Input', 'Output'], dtype='object')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Epoch 2, Loss: 3.5950420796871185
Epoch 3, Loss: 3.1906862556934357
Epoch 4, Loss: 2.9761333763599396
Epoch 5, Loss: 2.7899497151374817


In [96]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'transformer_model.pth')

In [98]:
# Load the saved model
# Initialize model
model = TransformerModel(
    vocab_size=tokenizer.tokenizer.vocab_size,  # Correctly get vocab size from BERT tokenizer
    embed_dim=256,
    num_heads=4,
    ff_dim=1024,
    num_layers=4
)

model.load_state_dict(torch.load('transformer_model.pth'))
model.eval()


  model.load_state_dict(torch.load('transformer_model.pth'))


TransformerModel(
  (embedding): Embedding(30522, 256)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (linear1): Linear(in_features=256, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=1024, out_features=256, bias=True)
    (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dro

In [112]:
import torch
from transformers import BertTokenizer  # Example for a generic BERT tokenizer

# Define padding token ID manually (commonly 0 for BERT models)
PAD_TOKEN_ID = 0

def generate_response(input_text, tokenizer, model, max_length=128):
    # Tokenize and encode the input text
    input_encodings = tokenizer.encode(input_text)  # No special tokens added

    # Manually handle padding and truncation
    if len(input_encodings) > max_length:
        input_encodings = input_encodings[:max_length]
    else:
        input_encodings += [tokenizer.pad_token_id] * (max_length - len(input_encodings))

    input_ids = torch.tensor(input_encodings).unsqueeze(0)  # Add batch dimension
    attention_mask = torch.tensor([1] * len(input_encodings) + [0] * (max_length - len(input_encodings))).unsqueeze(0)  # Create attention mask

    model.eval()
    with torch.no_grad():
        # Prepare target input for the model
        tgt_input = torch.full((1, max_length), tokenizer.pad_token_id, dtype=torch.long)  # Adjust if needed

        # Generate response
        outputs = model(input_ids=input_ids, decoder_input_ids=tgt_input, attention_mask=attention_mask)

        # Check if output is as expected
        print("Model Output Shape:", outputs.logits.shape)

        # Apply softmax to get probabilities
        output_probs = torch.softmax(outputs.logits, dim=-1)
        output_probs = output_probs.squeeze(0)  # Remove batch dimension

        # Check if output_probs is valid
        print("Output Probabilities Shape:", output_probs.shape)

        # Get the predicted token IDs
        output_ids = torch.argmax(output_probs, dim=-1).squeeze().tolist()

        # Debugging output IDs
        print("Predicted Output IDs:", output_ids)

        # Remove padding tokens by ID
        output_ids = [id for id in output_ids if id != tokenizer.pad_token_id]

        # Check if output_ids is not empty
        if not output_ids:
            return "No valid tokens generated"

        # Decode the output IDs to text
        response = tokenizer.decode(output_ids, skip_special_tokens=True)

    return response


In [122]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

input_text = "How does the T5 model work?"

response = generate_response(input_text, tokenizer, model)
print("Generated Response:", response)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model Output Shape: torch.Size([1, 128, 32128])
Output Probabilities Shape: torch.Size([128, 32128])
Predicted Output IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Generated Response: No valid tokens generated
