<a href="https://colab.research.google.com/github/BuyiseloMonne/nucleusbot/blob/NLP/preparing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
from google.colab import files

# Upload CSV file to Colab
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load dataset
df = pd.read_csv(filename)
print(df.columns)

# Tokenizer definition
class SimpleTokenizer:
    def __init__(self, texts, vocab_size=10000, unk_token='<unk>', pad_token='<pad>'):
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.vocab = self.build_vocab(texts, vocab_size)
        self.word_to_id = {word: idx for idx, word in enumerate(self.vocab)}
        self.id_to_word = {idx: word for word, idx in self.word_to_id.items()}
        self.unk_id = self.word_to_id.get(unk_token, None)
        self.pad_id = self.word_to_id.get(pad_token, None)

    def build_vocab(self, texts, vocab_size):
        tokens = [word for text in texts for word in self.tokenize(text)]
        token_counts = Counter(tokens)
        vocab = [self.pad_token, self.unk_token] + [word for word, _ in token_counts.most_common(vocab_size - 2)]
        return vocab

    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text.lower())

    def encode(self, text, max_length=None):
        tokens = self.tokenize(text)
        encoded = [self.word_to_id.get(token, self.unk_id) for token in tokens]
        if max_length:
            if len(encoded) > max_length:
                encoded = encoded[:max_length]
            else:
                encoded += [self.pad_id] * (max_length - len(encoded))
        return encoded

    def decode(self, ids):
        return ' '.join([self.id_to_word.get(idx, self.unk_token) for idx in ids])

# Dataset definition
class CustomDataset(Dataset):
    def __init__(self, inputs, outputs, tokenizer, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]

        input_encodings = self.tokenizer.encode(input_text, max_length=self.max_length)
        output_encodings = self.tokenizer.encode(output_text, max_length=self.max_length)

        input_attention_mask = [1 if id != self.tokenizer.pad_id else 0 for id in input_encodings]
        output_attention_mask = [1 if id != self.tokenizer.pad_id else 0 for id in output_encodings]

        input_ids = torch.tensor(input_encodings, dtype=torch.long)
        target_ids = torch.tensor(output_encodings, dtype=torch.long)
        input_attention_mask = torch.tensor(input_attention_mask, dtype=torch.bool)
        output_attention_mask = torch.tensor(output_attention_mask, dtype=torch.bool)

        return input_ids, target_ids, input_attention_mask, output_attention_mask

# Model definition
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Set batch_first=True for the Transformer layers
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None):
        src = self.embedding(src)
        tgt = self.embedding(tgt)

        # No need to squeeze if batch_first=True
        memory = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)
        output = self.transformer_decoder(tgt, memory, memory_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        return self.fc(output)


# Training setup
batch_size = 32
sequence_length = 128
tokenizer = SimpleTokenizer(texts=df['Input'].tolist() + df['Output'].tolist())
dataset = CustomDataset(inputs=df['Input'].tolist(), outputs=df['Output'].tolist(), tokenizer=tokenizer, max_length=sequence_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = TransformerModel(
    vocab_size=len(tokenizer.word_to_id),
    embed_dim=256,  # Reduced from 512
    num_heads=4,    # Reduced from 8
    ff_dim=1024,    # Reduced from 2048
    num_layers=4    # Reduced from 6
)

optimizer = AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)  # Optional scheduler

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()
    for batch_idx, (input_ids, output_ids, input_attention_mask, output_attention_mask) in enumerate(dataloader):
        tgt_input = torch.roll(output_ids, shifts=-1, dims=1)
        tgt_input[:, -1] = tokenizer.pad_id

        optimizer.zero_grad()

        outputs = model(
            src=input_ids,
            tgt=tgt_input,
            src_key_padding_mask=input_attention_mask,
            tgt_key_padding_mask=output_attention_mask
        )

        outputs = outputs.view(-1, len(tokenizer.word_to_id))
        loss = criterion(outputs, output_ids.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    scheduler.step()
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')


Saving sheet.csv to sheet (1).csv
Index(['Input', 'Output'], dtype='object')
Epoch 1, Loss: 3.0717904567718506
Epoch 2, Loss: 1.4587529003620148
Epoch 3, Loss: 1.383529469370842
Epoch 4, Loss: 1.3047321736812592
Epoch 5, Loss: 1.1655197739601135


In [3]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'transformer_model.pth')

In [None]:
# Load the saved model
model = model = TransformerModel(
    vocab_size=len(tokenizer.word_to_id),
    embed_dim=256,  # Reduced from 512
    num_heads=4,    # Reduced from 8
    ff_dim=1024,    # Reduced from 2048
    num_layers=4    # Reduced from 6
)
model.load_state_dict(torch.load('transformer_model.pth'))
model.eval()


In [28]:
def generate_response(input_text, tokenizer, model, max_length=128):
    # Tokenize the input text
    input_encodings = tokenizer.encode(input_text, max_length=max_length)
    input_ids = torch.tensor(input_encodings).unsqueeze(0)  # Add batch dimension

    model.eval()
    with torch.no_grad():
        # Prepare target input for the model (if needed)
        tgt_input = torch.zeros_like(input_ids)  # Start token can be used if you have one

        # Generate response
        output = model(input_ids, tgt_input)  # Modify as per your model's architecture

        # Get the predicted token IDs
        output_ids = torch.argmax(output, dim=-1).squeeze().tolist()

        # Remove padding tokens
        output_ids = [id for id in output_ids if id != tokenizer.pad_id]

        # Decode the output IDs to text
        response = tokenizer.decode(output_ids)

    return response


In [29]:
# Define the input text
input_text = "Where is Nucleus Devs located?"

# Generate a response using the model
response = generate_response(input_text, tokenizer, model)

# Print the generated response
print(f"Response: {response}")

Response: 


In [19]:
print("Sample input-output pairs:")
for i in range(5):  # Print first 3 samples
    print(f"Input: {df['Input'][i]}")
    print(f"Output: {df['Output'][i]}")


Sample input-output pairs:
Input: What does Nucleus Devs do?
Output: Nucleus Devs specializes in providing innovative software solutions, including custom development, AI integration, and technology consulting.
Input: Where is Nucleus Devs located?
Output: Nucleus Devs is based on Constitution Road, LNIG road, Lesotho Housing in Maseru, Lesotho.
Input: How long has Nucleus Devs been in business?
Output: Nucleus Devs has been operating for several years, with a proven track record in delivering high-quality software solutions.
Input: What industries does Nucleus Devs serve?
Output: We serve various industries including technology, finance, healthcare, and education, providing tailored software solutions to meet their unique needs.
Input: What are the core values of Nucleus Devs?
Output: Our core values include innovation, integrity, customer focus, and excellence. We strive to deliver solutions that not only meet but exceed our clients' expectations.
