<a href="https://colab.research.google.com/github/BuyiseloMonne/nucleusbot/blob/NLP/preparing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
from google.colab import files

# Upload CSV file to Colab
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# Load dataset
df = pd.read_csv(filename)
print(df.columns)

class SimpleTokenizer:
    def __init__(self, texts, vocab_size=10000, unk_token='<unk>', pad_token='<pad>'):
        # Initialize vocab tokens
        self.pad_token = pad_token
        self.unk_token = unk_token

        # Build vocabulary from texts
        self.vocab = self.build_vocab(texts, vocab_size)
        self.word_to_id = {word: idx for idx, word in enumerate(self.vocab)}
        self.id_to_word = {idx: word for word, idx in self.word_to_id.items()}
        self.unk_id = self.word_to_id.get(unk_token, None)
        self.pad_id = self.word_to_id.get(pad_token, None)

    def build_vocab(self, texts, vocab_size):
        tokens = [word for text in texts for word in self.tokenize(text)]
        token_counts = Counter(tokens)
        # Add special tokens
        vocab = [self.pad_token, self.unk_token] + [word for word, _ in token_counts.most_common(vocab_size - 2)]
        return vocab

    def tokenize(self, text):
        return re.findall(r'\b\w+\b', text.lower())

    def encode(self, text, max_length=None):
        tokens = self.tokenize(text)
        encoded = [self.word_to_id.get(token, self.unk_id) for token in tokens]
        if max_length:
            if len(encoded) > max_length:
                encoded = encoded[:max_length]
            else:
                encoded += [self.pad_id] * (max_length - len(encoded))
        return encoded

    def decode(self, ids):
        return ' '.join([self.id_to_word.get(idx, self.unk_token) for idx in ids])


# Define your dataset
class CustomDataset(Dataset):
    def __init__(self, inputs, outputs, tokenizer, max_length):
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        output_text = self.outputs[idx]

        input_encodings = self.tokenizer.encode(input_text, max_length=self.max_length)
        output_encodings = self.tokenizer.encode(output_text, max_length=self.max_length)

        input_ids = torch.tensor(input_encodings, dtype=torch.long)
        target_ids = torch.tensor(output_encodings, dtype=torch.long)
        attention_mask = torch.ones_like(input_ids)  # Dummy attention mask

        return input_ids, target_ids, attention_mask


# Define your Transformer model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=ff_dim,
            batch_first=True  # Set batch_first=True
        )
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        output = self.transformer(src, tgt)
        return self.fc(output)

# Initialize tokenizer and dataset
tokenizer = SimpleTokenizer(texts=df['Input'].tolist() + df['Output'].tolist())
dataset = CustomDataset(inputs=df['Input'].tolist(), outputs=df['Output'].tolist(), tokenizer=tokenizer, max_length=128)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


# Initialize model, optimizer, and loss function
model = TransformerModel(vocab_size=len(tokenizer.word_to_id), embed_dim=512, num_heads=8, ff_dim=2048, num_layers=6)
optimizer = AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_ids, output_ids, attention_mask in dataloader:
        # Shift target sequences by one for the decoder
        tgt_input = torch.roll(input_ids, shifts=-1, dims=1)
        tgt_input[:, -1] = tokenizer.pad_id  # Ensure the last token is a pad token

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, tgt_input)

        # Compute loss
        loss = criterion(outputs.view(-1, len(tokenizer.word_to_id)), output_ids.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}')


Saving sheet.csv to sheet (1).csv
Index(['Input', 'Output'], dtype='object')
Epoch 1, Loss: 2.3087978661060333
Epoch 2, Loss: 1.4120764881372452
Epoch 3, Loss: 1.3629428148269653
Epoch 4, Loss: 1.3358401209115982
Epoch 5, Loss: 1.3180294930934906


In [33]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'transformer_model.pth')


In [34]:
# Load the saved model
model = TransformerModel(vocab_size=len(tokenizer.word_to_id), embed_dim=512, num_heads=8, ff_dim=2048, num_layers=6)
model.load_state_dict(torch.load('transformer_model.pth'))
model.eval()


  model.load_state_dict(torch.load('transformer_model.pth'))


TransformerModel(
  (embedding): Embedding(990, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer(
   

In [45]:
def generate_response(input_text, tokenizer, model, max_length=128):
    # Tokenize the input text
    input_encodings = tokenizer.encode(input_text, max_length=max_length)
    input_ids = torch.tensor(input_encodings).unsqueeze(0)  # Add batch dimension

    model.eval()
    with torch.no_grad():
        # Prepare target input for the model (if needed)
        tgt_input = torch.zeros_like(input_ids)  # Start token can be used if you have one

        # Generate response
        output = model(input_ids, tgt_input)  # Modify as per your model's architecture

        # Get the predicted token IDs
        output_ids = torch.argmax(output, dim=-1).squeeze().tolist()

        # Remove padding tokens
        output_ids = [id for id in output_ids if id != tokenizer.pad_id]

        # Decode the output IDs to text
        response = tokenizer.decode(output_ids)

    return response


In [46]:
# Define the input text
input_text = "Where is Nucleus Devs located?"

# Generate a response using the model
response = generate_response(input_text, tokenizer, model)

# Print the generated response
print(f"Response: {response}")

Response: 


In [42]:
print("Sample input-output pairs:")
for i in range(10):  # Print first 3 samples
    print(f"Input: {df['Input'][i]}")
    print(f"Output: {df['Output'][i]}")


Sample input-output pairs:
Input: What does Nucleus Devs do?
Output: Nucleus Devs specializes in providing innovative software solutions, including custom development, AI integration, and technology consulting.
Input: Where is Nucleus Devs located?
Output: Nucleus Devs is based on Constitution Road, LNIG road, Lesotho Housing in Maseru, Lesotho.
Input: How long has Nucleus Devs been in business?
Output: Nucleus Devs has been operating for several years, with a proven track record in delivering high-quality software solutions.
Input: What industries does Nucleus Devs serve?
Output: We serve various industries including technology, finance, healthcare, and education, providing tailored software solutions to meet their unique needs.
Input: What are the core values of Nucleus Devs?
Output: Our core values include innovation, integrity, customer focus, and excellence. We strive to deliver solutions that not only meet but exceed our clients' expectations.
Input: Can you describe your team?
O