<a href="https://colab.research.google.com/github/Armancollab/GoogleCollab/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd

class YourDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes paired input/target texts on access.

    Each item is a pair ``(input_ids, target_ids)`` of plain Python lists, as
    returned by ``tokenizer.encode``. The two texts are tokenized
    independently, so the two lists generally differ in length.
    """

    def __init__(self, inputs, targets, tokenizer, max_length=256):
        """Store the raw texts and the tokenizer.

        Args:
            inputs: Indexable collection of input texts.
            targets: Indexable collection of target texts, parallel to
                ``inputs``.
            tokenizer: Object exposing
                ``encode(text, add_special_tokens=..., truncation=...,
                max_length=...)``.
            max_length: Upper bound on the token count of each encoded
                sequence; longer texts are truncated by the tokenizer.

        Raises:
            ValueError: If ``inputs`` and ``targets`` differ in length.
        """
        # Fail at construction time rather than with an IndexError (or a
        # silently ignored tail) in the middle of an epoch.
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length, "
                f"got {len(inputs)} and {len(targets)}"
            )
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize one text with the shared truncation settings."""
        return self.tokenizer.encode(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
        )

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` for the pair at ``idx``."""
        return self._encode(self.inputs[idx]), self._encode(self.targets[idx])

def collate_fn(batch, input_pad_id=0, label_pad_id=-100):
    """Pad a batch of ``(input_ids, target_ids)`` pairs into two tensors.

    Every sequence, input or target, is right-padded to the length of the
    longest sequence in the batch, so both returned tensors share the shape
    ``(len(batch), max_length)``.

    Args:
        batch: Non-empty sequence of ``(input_ids, target_ids)`` pairs, each
            a list of token ids.
        input_pad_id: Id used to fill the padded input positions.
        label_pad_id: Id used to fill the padded target positions. The
            default, -100, is the ignore index of
            ``torch.nn.CrossEntropyLoss``, so padding does not contribute to
            the loss when the targets are used as labels. Padding labels
            with 0 would instead train the model to predict token id 0 at
            every padded position.

    Returns:
        Tuple ``(inputs, targets)`` of integer tensors.
    """
    inputs, targets = zip(*batch)

    # One common length for inputs and targets keeps the two tensors aligned
    # position by position.
    max_length = max(len(seq) for seq in inputs + targets)

    padded_inputs = [
        list(seq) + [input_pad_id] * (max_length - len(seq)) for seq in inputs
    ]
    padded_targets = [
        list(seq) + [label_pad_id] * (max_length - len(seq)) for seq in targets
    ]

    return torch.tensor(padded_inputs), torch.tensor(padded_targets)

def main():
    """Fine-tune GPT-2 on prompt/response pairs and save the result.

    Reads the ``prompt`` and ``response`` columns of a parquet file, wraps
    them in ``YourDataset``, trains ``GPT2LMHeadModel`` with Adam and
    gradient accumulation, prints progress, and writes the model to
    ``/content/models``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print("Loading the dataset...")
    df = pd.read_parquet('/content/sample_data/train-00003-of-00004-ef0356d35c1172f0.parquet')
    print("Dataset loaded successfully.")

    print("Accessing columns...")
    input_texts = df['prompt'].tolist()
    target_texts = df['response'].tolist()
    print("Columns accessed successfully.")

    print(f"Number of input texts: {len(input_texts)}")
    print(f"Number of target texts: {len(target_texts)}")

    print("Preprocessing the data...")
    # Your preprocessing code here
    print("Data preprocessed successfully.")

    print("Creating the training dataset...")
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    train_data = YourDataset(input_texts, target_texts, tokenizer)
    print("Training dataset created successfully.")

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    # from_pretrained() returns the model in evaluation mode; switch to
    # training mode so dropout is active while fine-tuning.
    model.train()

    # Set up other training parameters
    batch_size = 4  # Reduce batch size
    learning_rate = 1e-4
    num_epochs = 10
    gradient_accumulation_steps = 4  # Accumulate gradients every 4 batches

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # shuffle=True draws a new order each time the loader is iterated, so a
    # single loader can serve every epoch.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    num_batches = len(train_loader)

    print("Starting the training loop...")
    print_interval = 100  # Adjust the interval to your preference

    for epoch in range(num_epochs):
        interval_loss = 0.0   # reset after every progress line
        epoch_loss_sum = 0.0  # never reset inside the epoch

        # Clear gradients once per accumulation window, not once per batch;
        # zeroing on every batch would discard what was accumulated.
        optimizer.zero_grad()

        for i, batch in enumerate(train_loader):
            inputs, targets = batch

            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            # NOTE(review): the labels come from the separately tokenized
            # response rather than from `inputs` itself -- confirm this is
            # the intended language-modeling objective.
            outputs = model(inputs, labels=targets)
            loss = outputs.loss

            # Scale so the accumulated gradient is the mean over the window
            # instead of the sum.
            (loss / gradient_accumulation_steps).backward()

            # Step at the end of each window, and on the last batch so that
            # a partial window is not dropped.
            is_last_batch = (i + 1) == num_batches
            if (i + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()

            batch_loss = loss.item()
            interval_loss += batch_loss
            epoch_loss_sum += batch_loss

            # Print the average loss at regular intervals
            if (i + 1) % print_interval == 0:
                average_loss = interval_loss / print_interval
                print(f'Epoch {epoch+1}/{num_epochs} - Batch {i+1}/{num_batches} - Loss: {average_loss:.4f}')
                interval_loss = 0.0

        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        epoch_loss = epoch_loss_sum / max(num_batches, 1)
        print(f'Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}')

    print("Training completed.")

    # Save the trained model
    print("Saving the trained model...")
    model.save_pretrained('/content/models')
    print("Model saved successfully.")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


Using device: cuda
Loading the dataset...
Dataset loaded successfully.
Accessing columns...
Columns accessed successfully.
Number of input texts: 202203
Number of target texts: 202203
Preprocessing the data...
Data preprocessed successfully.
Creating the training dataset...


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Training dataset created successfully.


Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Starting the training loop...
Epoch 1/10 - Batch 100/50551 - Loss: 5.1510
Epoch 1/10 - Batch 200/50551 - Loss: 4.1704
Epoch 1/10 - Batch 300/50551 - Loss: 4.2081
Epoch 1/10 - Batch 400/50551 - Loss: 3.9009
Epoch 1/10 - Batch 500/50551 - Loss: 4.1758
Epoch 1/10 - Batch 600/50551 - Loss: 3.9663
Epoch 1/10 - Batch 700/50551 - Loss: 3.9095
Epoch 1/10 - Batch 800/50551 - Loss: 4.1139
Epoch 1/10 - Batch 900/50551 - Loss: 3.9101
Epoch 1/10 - Batch 1000/50551 - Loss: 3.9735
Epoch 1/10 - Batch 1100/50551 - Loss: 3.9773
Epoch 1/10 - Batch 1200/50551 - Loss: 3.9460
Epoch 1/10 - Batch 1300/50551 - Loss: 3.8896
Epoch 1/10 - Batch 1400/50551 - Loss: 3.8908
Epoch 1/10 - Batch 1500/50551 - Loss: 3.7159
Epoch 1/10 - Batch 1600/50551 - Loss: 4.2274
Epoch 1/10 - Batch 1700/50551 - Loss: 3.9539
Epoch 1/10 - Batch 1800/50551 - Loss: 3.8903
Epoch 1/10 - Batch 1900/50551 - Loss: 3.8171
Epoch 1/10 - Batch 2000/50551 - Loss: 3.8406
Epoch 1/10 - Batch 2100/50551 - Loss: 4.0802
Epoch 1/10 - Batch 2200/50551 - Lo