In [6]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Jul 10 09:56:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0              43W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Install the datasets library
!pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/547.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB

In [1]:
# Import necessary libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, DataCollatorForLanguageModeling
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import random
import pandas as pd


**2- DEFINE LORA Layer**

Define the custom Low-Rank Adaptation (LoRA) layer.

In [10]:
# Define the LoRA Layer
class LoRALayer(nn.Module):
    def __init__(self, input_dim, rank=4):
        super(LoRALayer, self).__init__()
        self.rank = rank
        self.A = nn.Parameter(torch.randn(input_dim, rank))
        self.B = nn.Parameter(torch.randn(rank, input_dim))

    def forward(self, x):
        return x + torch.matmul(torch.matmul(x, self.A), self.B)


In [None]:
3. Modify GPT-2 to Include LoRA Layers
Create a new class to modify the GPT-2 model to include the LoRA layers.

In [11]:
# Modify GPT-2 to include LoRA layers
class GPT2WithLoRA(nn.Module):
    def __init__(self, model_name='gpt2', rank=4):
        super(GPT2WithLoRA, self).__init__()
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.rank = rank
        self.add_lora_layers()

    def add_lora_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear):
                input_dim = module.in_features
                lora_layer = LoRALayer(input_dim, self.rank)
                module.add_module('lora', lora_layer)

    def forward(self, input_ids, attention_mask=None, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)


4. Load Pre-trained Model and Tokenizer
Load the pre-trained GPT-2 model and tokenizer, and set the padding token.

In [12]:
# Load the pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as pad token
model = GPT2WithLoRA('gpt2')


5. Define Optimizer and Loss Function
Set up the optimizer and loss function.

In [13]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()




6. Load and Prepare Dataset
Load the WikiText-103 dataset, sample a subset, and prepare it for training.

In [14]:
# Load WikiText-103 dataset
wikitext = load_dataset('wikitext', 'wikitext-103-v1')

# Convert the dataset to a list of dictionaries
train_samples = [sample for sample in wikitext['train']]

# Use a small sample of 100 rows for quick testing
sample_size = 100
train_sampled = random.sample(train_samples, sample_size)
train_df = pd.DataFrame(train_sampled)



7. Prepare Dataset for the Model
Prepare the dataset using a custom TextDataset class and a data loader with padding handled by a data collator.

In [18]:
# Prepare the dataset for the model
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]['text']
        inputs = self.tokenizer(text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")
        input_ids = inputs['input_ids'].squeeze().long()  # Ensure LongTensor
        attention_mask = inputs['attention_mask'].squeeze().long()  # Ensure LongTensor
        return input_ids, attention_mask, input_ids


train_dataset = TextDataset(train_df, tokenizer)


In [19]:
# Custom collator function to handle padding and formatting
def custom_collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    labels = [item[2] for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id).long()
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0).long()
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id).long()

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }


8. Training Loop
Train the model with LoRA for a specified number of epochs and print the average loss.

In [21]:
# Training loop for the model with LoRA
model.train()
num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")



Epoch 1, Loss: 0.4187
Epoch 2, Loss: 0.3918
Epoch 3, Loss: 0.3676


9. Save the Model
Save the fine-tuned model and tokenizer.

In [22]:
# Save the model's state dictionary

torch.save(model.state_dict(), 'gpt2_with_lora_state_dict.pth')



In [23]:
# Save the tokenizer

tokenizer.save_pretrained('gpt2_with_lora_tokenizer')

('gpt2_with_lora_tokenizer/tokenizer_config.json',
 'gpt2_with_lora_tokenizer/special_tokens_map.json',
 'gpt2_with_lora_tokenizer/vocab.json',
 'gpt2_with_lora_tokenizer/merges.txt',
 'gpt2_with_lora_tokenizer/added_tokens.json')

Explanation:
-Imports: Import necessary libraries for model handling, dataset loading, and training.

-LoRA Layer: Define a custom layer that adds low-rank adaptation to the model.

-Model Modification: Modify GPT-2 to include the LoRA layers.

-Load Model: Load the pre-trained GPT-2 model and set the padding token.

-Optimizer and Loss: Set up the optimizer and loss function for training.

-Dataset Loading: Load and sample the WikiText-103 dataset for quick testing.

-Dataset Preparation: Prepare the dataset and data loader with padding handled by a data collator.

-Training Loop: Train the model and print the loss for each epoch.

-Save Model: Save the fine-tuned model and tokenizer for later use.
