In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
# Install the datasets library
! pip install datasets





[notice] A new release of pip is available: 24.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Import necessary libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, DataCollatorForLanguageModeling
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import random
import pandas as pd
from sklearn.model_selection import train_test_split


**2- DEFINE LORA Layer**

Define the custom Low-Rank Adaptation (LoRA) layer.

In [5]:
# Define the LoRA Layer
class LoRALayer(nn.Module):
    def __init__(self, input_dim, rank=4):
        super(LoRALayer, self).__init__()
        self.rank = rank
        self.A = nn.Parameter(torch.randn(input_dim, rank))
        self.B = nn.Parameter(torch.randn(rank, input_dim))

    def forward(self, x):
        return x + torch.matmul(torch.matmul(x, self.A), self.B)


3. Modify GPT-2 to Include LoRA Layers
Create a new class to modify the GPT-2 model to include the LoRA layers.

In [6]:
# Modify GPT-2 to include LoRA layers
class GPT2WithLoRA(nn.Module):
    def __init__(self, model_name='gpt2', rank=4):
        super(GPT2WithLoRA, self).__init__()
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.rank = rank
        self.add_lora_layers()

    def add_lora_layers(self):
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear):
                input_dim = module.in_features
                lora_layer = LoRALayer(input_dim, self.rank)
                module.add_module('lora', lora_layer)

    def forward(self, input_ids, attention_mask=None, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)


4. Load Pre-trained Model and Tokenizer
Load the pre-trained GPT-2 model and tokenizer, and set the padding token.

In [7]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cpu


In [8]:
# Load the pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set EOS token as pad token
model = GPT2WithLoRA('gpt2').to(device)



5. Define Optimizer and Loss Function
Set up the optimizer and loss function.

In [9]:
# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()




6. Load and Prepare Dataset
Load the WikiText-103 dataset, sample a subset, and prepare it for training.

In [10]:
# Load WikiText-103 dataset
wikitext = load_dataset('wikitext', 'wikitext-103-v1')

# Convert the dataset to a list of dictionaries
train_samples = [sample for sample in wikitext['train']]

# Use a small sample of 500 rows for quick testing
sample_size = 500
train_sampled = random.sample(train_samples, sample_size)
train_df = pd.DataFrame(train_sampled)



Downloading data:   0%|          | 0.00/722k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
train_df

Unnamed: 0,text
192,In accordance with the Marxist doctrine that ...
4,"Some women trained as gymnasts and dancers , ..."
16,= = Personnel = = \n
115,The song created controversy over the writing...
138,= = Parks and recreation = = \n
...,...
8,Irene also caused severe agricultural damage ...
117,
238,= = Lyrics = = \n
132,


In [None]:
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.1)

7. Prepare Dataset for the Model
Prepare the dataset using a custom TextDataset class and a data loader with padding handled by a data collator.

In [34]:
# Prepare the dataset for the model
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]['text']
        inputs = self.tokenizer(text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors="pt")
        input_ids = inputs['input_ids'].squeeze().long()  # Ensure LongTensor
        attention_mask = inputs['attention_mask'].squeeze().long()  # Ensure LongTensor
        return input_ids, attention_mask, input_ids

# Create DataLoader for training and validation datasets
train_dataset = TextDataset(train_df, tokenizer)
val_dataset = TextDataset(val_df, tokenizer)

In [35]:

# Custom collator function to handle padding and formatting
def custom_collate_fn(batch):
    input_ids = [item[0] for item in batch]
    attention_masks = [item[1] for item in batch]
    labels = [item[2] for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id).long()
    attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0).long()
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id).long()

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }




It is important that we increase batch_size from 4 to 16 for better results



In [36]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)

8. Training Loop
Train the model with LoRA for a specified number of epochs and print the average loss.

In [38]:
# Training loop for the model with LoRA
model.train()
num_epochs = 10
patience = 2
best_val_loss = float('inf')
patience_counter = 0


for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Skip empty batches
        if input_ids.size(0) == 0 or input_ids.size(1) == 0:
            continue

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Skip empty batches
            if input_ids.size(0) == 0 or input_ids.size(1) == 0:
                continue

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")



    model.train()


Epoch 1, Loss: 0.6147
Epoch 1, Validation Loss: 0.2732
Epoch 2, Loss: 0.5267
Epoch 2, Validation Loss: 0.2718
Epoch 3, Loss: 0.5297
Epoch 3, Validation Loss: 0.2964
Epoch 4, Loss: 0.6990
Epoch 4, Validation Loss: 0.2965
Epoch 5, Loss: 0.5706
Epoch 5, Validation Loss: 0.2849
Epoch 6, Loss: 0.5355
Epoch 6, Validation Loss: 0.2807
Epoch 7, Loss: 0.4948
Epoch 7, Validation Loss: 0.2776
Epoch 8, Loss: 0.4778
Epoch 8, Validation Loss: 0.2755
Epoch 9, Loss: 0.4686
Epoch 9, Validation Loss: 0.2748
Epoch 10, Loss: 0.4506
Epoch 10, Validation Loss: 0.2737


9. Save the Model
Save the fine-tuned model and tokenizer.

In [39]:

# Save the model's state dictionary

torch.save(model.state_dict(), 'gpt2_with_lora_state_dict.pth')





In [40]:
# Save the tokenizer

tokenizer.save_pretrained('gpt2_with_lora_tokenizer')

('gpt2_with_lora_tokenizer/tokenizer_config.json',
 'gpt2_with_lora_tokenizer/special_tokens_map.json',
 'gpt2_with_lora_tokenizer/vocab.json',
 'gpt2_with_lora_tokenizer/merges.txt',
 'gpt2_with_lora_tokenizer/added_tokens.json')

Explanation:
-Imports: Import necessary libraries for model handling, dataset loading, and training.

-LoRA Layer: Define a custom layer that adds low-rank adaptation to the model.

-Model Modification: Modify GPT-2 to include the LoRA layers.

-Load Model: Load the pre-trained GPT-2 model and set the padding token.

-Optimizer and Loss: Set up the optimizer and loss function for training.

-Dataset Loading: Load and sample the WikiText-103 dataset for quick testing.

-Dataset Preparation: Prepare the dataset and data loader with padding handled by a data collator.

-Training Loop: Train the model and print the loss for each epoch.

-Save Model: Save the fine-tuned model and tokenizer for later use.
