**LLaMA**

In [4]:
TRAIN_PATH = '../data/'
# TEST_PATH = '/home/patrick/Documents/CSE_599/HW/2/testset/test.jsonl'
# VAL_PATH = '/home/patrick/Documents/CSE_599/HW/2/valset/val.jsonl'
MODEL_PATH = '/Users/anderson/Desktop/Project/LLaMA-From-Inference-to-Training/' #folder with generation.py, model.py, and tokenizer.py
TRAINED_SPM_PATH = './tokenizer.model' #downloaded from Ed post

**Init**

In [5]:
import torch

In [6]:
import sys
sys.path.append(MODEL_PATH)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
from generation import LLaMA
from llama.model import ModelArgs, Transformer #ctrl+f and comment out cuda, all else same
#from model import ModelArgs, Transformer #use this one if you have NVIDIA GPU
from tokenizer import Tokenizer

**Data**

In [7]:
import json
from torch.utils.data import Dataset, DataLoader

Ingestion

In [8]:
def make_data_list(filepath:str, maxiter:int):
    '''ingests JSON into list (with tripwire parameter to prevent computer from crashing)'''
    data = []
    with open(filepath, 'r') as f:
        for i, line in enumerate(f):
            if i >= maxiter:
                break
            data.append(json.loads(line))
    return data

Data Model

In [9]:
from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.inputs = []
        self.targets = []

        for text in texts:
            encodings = tokenizer.encode(text, bos=True, eos=True)

            # TODO: Why this? 
            #takes all but the last token as input and all but the first token as target
            self.inputs.append(torch.tensor(encodings[:-1], dtype=torch.long))
            self.targets.append(torch.tensor(encodings[1:], dtype=torch.long))

    def __getitem__(self, idx):
        return {"input_ids": self.inputs[idx],
                "target_ids": self.targets[idx]}

    def __len__(self):
        return len(self.inputs)

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    target_ids = [item['target_ids'] for item in batch]
    
    max_seq_len = 2048 #Truncate sequences
    input_ids = [ids[:max_seq_len] for ids in input_ids]
    target_ids = [ids[:max_seq_len] for ids in target_ids]
    
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0) #Add padding
    target_ids = pad_sequence(target_ids, batch_first=True, padding_value=0)
    return {'input_ids': input_ids, 'target_ids': target_ids}

# train_data = make_data_list(TRAIN_PATH, 10)
# test_data = make_data_list(TEST_PATH, 2)
# val_data = make_data_list(VAL_PATH, 1)

# def extract_texts(data_list):
#     '''gets rid of the metadata'''
#     return [item['text'] for item in data_list]

def train_test_split(file_path, train_n, valid_n, test_n): 
    with open(file_path, "r") as file: 
        train_texts = []
        val_texts = []
        test_texts = []
        for idx, line in enumerate(file.readlines()): 
            if idx < train_n: 
                train_texts.append(line)
            elif idx < valid_n: 
                val_texts.append(line)
            elif idx < test_n: 
                test_texts.append(line) 
            else: 
                break 
    return train_texts, val_texts, test_texts

train_texts, val_texts, test_texts = train_test_split("../data/11.txt", 20, 10, 10)

Processed Data

In [10]:
tokenizer = Tokenizer(TRAINED_SPM_PATH)
train_dataset = TextDataset(train_texts, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

**Training**

Configure environment for CPU

In [11]:
import torch.distributed as dist
import fairscale.nn.model_parallel.initialize as fs_init

%env RANK=0
%env WORLD_SIZE=1
%env MASTER_ADDR=localhost
%env MASTER_PORT=0

torch.distributed.init_process_group(backend='gloo')
fs_init.initialize_model_parallel(1) #1 worker

env: RANK=0
env: WORLD_SIZE=1
env: MASTER_ADDR=localhost
env: MASTER_PORT=0
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


Instantiate model

In [12]:
from model import ModelArgs, Transformer

model_args = ModelArgs(
    dim=512,
    n_layers=8,
    n_heads=8,
    vocab_size=tokenizer.n_words,
    multiple_of=256,
    norm_eps=1e-5,
    max_batch_size=32,
    max_seq_len=2048
)

model = Transformer(model_args)
optimizer = torch.optim.AdamW(model.parameters())
# TODO: How does the tokenizer have the pad_id? Isn't the padding coming from the collate_fn
loss_function = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_id)  #ignores padding token (0) for loss calculation

Training loop

In [13]:
def examine_tensor(tensor):
    '''debugging function'''
    print(tensor)
    print("Type:", tensor.type())
    print("Data Type:", tensor.dtype)
    print("Shape:", tensor.shape)
    print("Size:", tensor.size())
    print("Number of Dimensions:", tensor.ndim)
    print("Device:", tensor.device)
    print("Requires Grad:", tensor.requires_grad)
    print("Gradient:", tensor.grad)
    return

In [15]:
def train(model, dataloader, optimizer, loss_function, num_epochs):
    '''the training loop'''
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()  # reset the gradients
            input_ids = batch['input_ids']
            target_ids = batch['target_ids']
            # examine_tensor(input_ids)
            examine_tensor(target_ids)
            outputs = model(input_ids, start_pos=0) #forward pass
            # examine_tensor(outputs)
            loss = loss_function(outputs.view(-1, outputs.size(-1)), target_ids.view(-1))
            loss.backward() #backward pass
            optimizer.step()
            total_loss += loss.item()
            
        print("Epoch: {}, Loss: {:.4f}".format(epoch, total_loss / len(dataloader)))
        
num_epochs = 1
train(model, train_dataloader, optimizer, loss_function, num_epochs)

tensor([[  512,  2058,   310,  ...,     0,     0,     0],
        [  739,   338,   553,  ...,     0,     0,     0],
        [29301,   310,  6615,  ...,     0,     0,     0],
        ...,
        [  379,  2121,   517,  ...,     0,     0,     0],
        [29871,    13,     2,  ...,     0,     0,     0],
        [ 1346,  4806,  1073,  ...,     0,     0,     0]])
Type: torch.LongTensor
Data Type: torch.int64
Shape: torch.Size([20, 342])
Size: torch.Size([20, 342])
Number of Dimensions: 2
Device: cpu
Requires Grad: False
Gradient: None
h shape = torch.Size([20, 342, 512])


ValueError: Expected input batch_size (20) to match target batch_size (6840).

In [None]:
torch.__version__

'2.0.1'