<a href="https://colab.research.google.com/github/Debangshu93/LLama-Ensemble/blob/main/Optimized_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import math

In [None]:
# Global hyperparameters read by the model classes and the data pipeline below.
config = {
    "vocab_size": 32000,        # rows of the token embedding table in Transformer
    'batch_size': 16,           # number of examples sampled per batch in get_batches
    'context_window': 1024,     # encode() pads every prompt to this many tokens
    'd_model': 4096,            # embedding / hidden width
    'n_heads' : 32,             # attention heads; head_dim = d_model // n_heads
    'ensembles' : 50,           # number of LoRA adapters created by each enabled Ensemble
    'multiple_of' : 256,        # FeedForward hidden size is rounded up to a multiple of this
    'n_layers' : 25,            # number of EncoderBlock layers stacked in Transformer
    'device' : 'cuda'           # device for the RoPE table and the encoded token tensors
}


In [None]:
def precompute_theta_pos_frequencies(head_dim: int, seq_len: int, device: str, theta: float = 10000.0):
    """Precompute the complex rotary-embedding factors for every position.

    Args:
        head_dim: Size of one attention head; must be even because channels
            are rotated in pairs.
        seq_len: Number of positions to generate factors for.
        device: Device on which the table is built.
        theta: Base of the geometric frequency progression.

    Returns:
        Complex tensor of shape ``(seq_len, head_dim // 2)`` whose entry
        ``[m, i]`` is the unit complex number with angle
        ``m / theta ** (2 * i / head_dim)``.
    """
    assert head_dim % 2 == 0, "Dimension must be divisible by 2"
    # One exponent per channel pair: 0/head_dim, 2/head_dim, 4/head_dim, ...
    pair_exponents = torch.arange(0, head_dim, 2).float() / head_dim
    inverse_freqs = 1.0 / (theta ** pair_exponents).to(device)
    positions = torch.arange(seq_len, device=device)
    # Outer product gives the rotation angle for each (position, pair).
    angles = torch.outer(positions, inverse_freqs).float()
    unit_radius = torch.ones_like(angles)
    return torch.polar(unit_radius, angles)

def apply_rotary_embeddings(x: torch.Tensor, freqs_complex: torch.Tensor, device: str):
    """Rotate consecutive channel pairs of ``x`` by the given complex factors.

    The last axis of ``x`` is grouped into (real, imag) pairs, multiplied by
    ``freqs_complex`` and unpacked back to the original shape.

    Args:
        x: Tensor whose last dimension is even. The broadcasting below places
            the first axis of ``freqs_complex`` on axis 2 of ``x`` (callers in
            this file pass ``(batch, ensembles, seq, heads, head_dim)``).
        freqs_complex: Complex rotation factors, e.g. ``(seq, head_dim // 2)``.
        device: Device the result is moved to.

    Returns:
        Tensor with the shape and dtype of ``x``, located on ``device``.
    """
    target_shape = x.shape
    # Pair up neighbouring channels so each pair becomes one complex number.
    channel_pairs = x.float().reshape(*target_shape[:-1], -1, 2)
    as_complex = torch.view_as_complex(channel_pairs)
    # Insert singleton axes at positions 0, 1 and 3 so the factors broadcast.
    rotation = freqs_complex[None, None, :, None]
    rotated = torch.view_as_real(as_complex * rotation).reshape(*target_shape)
    return rotated.type_as(x).to(device)

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head ``n_rep`` times along the head axis.

    Args:
        x: Tensor of shape ``(batch, ensembles, seq_len, n_kv_heads, head_dim)``.
        n_rep: Number of copies of each head; ``1`` returns ``x`` itself.

    Returns:
        Tensor of shape ``(batch, ensembles, seq_len, n_kv_heads * n_rep, head_dim)``
        in which the copies of one head are adjacent.
    """
    batch_size, ensembles, seq_len, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    # Add a repeat axis after the head axis, broadcast it, then fold it back in.
    with_repeat_axis = x.unsqueeze(4).expand(
        batch_size, ensembles, seq_len, n_kv_heads, n_rep, head_dim
    )
    return with_repeat_axis.reshape(
        batch_size, ensembles, seq_len, n_kv_heads * n_rep, head_dim
    )

In [None]:
class LoraLayer(nn.Module):
    """One low-rank adapter that adds ``scale * (B @ A)`` to a weight matrix.

    Args:
        features_in: Row count of ``lora_B``.
        features_out: Column count of ``lora_A``.
        name: Label stored on the adapter.
        enabled: When false, ``forward`` returns the weights untouched.
        rank: Inner dimension of the low-rank product.
        alpha: Numerator of the scale factor ``alpha / rank``.
    """

    def __init__(self, features_in, features_out, name, enabled, rank = 1, alpha = 1):
        super().__init__()

        # A is drawn from N(0, 1) while B stays at zero, so the initial
        # product B @ A is zero and the wrapped weights start unchanged.
        self.lora_A = nn.Parameter(torch.zeros((rank, features_out)))
        self.lora_B = nn.Parameter(torch.zeros((features_in, rank)))
        nn.init.normal_(self.lora_A, mean = 0, std = 1)

        self.name = name
        self.enabled = enabled
        self.scale = alpha/rank

    def forward(self, original_weights):
        """Return ``original_weights`` plus the scaled low-rank update."""
        if not self.enabled:
            return original_weights
        low_rank_update = self.lora_B @ self.lora_A  # (features_in, features_out)
        # NOTE(review): the update is reinterpreted with .view rather than
        # transposed; the two coincide only when one of the dimensions is 1 --
        # confirm this is intended for non-degenerate shapes.
        reshaped_update = low_rank_update.view(original_weights.shape)
        return original_weights + reshaped_update * self.scale

class Ensemble(nn.Module):
    """A bank of LoRA adapters that all wrap the same weight matrix.

    The number of adapters is read from the module-level ``config['ensembles']``.
    When ``enabled`` is false no adapters are created and ``forward`` only adds
    a trailing singleton axis.
    """

    def __init__(self, features_in, features_out, enabled):
        super().__init__()
        self.num_ensembles = config['ensembles']
        self.enabled = enabled

        members = []
        if enabled:
            members = [
                LoraLayer(features_in, features_out, "adapter_"+str(index), enabled)
                for index in range(self.num_ensembles)
            ]
        else:
            self.adapter = None
        self.adapters = nn.ModuleList(members)

    def forward(self, original_weights):
        """Stack every adapter's adapted weights along a new axis 2.

        Returns a three dimensional tensor: ``original_weights`` with a
        trailing axis of length ``num_ensembles`` (or 1 when disabled).
        """
        if not self.enabled:
            return original_weights.unsqueeze(2)
        adapted = []
        for index in range(self.num_ensembles):
            adapted.append(self.adapters[index](original_weights).unsqueeze(2))
        return torch.concatenate(adapted, dim = 2)

class Linear(nn.Module):
    """Linear projection whose weight is expanded by an ``Ensemble`` of adapters.

    The base weight has shape ``(features_out, features_in)``. On every forward
    pass the ensemble turns it into ``(features_out, features_in, E)``, which is
    transposed to ``(E, features_in, features_out)`` and batch-multiplied with
    the input.
    """

    def __init__(self, features_in, features_out, enabled, bias = True):
        super().__init__()

        self.weight = nn.Parameter(torch.zeros((features_out, features_in)))
        self.is_bias = bias
        # The bias (if any) is initialised before the weight; keep this order so
        # random draws are consumed in the same sequence.
        if bias:
            self.bias = nn.Parameter(torch.zeros(features_out,))
            nn.init.normal_(self.bias, mean = 0, std = 1)
        nn.init.normal_(self.weight, mean = 0, std = 1)
        self.adapters = Ensemble(features_in, features_out, enabled)

    def forward(self, x):
        """Project ``x`` with every ensemble member's weights.

        ``x`` is matrix-multiplied with weights of shape
        ``(E, features_in, features_out)``, so its last axis must be
        ``features_in`` and its leading axes must broadcast against ``E``.
        The expanded weights are kept on ``self.parallel_weights``.
        """
        self.parallel_weights = self.adapters(self.weight)
        per_member_weights = self.parallel_weights.transpose(0,2)
        projected = torch.matmul(x, per_member_weights)
        if not self.is_bias:
            return projected
        return projected + self.bias


class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain.

    Args:
        dim: Length of the learned per-channel gain vector.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor):
        """Scale ``x`` by the reciprocal RMS of its last axis."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: torch.Tensor):
        # Normalise in float32, cast back to the input dtype, then apply the gain.
        normalised = self._norm(x.float()).type_as(x)
        return self.weight * normalised


class SelfAttention(nn.Module):
    """Multi-head self-attention with rotary embeddings and a causal mask.

    Queries, keys and values are produced by ensemble-aware ``Linear`` layers,
    rotated with RoPE, and combined with scaled dot-product attention. Each
    position may only attend to itself and earlier positions, which is what
    decoder-only (Llama) weights expect when a whole sequence is processed at
    once without a key/value cache.

    Args:
        enabled: Whether the projection layers carry LoRA ensembles.
        config: Dict providing ``n_heads``, ``d_model``, ``ensembles`` and
            ``device``.
    """

    def __init__(self, enabled, config):
        super().__init__()

        self.n_kv_heads = config['n_heads']
        self.n_heads_q = config['n_heads']
        self.n_rep = self.n_heads_q // self.n_kv_heads
        self.head_dim = config['d_model'] // config['n_heads']
        self.ensembles = config['ensembles'] if enabled else 1
        self.device = config['device']

        self.wq = Linear(config['d_model'], config['n_heads'] * self.head_dim, enabled, bias=False)
        self.wk = Linear(config['d_model'], self.n_kv_heads * self.head_dim, enabled, bias=False)
        self.wv = Linear(config['d_model'], self.n_kv_heads * self.head_dim, enabled, bias=False)
        self.wo = Linear(config['n_heads'] * self.head_dim, config['d_model'], enabled, bias=False)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
        """Attend over ``x`` of shape ``(batch, ensembles, seq_len, d_model)``.

        Args:
            x: Input activations.
            start_pos: Unused here (there is no key/value cache); kept for
                interface compatibility.
            freqs_complex: Rotary factors for the ``seq_len`` positions.

        Returns:
            The output projection applied to the attended values.
        """
        batch_size, _, seq_len, _ = x.shape

        xq = self.wq(x)
        xk = self.wk(x)
        xv = self.wv(x)

        # Read the ensemble axis from the projection itself: it is the broadcast
        # of the input's ensemble axis and the layer's own, so this also works
        # when a non-ensembled layer receives an already-ensembled input.
        n_members = xq.shape[1]
        xq = xq.view(batch_size, n_members, seq_len, self.n_heads_q, self.head_dim)
        xk = xk.view(batch_size, n_members, seq_len, self.n_kv_heads, self.head_dim)
        xv = xv.view(batch_size, n_members, seq_len, self.n_kv_heads, self.head_dim)
        xq = apply_rotary_embeddings(xq, freqs_complex, device = x.device)
        xk = apply_rotary_embeddings(xk, freqs_complex, device = x.device)

        keys = repeat_kv(xk, self.n_rep)
        values = repeat_kv(xv, self.n_rep)

        # (batch, ensembles, seq, heads, dim) -> (batch, ensembles, heads, seq, dim)
        xq = xq.transpose(2, 3)
        keys = keys.transpose(2, 3)
        values = values.transpose(2, 3)

        scores = torch.matmul(xq, keys.transpose(3, 4)) / math.sqrt(self.head_dim)
        if seq_len > 1:
            # Block attention to future positions (strictly above the diagonal).
            # The diagonal stays unmasked, so no row is entirely -inf.
            future = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=scores.device),
                diagonal=1,
            )
            scores = scores.masked_fill(future, float("-inf"))
        # Softmax in float32 for numerical stability, then back to the query dtype.
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)

        output = torch.matmul(scores, values)
        output = output.transpose(2, 3).contiguous().view(batch_size, n_members, seq_len, -1)
        return self.wo(output)


class FeedForward(nn.Module):
    """Gated feed-forward block: ``w2(silu(w1(x)) * w3(x))``.

    The hidden width is two thirds of ``4 * d_model``, rounded up to the next
    multiple of ``config["multiple_of"]``.
    """

    def __init__(self, enabled, config):
        super().__init__()

        model_dim = config["d_model"]
        two_thirds = int(2 * (4 * model_dim) / 3)
        granularity = config["multiple_of"]
        # Ceil-divide, then scale back up to land on a multiple of `granularity`.
        hidden_dim = granularity * ((two_thirds + granularity - 1) // granularity)

        self.w1 = Linear(model_dim, hidden_dim, enabled, bias=False)
        self.w2 = Linear(hidden_dim, model_dim, enabled, bias=False)
        self.w3 = Linear(model_dim, hidden_dim, enabled, bias=False)

    def forward(self, x: torch.Tensor):
        gate = F.silu(self.w1(x))
        gated = gate * self.w3(x)
        return self.w2(gated)

class EncoderBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        config: Dict providing ``n_heads`` and ``d_model`` (plus whatever
            ``SelfAttention`` and ``FeedForward`` read).
        enabled: Forwarded to the sub-layers to switch their LoRA ensembles on.
    """

    def __init__(self, config, enabled):
        super().__init__()

        self.n_heads = config["n_heads"]
        self.dim = config["d_model"]
        self.head_dim = config["d_model"] // config["n_heads"]

        self.attention = SelfAttention(enabled, config)
        self.feed_forward = FeedForward(enabled, config)

        self.attention_norm = RMSNorm(config["d_model"])
        self.ffn_norm = RMSNorm(config["d_model"])

    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
        """Return ``h + ffn(norm(h))`` where ``h = x + attention(norm(x))``."""
        # Call the sub-modules themselves rather than their .forward methods so
        # that any registered forward hooks run.
        h = x + self.attention(self.attention_norm(x), start_pos, freqs_complex)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out


class Transformer(nn.Module):
    """Llama-style backbone with an ensemble reward head.

    Token embeddings pass through ``n_layers`` non-ensembled ``EncoderBlock``
    layers and a final ``RMSNorm``. The normalised activations of the whole
    sequence are flattened and projected to one scalar per ensemble member by
    ``output``, an ensemble-enabled ``Linear`` whose input width is
    ``d_model * context_window``.

    Args:
        config: Dict providing ``vocab_size``, ``n_layers``, ``d_model``,
            ``n_heads``, ``context_window`` and ``device``.
    """

    def __init__(self, config):
        super().__init__()
        self.vocab_size = config['vocab_size']
        self.n_layers = config['n_layers']
        self.tok_embeddings = nn.Embedding(config['vocab_size'], config['d_model'])

        self.layers = nn.ModuleList()
        for layer_id in range(config['n_layers']):
            self.layers.append(EncoderBlock(config, enabled = False))

        self.norm = RMSNorm(config['d_model'])
        self.output = Linear(config['d_model']*config['context_window'], 1, enabled = True, bias=False)

        # Plain attribute (not a parameter or buffer), so it is neither part of
        # the state dict nor moved by Module.to(); forward() moves the slice it
        # needs to the activation device.
        self.freqs_complex = precompute_theta_pos_frequencies(config['d_model'] // config['n_heads'], config['context_window'] * 2, device = config['device'])

    def forward(self, tokens: torch.Tensor):
        """Score a batch of token sequences.

        Args:
            tokens: Integer tensor of shape ``(batch, 1, seq_len)``. Because the
                reward head flattens the full sequence, ``seq_len`` has to
                equal the ``context_window`` the model was built with.

        Returns:
            Tensor of shape ``(batch, ensembles)`` with one reward per member.
        """
        start_pos = 0
        batch_size, _, seq_len = tokens.shape
        h = self.tok_embeddings(tokens)
        freqs_complex = self.freqs_complex[start_pos:start_pos + seq_len].to(h.device)

        for layer in self.layers:
            h = layer(h, start_pos, freqs_complex)
        h = self.norm(h)
        # (batch, 1, seq, d_model) -> (batch, 1, 1, seq * d_model)
        interim = torch.flatten(h, start_dim= 2, end_dim = -1).unsqueeze(2)
        # Reshape with the batch size of this input rather than the global
        # config, so batches of any size work.
        reward = self.output(interim).view(batch_size, -1)
        return reward


In [None]:
from llama.tokenizer import Tokenizer #Local File

In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset

In [None]:
# Load the "train" and "test" splits from the "helpful-base" data directory of
# the Anthropic/hh-rlhf dataset; later cells read its 'chosen' and 'rejected' columns.
train_dataset, test_dataset = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base", split=["train", "test"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/875k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Path to the tokenizer model file (a Google Drive location, so Drive is
# presumably mounted in this Colab session -- verify before running).
tokenizer_path='/content/drive/MyDrive/Llama/tokenizer.model'
# Tokenizer comes from the local llama.tokenizer module imported above.
tokenizer = Tokenizer(tokenizer_path)


In [None]:
def encode(prompts, tokenizer, config):
    """Tokenize ``prompts`` into a fixed-width, right-padded tensor of token ids.

    Args:
        prompts: Sequence of strings; may be empty.
        tokenizer: Object exposing ``encode(text, bos=..., eos=...)`` that
            returns a list of ints, and an ``eos_id`` attribute used as padding.
        config: Dict providing ``context_window`` (row width) and ``device``.

    Returns:
        ``torch.long`` tensor of shape ``(len(prompts), context_window)``.
        Shorter prompts are padded on the right with ``tokenizer.eos_id``.
        Prompts longer than the window are truncated to their last
        ``context_window`` tokens instead of being dropped, so no row is ever
        left as pure padding. (The tail is kept on the assumption that the end
        of the text is what distinguishes paired examples -- revisit if the
        beginning matters more for your data.)
    """
    total_len = config["context_window"]
    device = config["device"]
    pad_id = tokenizer.eos_id

    tokens = torch.full((len(prompts), total_len), pad_id, dtype=torch.long, device=device)
    for row, prompt in enumerate(prompts):
        ids = tokenizer.encode(prompt, bos=True, eos=True)
        if len(ids) > total_len:
            # Keep the trailing window; written this way so total_len == 0
            # yields an empty slice rather than the whole list.
            ids = ids[len(ids) - total_len:]
        tokens[row, : len(ids)] = torch.tensor(ids, dtype=torch.long, device=device)

    return tokens

In [None]:
def get_batches(data, tokenizer, config):
    """Sample a random batch of (chosen, rejected) pairs and tokenize both sides.

    Args:
        data: Dataset exposing ``num_rows`` and the columns ``'chosen'`` and
            ``'rejected'`` via ``data[column]``.
        tokenizer: Passed through to ``encode``.
        config: Dict providing ``batch_size`` plus the keys ``encode`` reads.

    Returns:
        Dict with ``"accept"`` and ``"reject"`` token tensors; row ``k`` of
        both comes from the same sampled dataset row.
    """
    # Row indices are drawn with replacement.
    indices = torch.randint(0, data.num_rows, (config['batch_size'],)).tolist()
    # Fetch each column once instead of once per sampled index; depending on
    # the dataset implementation a column access can materialise the whole column.
    chosen_column = data['chosen']
    rejected_column = data['rejected']
    y_1 = [chosen_column[i] for i in indices]
    y_2 = [rejected_column[i] for i in indices]
    tokens_1 = encode(y_1, tokenizer, config)
    tokens_2 = encode(y_2, tokenizer, config)
    return {"accept" : tokens_1,
            "reject" : tokens_2}

In [None]:
def compute_loss(model, inputs):
    """Pairwise preference loss between accepted and rejected sequences.

    Args:
        model: Callable mapping a ``(batch, 1, seq_len)`` token tensor to a
            ``(batch, ensembles)`` reward tensor.
        inputs: Dict with ``"accept"`` and ``"reject"`` token tensors of shape
            ``(batch, seq_len)``.

    Returns:
        Scalar tensor: ``-logsigmoid(r_accept - r_reject)`` summed over the
        ensemble axis and averaged over the batch.
    """
    # Run on whatever device the model lives on instead of assuming CUDA; fall
    # back to the inputs' device for models without parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = inputs["accept"].device

    rewards_accept = model(inputs["accept"].unsqueeze(1).to(device))
    rewards_reject = model(inputs["reject"].unsqueeze(1).to(device))
    margin = rewards_accept - rewards_reject
    loss = -nn.functional.logsigmoid(margin).sum(dim=1).mean(dim=0)

    return loss


In [None]:
@torch.no_grad()  # evaluation only: no gradients are recorded inside this function
def evaluate_loss(model, tokenizer, config):
    """Estimate the loss on the train and test splits.

    For each split, ten random batches are drawn with ``get_batches`` and their
    losses averaged. The model is put in eval mode for the duration and
    switched back to train mode before returning.

    Returns:
        Dict with keys ``"train"`` and ``"test"`` holding the mean loss.
    """
    model.eval()
    out = {}
    for split in ["train", "test"]:
        dataset = train_dataset if split == "train" else test_dataset
        losses = [
            compute_loss(model, get_batches(dataset, tokenizer, config)).item()
            for _ in range(10)
        ]
        out[split] = np.mean(losses)  # mean over the 10 sampled batches
    model.train()
    return out

In [None]:
from pathlib import Path

# Directory searched for ``*.pth`` checkpoint files; the first one in sorted
# order is loaded.
checkpoints_dir= 'llama-2-7b'
checkpoints = sorted(Path(checkpoints_dir).glob("*.pth"))
if not checkpoints:
    # Fail with an explicit message instead of an IndexError on checkpoints[0].
    raise FileNotFoundError(f"no .pth checkpoint files found in {checkpoints_dir!r}")
ckpt_path = checkpoints[0]
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints from a trusted source (or consider
# weights_only=True if the installed torch version supports it).
checkpoint = torch.load(ckpt_path, map_location='cpu')

In [None]:
# Build the model, then load the pretrained weights into it.
model = Transformer(config)
# Drop the checkpoint's own 'output.weight' so it is not assigned over this
# model's reward head (a Linear from d_model * context_window to 1); presumably
# the checkpoint tensor is a vocabulary projection of a different shape -- verify.
del checkpoint['output.weight']
# strict=False tolerates keys that exist on only one side (e.g. the LoRA
# adapters here, or checkpoint entries this model has no slot for);
# assign=True makes the checkpoint tensors themselves become the parameters.
model.load_state_dict(checkpoint, strict = False, assign = True)

model = model.to('cuda')
# The optimizer receives every parameter; which of them are trainable is
# decided by the requires_grad flags set in a later cell.
optimizer = torch.optim.Adam(model.parameters())

train_config = {
    'epochs' : 1000,        # optimizer steps run by train(); each uses one random batch
    'log_interval' : 10     # evaluate_loss is called every this many steps
}

In [None]:
import time
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Freeze every parameter whose qualified name does not contain 'output', so
# only the reward head (its base weight and LoRA adapters) stays trainable.
for param_name, parameter in model.named_parameters():
  if 'output' in param_name:
    continue
  parameter.requires_grad = False

# Cast all parameters, frozen or not, to float32.
for parameter in model.parameters():
    parameter.data = parameter.data.float()


In [None]:
def train(model, tokenizer, optimizer, config, train_config, print_logs=False):
    """Run the training loop and plot the periodically evaluated losses.

    Each "epoch" is a single optimizer step on one random batch drawn from
    ``train_dataset``. Every ``train_config['log_interval']`` steps (starting
    with step 0) the model is evaluated with ``evaluate_loss``.

    Args:
        model: Model to optimise.
        tokenizer: Passed through to the batching helpers.
        optimizer: Optimizer stepping the model's parameters.
        config: Model / data configuration dict.
        train_config: Dict providing ``epochs`` and ``log_interval``.
        print_logs: When true, print a progress line at every evaluation.

    Returns:
        The result of plotting the collected losses, or ``None`` when no
        evaluation ran (``epochs`` is 0).
    """
    total_steps = train_config['epochs']
    log_interval = train_config['log_interval']

    losses = []
    steps_since_log = 0
    start_time = time.time()
    for epoch in range(total_steps):
        optimizer.zero_grad()

        tokens = get_batches(train_dataset, tokenizer, config)
        loss = compute_loss(model, tokens)
        loss.backward()
        optimizer.step()
        steps_since_log += 1

        if epoch % log_interval == 0:
            batch_time = time.time() - start_time
            x = evaluate_loss(model, tokenizer, config)
            losses += [x]
            if print_logs:
                # Normalise by the steps actually timed: the first log covers a
                # single step, not a full interval. Evaluation time is not
                # included in the estimate.
                remaining_steps = total_steps - (epoch + 1)
                eta = batch_time / steps_since_log * remaining_steps
                print(f"Epoch {epoch} | val loss {x['test']:.3f} | Time {batch_time:.3f} | ETA in seconds {eta:.3f}")
            # Restart the timer after evaluation so only training steps are timed.
            start_time = time.time()
            steps_since_log = 0

    if not losses:
        # Nothing was evaluated, so there is nothing to report or plot.
        return None

    print("validation loss: ", losses[-1]['test'])
    return pd.DataFrame(losses).plot()




In [None]:
# Start training with progress logging; the lines below are the captured output.
train(model, tokenizer, optimizer, config, train_config, print_logs = True)

Epoch 0 | val loss 773.744 | Time 2.673 | ETA in seconds 133.641
Epoch 10 | val loss 700.034 | Time 29.865 | ETA in seconds 1463.398
Epoch 20 | val loss 689.889 | Time 29.110 | ETA in seconds 1397.266
Epoch 30 | val loss 617.156 | Time 29.972 | ETA in seconds 1408.661
