In [1]:
import re
import sys
import torch
from torch.utils.data import Dataset,DataLoader
import pickle
import os
import time 
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import warnings
import gc
from accelerate import Accelerator
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.simplefilter('ignore')
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM,AutoConfig
from tqdm import tqdm
from torch.amp import custom_fwd, custom_bwd


In [2]:
 !pip install -q -U triton --no-index --find-links ../input/triton-3-0-0/triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl

In [3]:
@torch.compile(fullgraph=True)
def rms_forward(x, weight, eps):
    # Calculate the RMS
    rms = x.float().pow(2).mean(-1, keepdim=True).add(eps).sqrt()
    # Normalize
    x_norm = x / rms
    # Apply the gain (weight)
    output = weight * x_norm.type_as(weight)
    return output

@torch.compile(fullgraph=True)
def rms_backward(grad_output, x, weight,eps):
    
    rms = x.float().pow(2).mean(-1, keepdim=True).add(eps).sqrt()
    # Normalize
    x_norm = x / rms
    # Gradients calculations
    grad_weight = torch.sum(grad_output * x_norm, dim=(0))

    # Compute grad_x: we need to backpropagate through normalization and RMS
    grad_x_norm = grad_output * weight  # Gradients w.r.t normalized input
    grad_rms = (grad_x_norm * (-x_norm)).sum(
        -1, keepdim=True
    ) / rms  # Gradients w.r.t RMS

    # Gradient w.r.t input x: the gradient of x involves the gradient of the normalization (grad_x_norm)
    grad_x = grad_x_norm / rms + grad_rms * x_norm / x_norm.size(
        -1
    )  # RMS backpropagation

    return grad_x, grad_weight
    
class RMSNormFn(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type='cuda')
    def forward(ctx, x, weight, eps):
        output = rms_forward(x, weight, eps)
        ctx.save_for_backward(x, weight)
        ctx.eps = eps
        return output
        
    @staticmethod
    @custom_bwd(device_type='cuda')
    def backward(ctx, grad_output):
        x, weight = ctx.saved_tensors
        eps = ctx.eps
        grad_x, grad_weight = rms_backward(grad_output,x, weight,eps)

        return grad_x, grad_weight, None


class RMSNorm(torch.nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        return RMSNormFn.apply(x, self.weight, self.eps)


In [4]:
@torch.compile(fullgraph=True)
def gelu_bwd(x):
    cdf = 0.5 * (1.0 + torch.erf(x / 2**0.5))
    pdf = torch.exp(-0.5 * x**2) / (2 * 3.14159265359) ** 0.5
    return (cdf + x * pdf)

@torch.compile(fullgraph=True)
def gelu_fwd(x):
    cdf = 0.5 * (1.0 + torch.erf(x / 2**0.5))
    return x * cdf
    
@torch.compile(fullgraph=True)
def ffn_gelu_fwd(x,weight1, bias1, weight2, bias2=None):

    output1 = x.mm(weight1.t())

    if bias1 is not None:
        output1 += bias1.unsqueeze(0).expand_as(output1)

    gelu_output = gelu_fwd(output1)  # output.clamp(min=0)  # ReLU activation
    output2 = gelu_output.mm(weight2.t())

    if bias2 is not None:
        output2 += bias2.unsqueeze(0).expand_as(output2)

    return output2,output1
    
@torch.compile(fullgraph=True)
def ffn_gelu_bwd(grad_output,x,weight1, bias1, weight2, bias2,gelu_input):
    grad_bias1 = grad_bias2 = None

    grad_gelu = gelu_bwd(gelu_input)

    grad_x2 = grad_output.mm(weight2) * grad_gelu
    grad_weight2 = grad_output.T.mm(gelu_input)

    if bias2 is not None:
        grad_bias2 = grad_output.sum(0)

    grad_x1 = grad_x2.mm(weight1)
    grad_weight1 = grad_x2.T.mm(x)

    if bias1 is not None:
        grad_bias1 = grad_x2.sum(0)

    return grad_x1, grad_weight1, grad_bias1, grad_weight2, grad_bias2

class GELUFunction(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type='cuda')
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return gelu_fwd(x)

    @staticmethod
    @custom_bwd(device_type='cuda')
    def backward(ctx, grad_output):
        x = ctx.saved_tensors
        return grad_output *  gelu_bwd(x)


class GELU(torch.nn.Module):
    def forward(self, x):
        return GELUFunction.apply(x)
        
class FFNGeLU(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type='cuda')
    def forward(ctx, x, weight1, bias1, weight2, bias2=None):
        output2,output1 = ffn_gelu_fwd(x,weight1, bias1, weight2, bias2)
        ctx.save_for_backward(x, weight1, bias1, weight2, bias2, output1)
        return output2

    @staticmethod
    @custom_bwd(device_type='cuda')
    def backward(ctx, grad_output):
        x, weight1, bias1, weight2, bias2, gelu_input = ctx.saved_tensors
        grad_x1, grad_weight1, grad_bias1, grad_weight2, grad_bias2 = ffn_gelu_bwd(grad_output,x,weight1, bias1, weight2, bias2,gelu_input)

        return grad_x1, grad_weight1, grad_bias1, grad_weight2, grad_bias2


class FFNGeluModule(torch.nn.Module):
    def __init__(self, in_features, mid_feature, out_features):
        super(FFNGeluModule, self).__init__()
        self.weight1 = torch.nn.Parameter(
            torch.Tensor(mid_feature, in_features)
        )
        self.bias1 = torch.nn.Parameter(torch.Tensor(mid_feature))

        self.weight2 = torch.nn.Parameter(
            torch.Tensor(out_features, mid_feature)
        )
        self.bias2 = torch.nn.Parameter(torch.Tensor(out_features))

        torch.nn.init.xavier_uniform_(self.weight1)
        torch.nn.init.zeros_(self.bias1)
        torch.nn.init.xavier_uniform_(self.weight2)
        torch.nn.init.zeros_(self.bias2)

    def forward(self, input):
        return FFNGeLU.apply(input, self.weight1, self.bias1, self.weight2, self.bias2)


In [5]:
@torch.compile(fullgraph=True)
def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

@torch.compile(fullgraph=True)
def rope_fwd(q, k, freqs):
        # q, k: (batch_size, num_heads, seq_len, head_dim)
        # cos, sin: (seq_len, head_dim) or (1, seq_len, head_dim) or (batch_size, num_heads, seq_len, head_dim)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos()
        sin = emb.sin()
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)
        # forward Method
        #qleft,qright = q1,q2
        #qleft out = q1*cos-q2*sin
        #qright out = q2*cos+q1*sin
        #final out1 = concat(qleft out, qright out)
        #final out2 = concat(q1,q2)*cos+concat(-q2,q1)*sin
        #final out1 and final out2 both are same
        q_rotated = (q * cos) + (rotate_half(q) * sin)
        k_rotated = (k * cos) + (rotate_half(k) * sin)

        return q_rotated, k_rotated, cos, sin

@torch.compile(fullgraph=True)
def rope_bwd(grad_q_rotated, grad_k_rotated, cos, sin):
        # backward Method
        #qleft,qright = q1,q2
        # y=(x1*cos+ x2*cos)+(−x2*sin + x1*sin)
        # y = x1*(cos+sin)+x2(cos-sin)
        # dy/dx1 = cos+sin
        # dy/dx1 = cos-sin
        #do/dq qleft out = out_grad*(cos+sin)
        #do/dq right out = out_grad*(cos-sin)
        #do/dq final out1 = concat(do/dq qleft, do/dq right out)
        #do/dq final out2 = concat(q1,q2)*cos-concat(-q2,q1)*sin
        #final out1 and final out2 both are same

        grad_q = (grad_q_rotated * cos) - (
            rotate_half(grad_q_rotated) * sin
        )
        grad_k = (grad_k_rotated * cos) - (
           rotate_half(grad_k_rotated) * sin
        )

        #if freq is a parameter  we will need to calculate its grad as well

        # grad_cos_q = torch.sum(grad_q_rotated * q, dim=(0, 1, 2), keepdim=True)
        # grad_sin_q = torch.sum(
        #     grad_q_rotated * RotaryEmbeddingFunction.rotate_half(q),
        #     dim=(0, 1, 2),
        #     keepdim=True,
        # )

        # grad_cos_k = torch.sum(grad_k_rotated * k, dim=(0, 1, 2), keepdim=True)
        # grad_sin_k = torch.sum(
        #     grad_k_rotated * RotaryEmbeddingFunction.rotate_half(k),
        #     dim=(0, 1, 2),
        #     keepdim=True,
        # )

        # grad_cos = grad_cos_q + grad_cos_k
        # grad_sin = grad_sin_q + grad_sin_k

        return grad_q, grad_k
    
class RotaryEmbeddingFunction(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type='cuda',cast_inputs=torch.float32)
    def forward(ctx, q, k, freqs):
        q_rotated, k_rotated, cos, sin = rope_fwd(q, k, freqs)

        ctx.save_for_backward(cos, sin)
        return q_rotated, k_rotated

    @staticmethod
    @custom_bwd(device_type='cuda')
    def backward(ctx, grad_q_rotated, grad_k_rotated):
        cos, sin = ctx.saved_tensors
        grad_q, grad_k = rope_bwd(grad_q_rotated, grad_k_rotated, cos, sin)

        return grad_q, grad_k, None


class RotaryEmbedding(nn.Module):
    """
    RotaryEmbedding is a PyTorch module that implements rotary positional embeddings for attention mechanisms.
    Args:
        config (object): Configuration object containing the following attributes:
            hidden_size (int): The hidden size of the model.
            num_attention_heads (int): The number of attention heads.
    Attributes:
        inv_freq (torch.Tensor): A tensor containing the inverse frequencies for the rotary embeddings.
    Methods:
        forward(seq_len):
            Computes the rotary positional embeddings for a given sequence length.
            Args:
                seq_len (int): The length of the input sequence.
            Returns:
                torch.Tensor: A tensor containing the rotary positional embeddings with shape (1, seq_len, dim).
    """

    def __init__(self,config):
        super().__init__()
        dim = int(config.hidden_size // config.num_attention_heads)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, seq_len):
        t = torch.arange(seq_len, device=self.inv_freq.device).type_as(self.inv_freq)
        freqs = torch.einsum("i, j -> i j", t, self.inv_freq)

        return freqs[None, :, :]


In [6]:
from einops import rearrange, reduce
from typing import Optional, Tuple,Union,List
from dataclasses import dataclass

In [7]:
class LinearRms(torch.autograd.Function):
    @staticmethod
    # @custom_fwd
    def forward(ctx, x, prev, weight, bias=None, weight_rms=None, eps=None):
        output = x.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        ctx.save_for_backward(x, weight, bias, output + prev)
        output = rms_forward(output + prev, weight_rms, eps)
        ctx.save_for_backward(
            x, weight, bias, output + prev, weight_rms
        )
        ctx.eps = eps
        return output

    @staticmethod
    # @custom_bwd
    def backward(ctx, grad_output):
        x, weight, bias, rms_x, weight_rms = ctx.saved_tensors
        eps = ctx.eps
        grad_weight = grad_bias = None

        grad_rms, grad_weight_rms = rms_backward(
            grad_output, rms_x, weight_rms,eps
        )  

        grad_input = grad_rms.mm(weight)
        grad_prev = grad_rms

        grad_weight = grad_rms.t().mm(x)
        grad_bias = None
        if bias is not None:
            grad_bias = grad_rms.sum(0)

        return grad_input, grad_prev, grad_weight, grad_bias, grad_weight_rms, None

class LinearRMSFused(torch.nn.Module):

    def __init__(self, in_features, out_features, eps=1e-6):
        super(LinearRMSFused, self).__init__()
        self.weight = torch.nn.Parameter(torch.Tensor(out_features, in_features))
        self.bias = torch.nn.Parameter(torch.Tensor(out_features))
        torch.nn.init.xavier_uniform_(self.weight)
        torch.nn.init.zeros_(self.bias)
        self.weight_rms = torch.nn.Parameter(torch.ones(out_features))
        self.eps = eps

    def forward(self, input, prev):
        return LinearRms.apply(
            input, prev, self.weight, self.bias, self.weight_rms, self.eps
        )


In [8]:
@torch.compile(fullgraph=True)
def attention_forward(Q, K, V, mask=None):
    d_k = Q.shape[-1]
    scale = 1.0 / torch.sqrt(torch.tensor(d_k, dtype=Q.dtype, device=Q.device))  # avoid NumPy
    EPSILON = 1e-10

    Q_scaled = Q * scale
    S = torch.einsum("... i d, ... j d -> ... i j", Q_scaled, K)

    if mask is not None:
        S = S + mask

    softmax = F.softmax(S, dim=-1)
    P_V = torch.einsum("... i j, ... j d -> ... i d", softmax, V)

    return P_V, softmax
    
@torch.compile(fullgraph=True)
def attention_backward(Q, K, V, O, dO, softmax):
    scale = 1.0 / torch.sqrt(torch.tensor(Q.shape[-1], dtype=Q.dtype, device=Q.device))  # avoid NumPy


    P = softmax  # (1 / l) * torch.exp(S - m)
   

    dV = torch.einsum("... r c, ... r d -> ... c d", P, dO)
    dP = torch.einsum("... r d, ... c d -> ... r c", dO, V)

    D = torch.sum(dO * O, dim=-1, keepdim=True)
    dS = P * (dP - D)

    dQ = scale * torch.einsum("... r c, ... c d -> ... r d", dS, K)
    dK = scale * torch.einsum("... r c, ... r d -> ... c d", dS, Q)
    return dQ, dK, dV


class ScaledDotProductAttention(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type='cuda')
    def forward(ctx, q, k, v, mask=None):
        out, attn_weights = attention_forward(q,k,v, mask)
        ctx.save_for_backward(q, k, v, mask,out, attn_weights)
        return out

    @staticmethod
    @custom_bwd(device_type='cuda')
    def backward(ctx, do):
        Q, K, V, mask, O, softmax = ctx.saved_tensors
        dq, dk, dv = attention_backward(Q, K, V,O, do, softmax)
        return dq, dk, dv, None


In [9]:

class DecoderAttention(nn.Module):
    def __init__(self, config, layer_idx: int) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.head_size = int(config.hidden_size // config.num_attention_heads)
        self.attention_bias = getattr(config, "attention_bias", True)
        self.layer_idx = layer_idx
        # self.qkv = nn.Linear(config.hidden_size,3*config.hidden_size)
        self.query = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.key = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.value = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.out = LinearRMSFused(config.hidden_size,config.hidden_size)
        self.num_attention_heads = config.num_attention_heads
        self.apply_rotary_pos_emb =  RotaryEmbeddingFunction.apply
        self.dot_attn = ScaledDotProductAttention.apply
        

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        kv_cache: List[torch.FloatTensor] = None,
        start_pos: Optional[int] = 0,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states: torch.Tensor of shape (batch, seq_len, embed_dim)`
            Attention_mask: torch.Tensor of shape (batch,1, seq_len, seqlen)`
            freqs: Positional freqs in case of RoPE embedding
            use_cace: Optional to use kvCache
            start_pos: in case of kvCache to get store kv-cache at start_pos
        return:
               hidden_states: torch.Tensor of shape (batch, seq_len, embed_dim)

        """
        q = self.query(hidden_state)
        k = self.key(hidden_state)
        v = self.value(hidden_state)
        # transform it into batch_size x no_of_heads x seqlen x head_dim for Multihead Attention
        q = rearrange(q, "b l (h d) -> b h l d", h=self.num_attention_heads)
        k = rearrange(k, "b l (h d) -> b h l d", h=self.num_attention_heads)
        v = rearrange(v, "b l (h d) -> b h l d", h=self.num_attention_heads)

        if freqs is not None:
            q, k = self.apply_rotary_pos_emb(q, k, freqs)  # apply RoPE if freqs is available

        if use_cache:
            if kv_cache is None:
                raise ValueError("you need to pass kv_cache")
            k, v = kv_cache.update(self.layer_idx, k, v, start_pos)

        # out = torch.nn.functional.scaled_dot_product_attention(
        #     query=q, key=k, value=v, attn_mask=attention_mask
        # )
        out = self.dot_attn(q,k,v,attention_mask)
        # transform it back into batch_size x seqlen x hidden_dim
        out = rearrange(out, "b h l d -> b l (h d)")
        b,l,d = out.size()
        
        out = out.view(-1,d).contiguous()
        hidden_state = hidden_state.view(-1,d).contiguous()
        
        out = self.out(out,hidden_state)
        
        return out.view(b,l,d).contiguous(), kv_cache

In [10]:
# mainly 2way to do one keep it into the model init like llama https://github.com/meta-llama/llama/blob/main/llama/model.py
# every attention layer have its own kv-cache storage
# or keep all attention layer kv-cache into single storage like Huggingface Transformer


from dataclasses import dataclass
from typing import Any, Dict, Generator, List, Optional, Tuple
import torch


class DynamicCache:
    """
    A cache that grows dynamically as more tokens are generated.

    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
    `[batch_size, num_heads, seq_len, head_dim]`.
    """

    def __init__(self, config, is_gqa: bool = False) -> None:
        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []
        self._seen_tokens = False

        self.layers = config.num_hidden_layers
        for _ in range(self.layers):
            self.key_cache.append([])
            self.value_cache.append([])

    def __len__(self) -> int:
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        """
        if len(self.key_cache) == 0:
            return 0
        return self.key_cache[0].shape[-2]

    def update(
        self,
        index: int,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        start_pos: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        """

        # Update the cache first iteration'

        if len(self.key_cache[index]) == 0:
            self._seen_tokens = True
            self.key_cache[index] = key_states.clone()
            self.value_cache[index] = value_states.clone()
        else:
            self.key_cache[index] = torch.cat(
                [self.key_cache[index], key_states], dim=-2
            )
            self.value_cache[index] = torch.cat(
                [self.value_cache[index], value_states], dim=-2
            )

        return self.key_cache[index], self.value_cache[index]

    def get(self, index: int) -> Tuple[torch.Tensor]:
        if self._seen_tokens:
            return self.key_cache[index], self.value_cache[index]
        else:
            raise ValueError("there is no token available in kv-cache")

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        if self.key_cache is None:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
        return self.max_cache_len


class StaticCache:
    """
    A cache that grows dynamically as more tokens are generated.

    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
    `[batch_size, num_heads, seq_len, head_dim]`.
    """

    def __init__(
        self,
        config,
        max_cache_len: int = None,
        dtype: torch.dtype = torch.float32,
        batch_size: int = 1,
        is_gqa: bool = False,
    ) -> None:
        self.head_size = int(config.hidden_size // config.num_attention_heads)
        self.heads = None
        self.batch_size = batch_size
        # if is_gqa:
        self.heads = getattr(config, "num_key_value_heads", None)
        # if self.heads is None:
        #     raise ValueError(
        #         "you are using is_gqa=True and config.num_key_value_heads is not available"
        #     )
        if self.heads is None:

            self.heads = config.num_attention_heads

        self.max_cache_len = (
            config.max_position_embeddings if max_cache_len is None else max_cache_len
        )

        self.dtype = dtype

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.key_cache: List[torch.Tensor] = []
        self.value_cache: List[torch.Tensor] = []

        self.cache_shape = (
            self.batch_size,
            self.heads,
            self.max_cache_len,
            self.head_size,
        )

        self._seen_tokens = False
        self.layers = config.num_hidden_layers
        for _ in range(self.layers):
            blank_key_cache = torch.zeros(
                self.cache_shape, dtype=self.dtype, device=self.device
            )
            blank_value_cache = torch.zeros(
                self.cache_shape, dtype=self.dtype, device=self.device
            )
            self.key_cache.append(blank_key_cache)
            self.value_cache.append(blank_value_cache)

    def __len__(self) -> int:
        if self.key_cache is None:
            return 0
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        """
        return self.key_cache.shape[-2]

    def update(
        self,
        index: int,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        start_pos: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        """

        # Update the cache first iteration'

        bsz, head, seqlen, _ = key_states.shape
        if seqlen > self.key_cache[index].size()[2]:
            raise ValueError(
                f"{k.shape} is more than init k_cache size {self.key_cache}"
            )

        self.key_cache[index][:bsz, :, start_pos : start_pos + seqlen] = key_states
        self.value_cache[index][:bsz, :, start_pos : start_pos + seqlen] = value_states

        k = self.key_cache[index][:bsz, :, : start_pos + seqlen]
        v = self.value_cache[index][:bsz, :, : start_pos + seqlen]

        return k, v

    def get(self, index: int) -> Tuple[torch.Tensor]:
        if self._seen_tokens:
            return self.key_cache[index], self.value_cache[index]
        else:
            raise ValueError("there is no token available in kv-cache")

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        if self.key_cache is None:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
        return None


In [11]:
@dataclass
class DecoderOutput(object):
    logits: torch.Tensor
    past_key_value: Optional[object]


@dataclass
class CLMOutput(object):
    hidden_state: torch.Tensor
    logits: torch.Tensor
    kv_cache: List[torch.FloatTensor] = None


class DecoderLayer(nn.Module):

    def __init__(self, config, layer_idx: int, attention_type: str = None) -> None:
        super().__init__()
        self.attention = (DecoderAttention(config, layer_idx=layer_idx)
        )
        if attention_type == "gqa" and layer_idx == 0:  # avoid to print m times
            print("Decoder Using GQA Attention")
        self.feed_forward = FFNGeluModule(config.hidden_size, 4*config.hidden_size, config.hidden_size)
        self.layer_idx = layer_idx
        self.layernorm = RMSNorm(
            config.hidden_size, eps=getattr(config, "layer_norm_eps", 1e-6)
        )

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        kv_cache: List[torch.FloatTensor] = None,
        start_pos: Optional[int] = 0,
    ) -> torch.Tensor:
        out,kv_cache = self.attention(
            hidden_state=hidden_state,
            attention_mask=attention_mask,
            freqs=freqs,
            use_cache=use_cache,
            kv_cache=kv_cache,
            start_pos=start_pos,
        )
        b,l,d = out.size()
        out1 = out.view(-1,d).contiguous()
        out1 = self.feed_forward(out1)
        out1 = out1.view(b,l,d).contiguous() #out # 
        out = self.layernorm(out1 + out)
        return out, kv_cache


class LMHead(nn.Module):
    """Head for masked language modelling"""

    def __init__(self, config) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layerNorm = RMSNorm(
            config.hidden_size, eps=getattr(config, "layer_norm_eps", 1e-6)
        )

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias
        self.gelu = GELU()

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        x = self.dense(hidden_state)
        x = self.gelu(x)
        x = self.layerNorm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x)

        return x

class LinearCrossEntropyIgnoreIndex(torch.autograd.Function):
    @staticmethod
    def forward(ctx, inputs, weight, bias, targets,chunk_size=32, ignore_index=-100):
        logits = F.linear(inputs, weight, bias)
        shape = logits.size()
        logits_flat = logits.view(-1, shape[-1])
        targets_flat = targets.view(-1)
        valid_mask = targets_flat != ignore_index
        valid_logits = logits_flat[valid_mask]
        valid_targets = targets_flat[valid_mask]

        if chunk_size is not None:
            logit_chunks = valid_logits.split(chunk_size)
            target_chunks = valid_targets.split(chunk_size)
            loss_chunks = [
                torch.nn.functional.cross_entropy(
                    logit_chunk, target_chunk, ignore_index=ignore_index, reduction="none"
                )
                for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)]
            
            loss = torch.cat(loss_chunks).mean()


        else:
            softmax = F.softmax(valid_logits, dim=-1)
            log_probs = torch.log(softmax + 1e-12)  # Numerical stability
            target_log_probs = log_probs[torch.arange(valid_logits.size(0)), valid_targets]
            loss = -target_log_probs.mean()

        # softmax = F.softmax(valid_logits, dim=-1)
        # log_probs = torch.log(softmax + 1e-12)  # Numerical stability
        # target_log_probs = log_probs[torch.arange(valid_logits.size(0)), valid_targets]
        # loss = -target_log_probs.mean()

        ctx.save_for_backward(inputs, weight, valid_targets, valid_mask, logits_flat)
        ctx.shape = shape
        ctx.ignore_index = ignore_index
        return loss

    @staticmethod
    def backward(ctx, grad_outputs):
        inputs, weight, valid_targets, valid_mask, logits_flat = ctx.saved_tensors

        grad_logits = torch.zeros_like(logits_flat,dtype = weight.dtype)
        valid_logits = logits_flat[valid_mask]
        valid_grad_logits = F.softmax(valid_logits, dim=-1)
        valid_grad_logits[torch.arange(valid_grad_logits.size(0)), valid_targets] -= 1
        valid_grad_logits /= valid_grad_logits.size(0)  # Normalize by batch size

        grad_logits[valid_mask] = valid_grad_logits
        grad_logits = grad_logits.view(*ctx.shape)
        grad_input = grad_weight = grad_bias = None

        grad_loss = grad_logits * grad_outputs

        grad_input = grad_loss.matmul(weight)
        grad_weight = grad_loss.transpose(-2, -1).matmul(inputs)
        grad_bias = grad_loss.sum(dim=0)

        return grad_input, grad_weight, grad_bias, None, None, None



class MyLinearCrossEntropy(torch.nn.Module):
    def __init__(self, in_features=768, out_features=50265, ignore_index=-100,chunk_size=32):
        super(MyLinearCrossEntropy, self).__init__()
        self.weight = torch.nn.Parameter(torch.Tensor(out_features, in_features))
        self.bias = torch.nn.Parameter(torch.Tensor(out_features))
        self.ignore_index = ignore_index
        self.chunk_size = chunk_size
        torch.nn.init.xavier_uniform_(self.weight)
        torch.nn.init.zeros_(self.bias)

    def forward(self, x, target=None):
        if target is not None:
            x = x[:,:-1,:].contiguous()
            target = target[:,1:].contiguous()
            return LinearCrossEntropyIgnoreIndex.apply(x, self.weight, self.bias, target,self.chunk_size, self.ignore_index)
        else:
            if x.dim() != 2:
                B, l, d = x.shape
                x = x.view(-1, d).contiguous()
            output = x.mm(self.weight.t())
            # if bias is not None:
            output += self.bias.unsqueeze(0).expand_as(output)
            
            return output.view(B,l,50265)
class DecoderModel(nn.Module):

    def __init__(
        self,
        config,
        pos_embedding_type: Optional[str] = "absolute",
        attention_type: str = None,
    ) -> None:
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=getattr(config, "pad_token_id", None),
        )
        self.emb_freq = RotaryEmbedding(config)(config.max_position_embeddings)
        print(
                "Encoder Ignoring sinusoidal or absolute position embeddings because rope,is enable"
            )
        self.all_layer = nn.ModuleList(
            [
                DecoderLayer(config, layer_idx, attention_type)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.lm_head = MyLinearCrossEntropy()
        self.config = config

    def _init_weights(self, module: nn.Module) -> None:
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(
                module.weight, mean=0.0, std=0.02 / torch.sqrt(2 * len(self.all_layer))
            )
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(
                module.weight, mean=0.0, std=0.02 / torch.sqrt(2 * len(self.all_layer))
            )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        target: Optional[torch.Tensor] = None,
        kv_cache: List[torch.FloatTensor] = None,
        start_pos: Optional[int] = 0,
    ) -> torch.Tensor:
        _bsz, seqlen = input_ids.shape
        hidden_state = self.word_embeddings(input_ids)
        freqs = self.emb_freq[:, start_pos : start_pos + seqlen].to(
            input_ids.device
        )
        mask = None
        if seqlen > 1:
            mask = self.create_mask_for_decoder(
                input_ids=input_ids, attention_mask=attention_mask, start_pos=start_pos
            )
            mask = (1.0 - mask) * torch.finfo(
                hidden_state.dtype
            ).min  # invert it to to add directly to attention score

        for layer in self.all_layer:
            hidden_state, kv_cache = layer(
                hidden_state,
                mask,
                freqs=freqs,
                use_cache=use_cache,
                kv_cache=kv_cache,
                start_pos=start_pos,
            )
        logits = self.lm_head(hidden_state,target)
        if target is not None:
            return logits
        return CLMOutput(hidden_state=hidden_state, logits=logits, kv_cache=kv_cache)

    def create_mask_for_decoder(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        start_pos: Optional[int] = 0,
    ) -> torch.Tensor:
        device = input_ids.device
        batch_size, seq_length = input_ids.shape
        if attention_mask is None:
            attention_mask = (
                torch.ones(seq_length + start_pos).repeat(batch_size, 1).to(device)
            )
        seq_ids = torch.arange(seq_length).to(device)
        causal_mask = (
            seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
            <= seq_ids[None, :, None]
        )  # 1x1xl repeat bxlxl compare to 1xlx1

        causal_mask = causal_mask.to(attention_mask.dtype)

        if start_pos > 0:  # correct the attention mask  for kv-cache operation
            causal_mask = torch.cat(
                [
                    torch.ones(
                        (batch_size, seq_length, start_pos),
                        device=device,
                        dtype=causal_mask.dtype,
                    ),
                    causal_mask,
                ],
                axis=-1,
            )

        extended_attention_mask = (
            causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
        )  # # this is mainly if batch contains <PAD> tokens. stop casual procees before <PAD>
        return extended_attention_mask

    @classmethod
    def from_config(
        cls,
        config,
        pos_embedding_type: Optional[str] = "absolute",
        attention_type: Optional[str] = None,
    ) -> nn.Module:
        return cls(config, pos_embedding_type, attention_type)

    def _setup_cache(self, config, cls: Optional[object] = StaticCache) -> None:
        for layer in self.all_layer:
            layer.attention.cache = cls(config)

    def _clean_cache(self) -> None:
        for layer in self.all_layer:
            layer.attention.cache = None


In [12]:
from einops import rearrange

In [13]:
m =AutoModelForCausalLM.from_pretrained("../input/transformer-distilation-gpt-2/gpt2_6L")

2025-07-05 08:55:04.065976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751705704.256077      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751705704.315930      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [14]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
tokenizer.model_max_length

1024

In [16]:
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

# Set reasonable default for models without max length
if tokenizer.model_max_length > 512:
  tokenizer.model_max_length = 512

In [17]:
state_dict = m.state_dict()

In [18]:
string = open('/kaggle/input/mark-twain-books/Combine.txt',encoding='utf8',errors='ignore').read()
new_str = re.sub('�', '', string)
open('Train.txt', 'w').write(new_str)

6588596

In [19]:
model_ckpt = "roberta-base"
config = AutoConfig.from_pretrained(model_ckpt)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [20]:
config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [21]:
from types import SimpleNamespace
from collections import namedtuple


In [22]:
config = SimpleNamespace(**config.__dict__)
config.vocab_size = len(tokenizer)
config.num_hidden_layers = 6

In [23]:
model = DecoderModel.from_config(config,pos_embedding_type='rope')

Encoder Ignoring sinusoidal or absolute position embeddings because rope,is enable


In [24]:
model.word_embeddings = nn.Embedding.from_pretrained(
    state_dict["transformer.wte.weight"], freeze=False
)

**Data Source**

https://www.kaggle.com/datasets/msinger007/mark-twain-books

In [25]:
train_path = 'Train.txt'

In [26]:
class TextDataset(Dataset):

    def __init__(
        self,
        tokenizer,
        file_path: str,
        block_size: int):
        if os.path.isfile(file_path) is False:
            raise ValueError(f"Input file path {file_path} not found")

        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        saved = False
        cache_dir = None
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
        )

     
        if os.path.exists(cached_features_file) and saved :
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
#                 logger.info(
#                     f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
#                 )

        else:
#                 logger.info(f"Creating features from dataset file at {directory}")

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
                    # )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    saved = True
#                 logger.info(
#                     f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
#                 )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return {"input_ids":torch.tensor(self.examples[i], dtype=torch.long)}

In [27]:
def collate(batch):
    labels = batch["input_ids"].clone()
    if tokenizer.pad_token_id is not None:
        labels[labels == tokenizer.pad_token_id] = -100
    batch["labels"] = labels
    return batch

In [28]:
train_loader = torch.utils.data.DataLoader(TextDataset(tokenizer,train_path,128), batch_size=16, shuffle=True, num_workers=2)

Token indices sequence length is longer than the specified maximum sequence length for this model (1580900 > 512). Running this sequence through the model will result in indexing errors


In [31]:

def train(model, train_loader, EPOCHS=3):
    EPOCHS = 5
    accumulation_steps = 2
    lr=5e-5
    # train it fully 
    no_decay = ['bias', 'layerNorm.weight','layerNorm.bias']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
    num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
    #                                     num_training_steps=num_train_optimization_steps)
    best_epoch_loss = np.inf
    accelerator = Accelerator(log_with="tensorboard",project_dir=".",gradient_accumulation_steps=2)
    Config = {
        "num_epoch": EPOCHS,
        "learning_rate": lr,
        "loss_function": str(torch.nn.CrossEntropyLoss),
    }
    epoch_check = len(train_loader)
    total_step = epoch_check * EPOCHS
    accelerator.init_trackers("CLM_project", config=Config)
    # train_bar =tqdm(total=total_step, dynamic_ncols=True, disable=not accelerator.is_main_process)
    train_loader,model,optimizer =  accelerator.prepare(train_loader,model, optimizer)
    model.train()
    t_step = 0
    # loss_fn = CustomLoss.apply
    
    for epoch in range(EPOCHS):
        loss_list = []
        for step, data in enumerate(train_loader):
            # train_bar.update(1)
            with accelerator.accumulate(model):
                data =  collate(data)
                x = data["input_ids"] #.to(device)
                y = data['labels'] #.to(device)
                optimizer.zero_grad()
                loss = model(input_ids = x,target=y)
                # pred = pred[:, :-1, :].contiguous()
                # y = y[:, 1:].contiguous()
                # loss = loss_fn(pred,y)
                accelerator.backward(loss)
                optimizer.step()
                # scheduler.step()
                accelerator.log({"training_loss_step": loss}, step=t_step)
                t_step+=1
            
            loss_list.append(loss.detach().cpu().item())
        
        avg_loss = np.round(np.mean(loss_list), 4)

        accelerator.print(f'Epoch--{epoch+1} ### Train loss---{avg_loss}')
        
    PATH = f"decoder__{epoch}.pth"
    model = accelerator.unwrap_model(model)
    torch.save(model.state_dict(), PATH)
    accelerator.end_training()
    accelerator.free_memory(train_loader,model, optimizer)

In [32]:
train(model,train_loader)

W0705 08:55:39.077000 35 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Epoch--1 ### Train loss---6.8396
Epoch--2 ### Train loss---5.5833
Epoch--3 ### Train loss---5.2548
Epoch--4 ### Train loss---5.0357
Epoch--5 ### Train loss---4.8594


In [33]:
# from accelerate import notebook_launcher
# notebook_launcher(train, (model,train_loader), num_processes=2)

In [34]:
model = DecoderModel.from_config(config,pos_embedding_type='rope')

Encoder Ignoring sinusoidal or absolute position embeddings because rope,is enable


In [35]:
model.load_state_dict(torch.load('/kaggle/working/decoder__4.pth', weights_only=True))

<All keys matched successfully>

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
model.to(device)
model.eval()

DecoderModel(
  (word_embeddings): Embedding(50257, 768, padding_idx=1)
  (all_layer): ModuleList(
    (0-5): 6 x DecoderLayer(
      (attention): DecoderAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (out): LinearRMSFused()
      )
      (feed_forward): FFNGeluModule()
      (layernorm): RMSNorm()
    )
  )
  (lm_head): MyLinearCrossEntropy()
)

In [38]:
@torch.no_grad()
def generate(
    model,
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    max_len: int = 20,
    temperature: float = 1.0,
    use_cache: bool = True,
    do_sample: bool = False,
    use_static_cache: bool = False,
) -> torch.Tensor:

    device = input_ids.device

    all_prompt_size = [t.size()[0] for t in input_ids]

    min_prompt_len = min(all_prompt_size)
    max_prompt_len = max(all_prompt_size)

    max_len = (
        max_len + max_prompt_len
    )  # get  max len (prompt + to be generated token combined)

    pad_id = getattr(model.config, "pad_token_id", 50256)
    bsz, _ = input_ids.size()
    tokens = torch.full((bsz, max_len), pad_id, dtype=torch.long, device=device)

    kv_cache = None
    if use_cache:
        if use_static_cache:
            kv_cache = StaticCache(model.config, max_cache_len=max_len, batch_size=bsz)
        else:
            kv_cache = DynamicCache(model.config)

    for k, t in enumerate(input_ids):
        tokens[k, : t.size()[0]] = t

    prev_pos = torch.tensor(0, device=device)
    eos_reached = torch.tensor([False] * bsz, device=device)
    # to break generation if eos reached for all  prompt

    input_text_mask = tokens != pad_id  # mask to fill generated values into batch

    stop_tokens = torch.tensor(getattr(model.config, "eos_token_id", 50256), device=device)
    for cur_pos in range(min_prompt_len, max_len):

        # Get the model output
        with torch.no_grad():
            outputs = model(
                input_ids=tokens[:, prev_pos:cur_pos],
                attention_mask=attention_mask,
                use_cache=use_cache,
                kv_cache=kv_cache,
                start_pos=prev_pos,
            )
        kv_cache = outputs.kv_cache
        next_token_logits = outputs.logits[:, -1] / temperature

        if do_sample:
            next_token = torch.multinomial(next_token_logits, num_samples=1)
        else:
            _, next_token = torch.topk(next_token_logits, k=1, dim=-1)

        next_token = next_token.reshape(-1)
        # only replace token if prompt has already been generated
        next_token = torch.where(
            input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
        )
        tokens[:, cur_pos] = next_token
        eos_reached |= (~input_text_mask[:, cur_pos]) & (
            torch.isin(next_token, stop_tokens)
        )

        if use_cache:
            prev_pos = cur_pos

        attention_mask = torch.cat(
            [attention_mask, torch.ones((bsz, 1), device=device)], dim=-1
        )
        if all(eos_reached):
            break
    return tokens

In [39]:
text = tokenizer(["this is a test, blue","Well, sir, you could"], return_tensors="pt", padding=True)

In [40]:
text

{'input_ids': tensor([[ 5661,   318,   257,  1332,    11,  4171],
        [ 5779,    11, 15967,    11,   345,   714]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]])}

In [41]:
input_ids , attention_mask = text['input_ids'],text['attention_mask']
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

In [42]:
out = generate(model,input_ids=input_ids , attention_mask=attention_mask,use_cache=False)
tokenizer.batch_decode(out)

['this is a test, blue-room, and a  little, and a little, and a little, and a little,',
 'Well, sir, you could see the  old man, and you are going to see you.  The man was a good']

In [43]:
out = generate(model,input_ids=input_ids , attention_mask=attention_mask,use_cache=True)
tokenizer.batch_decode(out)

['this is a test, blue-room, and a  little, and a little, and a little, and a little,',
 'Well, sir, you could see the  old man, and you are going to see you.  The man was a good']