# Imports

In [1]:
from __future__ import annotations
from typing import Tuple

import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    BitsAndBytesConfig
)

import helper_utils.utils as utl

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Directories

In [2]:
# Output folders created (or reused) through utl.make_dir; the save cells below
# write into NR_DIR and PTQ_DIR.
AI_DIR: str = utl.make_dir('Decoders/allenai')
NR_DIR: str = utl.make_dir('Decoders/NousResearch')
GPT_DIR: str = utl.make_dir('Decoders/GPT')
PTQ_DIR: str = utl.make_dir('Decoders/PTQ')

Directory Decoders/allenai already exists!
Directory Decoders/NousResearch already exists!
Directory Decoders/GPT already exists!
Directory Decoders/PTQ already exists!


# Load and Save Model Helpers

In [3]:
# Model identifiers intended to be passed as `KEY` to the loader helpers below.
# Only DEEP3B_KEY and DEEP8B_KEY are used in the cells visible in this section.
GPT_KEY = 'gpt2'
LLAMA_KEY = 'NousResearch/Llama-3.2-1B'
BITNET_KEY = 'NousResearch/OLMo-Bitnet-1B'
OLMO_KEY = 'allenai/OLMo-1B-0724-hf'
DEEP3B_KEY = 'NousResearch/DeepHermes-3-Llama-3-3B-Preview'
DEEP8B_KEY = 'NousResearch/DeepHermes-3-Llama-3-8B-Preview'

In [4]:
def load_fp_auto_tokenizer(KEY:str, hs:bool, r_dict:bool, precision:torch.dtype, mem:bool, dmap:str, sf:bool, trust_remote:bool, configs:bool) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a causal-LM checkpoint together with its tokenizer.

    Args:
        KEY: model identifier (or path) handed to ``from_pretrained``.
        hs: value for ``output_hidden_states``.
        r_dict: value for ``return_dict``; ignored when ``configs`` is True.
        precision: value for ``torch_dtype`` (e.g. ``torch.float32``,
            ``torch.float16`` or ``torch.bfloat16``); ignored when ``configs``
            is True.
        mem: value for ``low_cpu_mem_usage``.
        dmap: value for ``device_map`` (e.g. ``'auto'``); ignored when
            ``configs`` is True.
        sf: value for ``use_safetensors``; ignored when ``configs`` is True.
        trust_remote: accepted for interface compatibility; currently not
            forwarded to ``from_pretrained``.
        configs: when exactly ``True``, load through an ``AutoConfig`` object
            instead of passing the individual loading options.

    Returns:
        ``(model, tokenizer)``, both loaded from ``KEY``.
    """
    if configs is True:
        config = AutoConfig.from_pretrained(KEY)
        config.output_hidden_states = hs

        model = AutoModelForCausalLM.from_pretrained(
            KEY,
            config=config,
            # low_cpu_mem_usage is a loading option of from_pretrained, not a
            # config field, so it has to be passed here to take effect.
            low_cpu_mem_usage=mem,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            KEY,
            return_dict=r_dict,
            output_hidden_states=hs,
            torch_dtype=precision,
            low_cpu_mem_usage=mem,
            device_map=dmap,
            use_safetensors=sf,
        )

    # The tokenizer is loaded the same way for both branches.
    tokenizer = AutoTokenizer.from_pretrained(KEY)

    return model, tokenizer


def load_fp_auto(KEY:str, hs:bool, r_dict:bool, precision:torch.dtype, mem:bool, dmap:str, sf:bool, trust_remote:bool, configs:bool) -> AutoModelForCausalLM:
    """Load a causal-LM checkpoint (model only, no tokenizer).

    Args:
        KEY: model identifier (or path) handed to ``from_pretrained``.
        hs: value for ``output_hidden_states``.
        r_dict: value for ``return_dict``; ignored when ``configs`` is True.
        precision: value for ``torch_dtype`` (e.g. ``torch.float32``,
            ``torch.float16`` or ``torch.bfloat16``); ignored when ``configs``
            is True.
        mem: value for ``low_cpu_mem_usage``.
        dmap: value for ``device_map`` (e.g. ``'auto'``); ignored when
            ``configs`` is True.
        sf: value for ``use_safetensors``; ignored when ``configs`` is True.
        trust_remote: accepted for interface compatibility; currently not
            forwarded to ``from_pretrained``.
        configs: when exactly ``True``, load through an ``AutoConfig`` object
            instead of passing the individual loading options.

    Returns:
        The loaded model.
    """
    if configs is True:
        config = AutoConfig.from_pretrained(KEY)
        config.output_hidden_states = hs

        return AutoModelForCausalLM.from_pretrained(
            KEY,
            config=config,
            # low_cpu_mem_usage is a loading option of from_pretrained, not a
            # config field, so it has to be passed here to take effect.
            low_cpu_mem_usage=mem,
        )

    return AutoModelForCausalLM.from_pretrained(
        KEY,
        return_dict=r_dict,
        output_hidden_states=hs,
        torch_dtype=precision,
        low_cpu_mem_usage=mem,
        device_map=dmap,
        use_safetensors=sf,
    )


def load_8bit_auto(KEY:str, hs:bool, r_dict:bool, precision:torch.dtype, bnb_precision:torch.dtype, dmap:str, sf:bool, trust_remote:bool) -> AutoModelForCausalLM:
    """Load a causal-LM checkpoint with bitsandbytes 8-bit quantization.

    Args:
        KEY: model identifier (or path) handed to ``from_pretrained``.
        hs: value for ``output_hidden_states``.
        r_dict: value for ``return_dict``.
        precision: value for ``torch_dtype``.
        bnb_precision: kept for backward compatibility and unused.
            ``BitsAndBytesConfig`` has no 8-bit compute-dtype or double-quant
            option (only ``bnb_4bit_*`` equivalents exist), so the former
            ``bnb_8bit_compute_dtype`` / ``bnb_8bit_use_double_quant``
            keywords were not recognised options.
        dmap: value for ``device_map`` (e.g. ``'auto'``).
        sf: value for ``use_safetensors``.
        trust_remote: accepted for interface compatibility; currently not
            forwarded to ``from_pretrained``.

    Returns:
        The loaded, 8-bit quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_enable_fp32_cpu_offload=True,
        llm_int8_has_fp16_weight=False,
    )

    return AutoModelForCausalLM.from_pretrained(
        KEY,
        return_dict=r_dict,
        output_hidden_states=hs,
        torch_dtype=precision,
        quantization_config=bnb_config,
        device_map=dmap,
        use_safetensors=sf,
    )


def load_4bit_auto(KEY:str, hs:bool, r_dict:bool, precision:torch.dtype, dmap:str, sf:bool, trust_remote:bool) -> AutoModelForCausalLM:
    """Load a causal-LM checkpoint with bitsandbytes 4-bit (NF4) quantization.

    Args:
        KEY: model identifier (or path) handed to ``from_pretrained``.
        hs: value for ``output_hidden_states``.
        r_dict: value for ``return_dict``.
        precision: used as ``bnb_4bit_compute_dtype`` (e.g. ``torch.float32``,
            ``torch.float16`` or ``torch.bfloat16``). It is not passed as
            ``torch_dtype``.
        dmap: value for ``device_map`` (e.g. ``'auto'``).
        sf: value for ``use_safetensors``.
        trust_remote: accepted for interface compatibility; currently not
            forwarded to ``from_pretrained``.

    Returns:
        The loaded, 4-bit quantized model.
    """
    # NF4 data type with double quantization enabled.
    quant_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=precision,
    )

    loader_options = dict(
        return_dict=r_dict,
        output_hidden_states=hs,
        quantization_config=quant_settings,
        device_map=dmap,
        use_safetensors=sf,
    )
    return AutoModelForCausalLM.from_pretrained(KEY, **loader_options)

# 4. NousResearch/DeepHermes-3-Llama-3-3B-Preview

## 4.a. Base Deep Hermes Model

In [8]:
# Load the DeepHermes 3B checkpoint and its tokenizer in float16 with hidden
# states enabled; configs=False selects the explicit-keyword loading path.
deep3b_base, deep3b_tokenizer = load_fp_auto_tokenizer(
    KEY=DEEP3B_KEY,
    hs=True,
    r_dict=True,
    precision=torch.float16,
    mem=True,
    dmap='auto',
    sf=True,
    trust_remote=True,
    configs=False
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.85s/it]


In [9]:
# Store the 3B tokenizer under the NousResearch folder.
utl.save_tokenizer(deep3b_tokenizer, NR_DIR + '/DeepHermes-3-Llama-3-3B-Preview-Tokenizer')

Tokenizer saved to Decoders/NousResearch/DeepHermes-3-Llama-3-3B-Preview-Tokenizer


In [10]:
# Store the unquantized 3B base model under the NousResearch folder.
utl.save_model(deep3b_base, NR_DIR + '/DeepHermes-3-Llama-3-3B-Preview-Base')

Model saved to Decoders/NousResearch/DeepHermes-3-Llama-3-3B-Preview-Base


## 4.b. Custom-Quantized Deep Hermes 3B Models: 8-bit, 4-bit, 2-bit, 1.58-bit and 1-bit

In [5]:
# Reload the 3B checkpoint in float16 and pass it straight to utl.ptq_2bit.
# The printed excerpt below shows the values -1, -0.3301, 0.3301 and 1.
deep_2bit = utl.ptq_2bit(
    load_fp_auto(
        KEY=DEEP3B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='auto',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.26s/it]


Quantized Layer: model.layers.0.self_attn.q_proj
 tensor([[-1.0000,  1.0000,  1.0000,  ...,  1.0000, -1.0000, -1.0000],
        [-0.3301, -1.0000,  1.0000,  ...,  1.0000,  0.3301, -1.0000],
        [ 1.0000,  1.0000,  1.0000,  ...,  1.0000, -1.0000, -0.3301],
        ...,
        [ 0.3301, -0.3301,  1.0000,  ..., -0.3301,  0.3301,  0.3301],
        [ 0.3301, -0.3301,  1.0000,  ..., -0.3301, -1.0000,  0.3301],
        [-0.3301,  1.0000, -0.3301,  ...,  0.3301,  0.3301, -0.3301]],
       device='cuda:0', dtype=torch.float16)
Quantized Layer: model.layers.0.self_attn.k_proj
 tensor([[ 0.3301, -1.0000,  1.0000,  ..., -1.0000, -1.0000,  0.3301],
        [ 0.3301, -1.0000,  1.0000,  ..., -1.0000, -1.0000, -1.0000],
        [-0.3301, -1.0000,  1.0000,  ..., -0.3301, -0.3301,  1.0000],
        ...,
        [ 0.3301,  1.0000, -1.0000,  ..., -1.0000,  0.3301,  0.3301],
        [ 0.3301,  0.3301,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
        [-0.3301, -0.3301,  1.0000,  ...,  1.0000, -1.0000

In [6]:
# Store the 2-bit 3B model under the PTQ folder.
utl.save_model(deep_2bit, PTQ_DIR + '/DeepHermes-3-Llama-3-3B-Preview-2bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-3B-Preview-2bit


In [5]:
# Reload the 3B checkpoint in float16 and pass it straight to utl.ptq_1_58bit.
# The printed excerpt below shows the values -1, 0 and 1.
deep_158bit = utl.ptq_1_58bit(
    load_fp_auto(
        KEY=DEEP3B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='auto',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]


Quantized Layer: model.layers.0.self_attn.q_proj
 tensor([[-1.,  1.,  1.,  ...,  1., -1., -1.],
        [-1., -1.,  1.,  ...,  1.,  0., -1.],
        [ 1.,  1.,  1.,  ...,  1., -1., -1.],
        ...,
        [ 0., -1.,  1.,  ..., -1.,  1.,  1.],
        [ 1., -1.,  1.,  ..., -1., -1.,  1.],
        [-1.,  1., -1.,  ...,  1.,  0., -1.]], device='cuda:0')
Quantized Layer: model.layers.0.self_attn.k_proj
 tensor([[ 0., -1.,  1.,  ..., -1., -1.,  1.],
        [ 0., -1.,  1.,  ..., -1., -1., -1.],
        [-1., -1.,  1.,  ..., -1., -1.,  1.],
        ...,
        [ 1.,  1., -1.,  ..., -1.,  1.,  0.],
        [ 0.,  1.,  1.,  ...,  1.,  1.,  1.],
        [ 0., -1.,  1.,  ...,  1., -1.,  1.]], device='cuda:0')
Quantized Layer: model.layers.0.self_attn.v_proj
 tensor([[ 1.,  1.,  1.,  ..., -1., -1.,  1.],
        [ 1.,  0.,  0.,  ..., -1., -1.,  1.],
        [ 1., -1.,  1.,  ..., -1., -1., -1.],
        ...,
        [ 1.,  1.,  0.,  ..., -1.,  0.,  1.],
        [ 1.,  1.,  1.,  ..., -1.,  1.,

In [6]:
# Store the 1.58-bit 3B model under the PTQ folder.
utl.save_model(deep_158bit, PTQ_DIR + '/DeepHermes-3-Llama-3-3B-Preview-1.58bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-3B-Preview-1.58bit


In [5]:
# Reload the 3B checkpoint in float16 and pass it straight to utl.ptq_1bit.
# The printed excerpt below shows only the values -1 and 1.
deep_1bit = utl.ptq_1bit(
    load_fp_auto(
        KEY=DEEP3B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='auto',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


Quantized Layer: model.layers.0.self_attn.q_proj
 tensor([[-1.,  1.,  1.,  ...,  1., -1., -1.],
        [-1., -1.,  1.,  ...,  1.,  1., -1.],
        [ 1.,  1.,  1.,  ...,  1., -1., -1.],
        ...,
        [ 1., -1.,  1.,  ..., -1.,  1.,  1.],
        [ 1., -1.,  1.,  ..., -1., -1.,  1.],
        [-1.,  1., -1.,  ...,  1.,  1., -1.]], device='cuda:0',
       dtype=torch.float16)
Quantized Layer: model.layers.0.self_attn.k_proj
 tensor([[ 1., -1.,  1.,  ..., -1., -1.,  1.],
        [ 1., -1.,  1.,  ..., -1., -1., -1.],
        [-1., -1.,  1.,  ..., -1., -1.,  1.],
        ...,
        [ 1.,  1., -1.,  ..., -1.,  1.,  1.],
        [ 1.,  1.,  1.,  ...,  1.,  1.,  1.],
        [-1., -1.,  1.,  ...,  1., -1.,  1.]], device='cuda:0',
       dtype=torch.float16)
Quantized Layer: model.layers.0.self_attn.v_proj
 tensor([[ 1.,  1.,  1.,  ..., -1., -1.,  1.],
        [ 1., -1., -1.,  ..., -1., -1.,  1.],
        [ 1., -1.,  1.,  ..., -1., -1., -1.],
        ...,
        [ 1.,  1.,  1.,  ...,

In [6]:
# Store the 1-bit 3B model under the PTQ folder.
utl.save_model(deep_1bit, PTQ_DIR + '/DeepHermes-3-Llama-3-3B-Preview-1bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-3B-Preview-1bit


In [5]:
# Reload the 3B checkpoint in float16 and pass it to utl.ptq_4bit_uniform.
# Unlike the 2-, 1.58- and 1-bit cells above, this one loads with dmap='cpu'.
deep_4bit = utl.ptq_4bit_uniform(
    load_fp_auto(
        KEY=DEEP3B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


Quantized model.layers.0.self_attn.q_proj to 4-bit
 tensor([[-0.0703,  0.0703,  0.0703,  ...,  0.0703, -0.0703, -0.0703],
        [-0.0703, -0.0703,  0.0703,  ...,  0.0703,  0.0703, -0.0703],
        [ 0.0703,  0.0703,  0.0703,  ...,  0.0703, -0.0703, -0.0703],
        ...,
        [ 0.0703, -0.0703,  0.0703,  ..., -0.0703,  0.0703,  0.0703],
        [ 0.0703, -0.0703,  0.0703,  ..., -0.0703, -0.0703,  0.0703],
        [-0.0703,  0.0703, -0.0703,  ...,  0.0703,  0.0703, -0.0703]])
Quantized model.layers.0.self_attn.k_proj to 4-bit
 tensor([[ 0.0648, -0.0648,  0.0648,  ..., -0.0648, -0.0648,  0.0648],
        [ 0.0648, -0.0648,  0.0648,  ..., -0.0648, -0.0648, -0.0648],
        [-0.0648, -0.0648,  0.0648,  ..., -0.0648, -0.0648,  0.0648],
        ...,
        [ 0.0648,  0.0648, -0.0648,  ..., -0.0648,  0.0648,  0.0648],
        [ 0.0648,  0.0648,  0.0648,  ...,  0.0648,  0.0648,  0.0648],
        [-0.0648, -0.0648,  0.0648,  ...,  0.0648, -0.0648,  0.0648]])
Quantized model.layers.0.sel

In [6]:
# Store the uniform 4-bit 3B model under the PTQ folder.
utl.save_model(deep_4bit, PTQ_DIR + '/DeepHermes-3-Llama-3-3B-Preview-4bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-3B-Preview-4bit


In [7]:
# Reload the 3B checkpoint in float16 on the CPU (dmap='cpu') and pass it to
# utl.ptq_8bit_uniform.
deep_8bit = utl.ptq_8bit_uniform(
    load_fp_auto(
        KEY=DEEP3B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]


Quantized model.layers.0.self_attn.q_proj to 8-bit
 tensor([[-0.0290,  0.0290,  0.0620,  ...,  0.0869, -0.0372, -0.0207],
        [-0.0124, -0.0207,  0.0455,  ...,  0.0538,  0.0041, -0.0372],
        [ 0.0207,  0.0207,  0.0372,  ...,  0.0538, -0.0372, -0.0124],
        ...,
        [ 0.0041, -0.0041,  0.0703,  ..., -0.0124,  0.0041,  0.0124],
        [ 0.0041, -0.0124,  0.0207,  ..., -0.0124, -0.0207,  0.0124],
        [-0.0041,  0.0207, -0.0041,  ...,  0.0041,  0.0041, -0.0124]])
Quantized model.layers.0.self_attn.k_proj to 8-bit
 tensor([[ 0.0038, -0.1182,  0.0572,  ..., -0.0267, -0.0725,  0.0114],
        [ 0.0038, -0.0648,  0.0420,  ..., -0.0420, -0.0191, -0.0191],
        [-0.0114, -0.0725,  0.0801,  ..., -0.0191, -0.0191,  0.0267],
        ...,
        [ 0.0114,  0.0267, -0.0420,  ..., -0.0267,  0.0114,  0.0038],
        [ 0.0038,  0.0191,  0.0496,  ...,  0.0191,  0.0343,  0.0343],
        [-0.0038, -0.0114,  0.0801,  ...,  0.0267, -0.0801,  0.0114]])
Quantized model.layers.0.sel

In [8]:
# Store the uniform 8-bit 3B model under the PTQ folder.
utl.save_model(deep_8bit, PTQ_DIR + '/DeepHermes-3-Llama-3-3B-Preview-8bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-3B-Preview-8bit


# 5. Deep Hermes LLaMA 8B

## 5.a. NousResearch/DeepHermes-3-Llama-3-8B-Preview Base Model

In [5]:
# Load the DeepHermes 8B checkpoint and its tokenizer in float16 on the CPU
# (dmap='cpu') with hidden states enabled.
deep8b_base, deep8b_tokenizer = load_fp_auto_tokenizer(
    KEY=DEEP8B_KEY,
    hs=True,
    r_dict=True,
    precision=torch.float16,
    mem=True,
    dmap='cpu',
    sf=True,
    trust_remote=True,
    configs=False
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:20<00:00,  5.15s/it]


In [6]:
# Store the 8B tokenizer under the NousResearch folder.
utl.save_tokenizer(deep8b_tokenizer, NR_DIR + '/DeepHermes-3-Llama-3-8B-Preview-Tokenizer')

Tokenizer saved to Decoders/NousResearch/DeepHermes-3-Llama-3-8B-Preview-Tokenizer


In [7]:
# Store the unquantized 8B base model under the NousResearch folder.
utl.save_model(deep8b_base, NR_DIR + '/DeepHermes-3-Llama-3-8B-Preview-Base')

Model saved to Decoders/NousResearch/DeepHermes-3-Llama-3-8B-Preview-Base


## 5.b. Custom-Quantized 8B Models: 8-bit, 4-bit, 2-bit, 1.58-bit and 1-bit

In [5]:
# Reload the 8B checkpoint in float16 on the CPU and pass it to utl.ptq_2bit.
# The printed excerpt below shows the values -1, -0.3301, 0.3301 and 1.
deep8b_2bit = utl.ptq_2bit(
    load_fp_auto(
        KEY=DEEP8B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it]


Quantized Layer: model.layers.0.self_attn.q_proj
 tensor([[ 0.3301, -1.0000, -0.3301,  ...,  1.0000, -1.0000, -1.0000],
        [-1.0000, -1.0000, -0.3301,  ..., -1.0000, -1.0000,  1.0000],
        [-1.0000, -1.0000, -0.3301,  ...,  1.0000, -1.0000,  1.0000],
        ...,
        [-0.3301, -1.0000,  1.0000,  ...,  0.3301, -0.3301,  0.3301],
        [-0.3301, -1.0000,  1.0000,  ...,  0.3301, -0.3301,  0.3301],
        [-0.3301, -1.0000,  1.0000,  ...,  1.0000,  0.3301,  0.3301]],
       dtype=torch.float16)
Quantized Layer: model.layers.0.self_attn.k_proj
 tensor([[-1.0000, -1.0000,  1.0000,  ...,  1.0000, -1.0000,  1.0000],
        [-1.0000, -1.0000,  1.0000,  ...,  1.0000, -0.3301,  1.0000],
        [-1.0000, -1.0000,  1.0000,  ..., -0.3301, -1.0000,  1.0000],
        ...,
        [ 0.3301,  1.0000, -1.0000,  ..., -1.0000, -1.0000,  0.3301],
        [ 1.0000,  1.0000, -1.0000,  ..., -1.0000,  0.3301,  1.0000],
        [-0.3301, -0.3301,  1.0000,  ..., -1.0000,  0.3301, -1.0000]],
    

In [6]:
# Store the 2-bit 8B model under the PTQ folder.
utl.save_model(deep8b_2bit, PTQ_DIR + '/DeepHermes-3-Llama-3-8B-Preview-2bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-8B-Preview-2bit


In [7]:
# Reload the 8B checkpoint in float16 on the CPU and pass it to utl.ptq_1_58bit.
# The printed excerpt below shows the values -1, 0 and 1.
deep8b_158bit = utl.ptq_1_58bit(
    load_fp_auto(
        KEY=DEEP8B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.59s/it]


Quantized Layer: model.layers.0.self_attn.q_proj
 tensor([[ 1., -1., -1.,  ...,  1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1.,  1.],
        [-1., -1., -1.,  ...,  1., -1.,  1.],
        ...,
        [-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1.,  0.,  1.]])
Quantized Layer: model.layers.0.self_attn.k_proj
 tensor([[-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  0., -1.,  1.],
        ...,
        [ 1.,  1., -1.,  ..., -1., -1.,  0.],
        [ 1.,  1., -1.,  ..., -1.,  0.,  1.],
        [ 0., -1.,  1.,  ..., -1.,  1., -1.]])
Quantized Layer: model.layers.0.self_attn.v_proj
 tensor([[ 1., -1.,  0.,  ...,  1.,  0.,  1.],
        [ 0., -1.,  0.,  ..., -1., -1.,  1.],
        [ 1.,  1.,  1.,  ..., -1.,  1.,  1.],
        ...,
        [ 1.,  0.,  0.,  ..., -1., -1., -1.],
        [ 1.,  1.,  1.,  ...,  1., -1., -1.],
        [ 0., -1.,  0.,  ..

In [8]:
# Store the 1.58-bit 8B model under the PTQ folder.
utl.save_model(deep8b_158bit, PTQ_DIR + '/DeepHermes-3-Llama-3-8B-Preview-1.58bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-8B-Preview-1.58bit


In [5]:
# Reload the 8B checkpoint in float16 on the CPU and pass it to utl.ptq_1bit.
# The printed excerpt below shows only the values -1 and 1.
deep8b_1bit = utl.ptq_1bit(
    load_fp_auto(
        KEY=DEEP8B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:20<00:00,  5.03s/it]


Quantized Layer: model.layers.0.self_attn.q_proj
 tensor([[ 1., -1., -1.,  ...,  1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1.,  1.],
        [-1., -1., -1.,  ...,  1., -1.,  1.],
        ...,
        [-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1.,  1.,  1.]], dtype=torch.float16)
Quantized Layer: model.layers.0.self_attn.k_proj
 tensor([[-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ...,  1., -1.,  1.],
        [-1., -1.,  1.,  ..., -1., -1.,  1.],
        ...,
        [ 1.,  1., -1.,  ..., -1., -1.,  1.],
        [ 1.,  1., -1.,  ..., -1.,  1.,  1.],
        [-1., -1.,  1.,  ..., -1.,  1., -1.]], dtype=torch.float16)
Quantized Layer: model.layers.0.self_attn.v_proj
 tensor([[ 1., -1., -1.,  ...,  1.,  1.,  1.],
        [-1., -1., -1.,  ..., -1., -1.,  1.],
        [ 1.,  1.,  1.,  ..., -1.,  1.,  1.],
        ...,
        [ 1., -1.,  1.,  ..., -1., -1., -1.],
        [ 1.,  1.,  1.,  ...,  

In [6]:
# Store the 1-bit 8B model under the PTQ folder.
utl.save_model(deep8b_1bit, PTQ_DIR + '/DeepHermes-3-Llama-3-8B-Preview-1bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-8B-Preview-1bit


In [5]:
# Reload the 8B checkpoint in float16 on the CPU and pass it to
# utl.ptq_4bit_uniform.
deep8b_4bit = utl.ptq_4bit_uniform(
    load_fp_auto(
        KEY=DEEP8B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Quantized model.layers.0.self_attn.q_proj to 4-bit
 tensor([[ 0.0503, -0.0503, -0.0503,  ...,  0.0503, -0.0503, -0.0503],
        [-0.0503, -0.0503, -0.0503,  ..., -0.0503, -0.0503,  0.0503],
        [-0.0503, -0.0503, -0.0503,  ...,  0.0503, -0.0503,  0.0503],
        ...,
        [-0.0503, -0.0503,  0.0503,  ...,  0.0503, -0.0503,  0.0503],
        [-0.0503, -0.0503,  0.0503,  ...,  0.0503, -0.0503,  0.0503],
        [-0.0503, -0.0503,  0.0503,  ...,  0.0503,  0.0503,  0.0503]])
Quantized model.layers.0.self_attn.k_proj to 4-bit
 tensor([[-0.1406, -0.1406,  0.0469,  ...,  0.0469, -0.0469,  0.0469],
        [-0.0469, -0.0469,  0.0469,  ...,  0.0469, -0.0469,  0.0469],
        [-0.0469, -0.0469,  0.0469,  ..., -0.0469, -0.0469,  0.0469],
        ...,
        [ 0.0469,  0.0469, -0.0469,  ..., -0.0469, -0.0469,  0.0469],
        [ 0.0469,  0.0469, -0.0469,  ..., -0.0469,  0.0469,  0.0469],
        [-0.0469, -0.0469,  0.0469,  ..., -0.0469,  0.0469, -0.0469]])
Quantized model.layers.0.sel

In [6]:
# Store the uniform 4-bit 8B model under the PTQ folder.
utl.save_model(deep8b_4bit, PTQ_DIR + '/DeepHermes-3-Llama-3-8B-Preview-4bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-8B-Preview-4bit


In [5]:
# Reload the 8B checkpoint in float16 on the CPU and pass it to
# utl.ptq_8bit_uniform.
deep8b_8bit = utl.ptq_8bit_uniform(
    load_fp_auto(
        KEY=DEEP8B_KEY,
        hs=True,
        r_dict=True,
        precision=torch.float16,
        mem=True,
        dmap='cpu',
        sf=True,
        trust_remote=True, 
        configs=False
    )
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]


Quantized model.layers.0.self_attn.q_proj to 8-bit
 tensor([[ 0.0030, -0.0266, -0.0089,  ...,  0.0089, -0.0443, -0.0266],
        [-0.0148, -0.0680, -0.0030,  ..., -0.0148, -0.0503,  0.0207],
        [-0.0148, -0.0384, -0.0030,  ...,  0.0089, -0.0148,  0.0089],
        ...,
        [-0.0030, -0.0384,  0.0798,  ...,  0.0089, -0.0030,  0.0030],
        [-0.0030, -0.0089,  0.0443,  ...,  0.0030, -0.0030,  0.0030],
        [-0.0030, -0.0148,  0.0325,  ...,  0.0089,  0.0030,  0.0030]])
Quantized model.layers.0.self_attn.k_proj to 8-bit
 tensor([[-0.1020, -0.1517,  0.0800,  ...,  0.0303, -0.0138,  0.0469],
        [-0.0579, -0.0855,  0.0193,  ...,  0.0138, -0.0028,  0.0303],
        [-0.0193, -0.0524,  0.0414,  ..., -0.0028, -0.0138,  0.0469],
        ...,
        [ 0.0083,  0.0193, -0.0248,  ..., -0.0138, -0.0083,  0.0028],
        [ 0.0138,  0.0358, -0.0193,  ..., -0.0138,  0.0028,  0.0083],
        [-0.0028, -0.0083,  0.0138,  ..., -0.0138,  0.0028, -0.0193]])
Quantized model.layers.0.sel

In [6]:
# Store the uniform 8-bit 8B model under the PTQ folder.
utl.save_model(deep8b_8bit, PTQ_DIR + '/DeepHermes-3-Llama-3-8B-Preview-8bit')

Model saved to Decoders/PTQ/DeepHermes-3-Llama-3-8B-Preview-8bit
