In [4]:
import os, json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from dotenv import load_dotenv

%load_ext chime

The chime extension is already loaded. To reload it, use:
  %reload_ext chime


In [2]:
# Load variables from a .env file into the process environment, then read
# the Hugging Face access token from it (None when the variable is unset).
load_dotenv()
HF_TOKEN = os.environ.get('HF_TOKEN')

In [5]:
# bitsandbytes quantization settings used when loading the model below:
# 4-bit loading, "nf4" quantization type, double quantization enabled,
# and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [6]:
%%chime
model = AutoModelForCausalLM.from_pretrained(
    "/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3/",
    quantization_config=bnb_config,
    device_map="auto",
    local_files_only=True,
    token=HF_TOKEN
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# The tokenizer is resolved by Hub repo id (unlike the model, which is read
# from a local directory), so this call may need to contact the Hub.
# Forward the access token loaded from the environment so the request is
# authenticated when the repo requires it; if HF_TOKEN is None the library's
# default credential lookup is used, exactly as before.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN,
)

In [8]:
# Only fill in a pad token when the tokenizer does not define one;
# in that case the end-of-sequence token is reused as the pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [9]:
# Output directory that receives the quantized model, the tokenizer files
# and the quantization config written by the cells below.
path = "/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/"

In [10]:
# Write the loaded (quantized) model to the output directory.
model.save_pretrained(save_directory=path)

In [11]:
# Write the tokenizer files to the same directory as the model; the call is
# left as the cell's last expression so the returned file paths are shown.
tokenizer.save_pretrained(save_directory=path)

('/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/tokenizer_config.json',
 '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/special_tokens_map.json',
 '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/chat_template.jinja',
 '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/tokenizer.model',
 '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/added_tokens.json',
 '/home/denis/Models/LLM/Mistral-7B-Instruct-v0.3_quantized_4bit/tokenizer.json')

In [12]:
# Store the quantization settings as JSON next to the saved model files.
with open(os.path.join(path, "quant_config.json"), "w") as config_file:
    config_file.write(json.dumps(bnb_config.to_dict()))