# Data 
Change "mcqa_dataset.jsonl" to chat templet formate for awq quantization calibration

ex. 
```
[
    
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me who you are."},
    {"role": "assistant", "content": "I am a large language model named Qwen..."}
]
```

In [None]:
#Code origin
#Author: Alexander Valentini

#This is to preprocess the data for the quantization. It sets up the data in the format with "message" and different 
#roles but does not add chattemplate.

# !pip install datasets
# !pip install transformers
# !pip install -U huggingface_hub
# !pip install filelock

In [3]:
from datasets import load_dataset,DatasetDict,Dataset

# dataset_path="/datasets/mcqa_dataset.jsonl"
train_dataset_path="datasets/mcqa/no_chattemplate/all_subjects_no_chattemplate/mcqa_train_dataset_full.jsonl"
test_dataset_path="datasets/mcqa/no_chattemplate/all_subjects_no_chattemplate/mcqa_test_dataset_full.jsonl"
# read the dataset
mcqa_dataset=load_dataset("json", data_files={"train":train_dataset_path, "test":test_dataset_path})

def preprocess_function(examples):
    # print(examples)
    question=examples['question']
    answer=examples['answer']
    system=examples['subject']
    final={"message":[
        {"role": "system", "content": f"You are a helpful {system} assistant."},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}
    ]}
    return final

#Dataset is loaded in as a dictionary of datasets. We create a new dataset dict from this:
new_dataset=DatasetDict()
for set in mcqa_dataset:
    data=mcqa_dataset[set]
    new_dataset[set]=data.map(preprocess_function)
    

In [4]:
import os
print(os.path.exists('datasets/mcqa/no_chattemplate/all_subjects_no_chattemplate/mcqa_train_dataset_full.jsonl'))

True


In [3]:
#mcqa_dataset

In [4]:
#new_dataset['train'][0]

In [5]:
train_save_path="datasets/mcqa/message_format_no_chattemplate/all_subjects_full_dataset/mcqa_train_dataset_full_message_format_no_chattemplate.jsonl"
test_save_path="datasets/mcqa/message_format_no_chattemplate/all_subjects_full_dataset/mcqa_test_dataset_full_message_format_no_chattemplate.jsonl"
#save new dataset train list to jsonl file
#new_dataset['train'].to_json(train_save_path,orient='records',lines=True)
#new_dataset['test'].to_json(test_save_path,orient='records',lines=True)

In [6]:
# read the dataset
new_mcqa_dataset=load_dataset("json", data_files={"train":train_save_path, "test":test_save_path})
new_mcqa_dataset

DatasetDict({
    train: Dataset({
        features: ['subject', 'question', 'answer', 'message'],
        num_rows: 12686
    })
    test: Dataset({
        features: ['subject', 'question', 'answer', 'message'],
        num_rows: 1586
    })
})

# Quantization

In [7]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import time

# Specify paths and hyperparameters for quantization
model_path = "AlexVal/dpo_model"
current_time=time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
quant_path = f"models/quantized_model_from_exploration_script/quantized_model_{current_time}"
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMV" } # or version: GEMV

# Load your tokenizer and model with AutoAWQ
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto", safetensors=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [8]:
from datasets import load_dataset,DatasetDict,Dataset
train_save_path="datasets/mcqa/message_format_no_chattemplate/all_subjects_full_dataset/mcqa_train_dataset_full_message_format_no_chattemplate.jsonl"
test_save_path="datasets/mcqa/message_format_no_chattemplate/all_subjects_full_dataset/mcqa_test_dataset_full_message_format_no_chattemplate.jsonl"
mcqa_train_data=load_dataset("json", data_files=train_save_path)['train']

In [9]:
data = []
for content in mcqa_train_data:
    msg = content['message']
    text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
    data.append(text.strip())

In [None]:
model.quantize(tokenizer, quant_config=quant_config, calib_data=data)

In [None]:
import time
current_time=time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
quant_path = f"models/quantized_model_from_exploration_script/quantized_model_{current_time}"
model.save_quantized(quant_path, safetensors=True, shard_size="4GB")
tokenizer.save_pretrained(quant_path)

In [47]:
#Check quantized model size
import glob
import os
quantized = False
quantized_model_path="models/quantized_model_from_exploration_script/quantized_model_2024-06-05-19-23-32"
quantized_checkpoints = glob.glob(f"{quantized_model_path}/*.bin")
if len(quantized_checkpoints) == 0:
    quantized_checkpoints = glob.glob(f"{quantized_model_path}/*.safetensors")
if len(quantized_checkpoints) == 0:
    quantized_checkpoints = glob.glob(f"{quantized_model_path}/*.pt")

quantized_model_size_on_disk = 0
for name in quantized_checkpoints:
    quantized_model_size_on_disk += os.path.getsize(name)
quantized_model_size_gb=quantized_model_size_on_disk/(1024)**3
print("quantization model size (gb):",quantized_model_size_gb)

# orig_checkpoints = glob.glob(f"{self.policy_model_path}/*.bin")
# orig_model_size_on_disk = 0
# for name in orig_checkpoints:
#     orig_model_size_on_disk += os.path.getsize(name)
# print("original model size",orig_model_size_on_disk)
# if quantized_model_size_on_disk < orig_model_size_on_disk:
#     quantized = True



quantization model size (gb): 1.7079910039901733


In [44]:
# check generation
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "CPU")

quantized_model_repoid="AlexVal/mcqa_model_full_only_mcqa-awq_newest"
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_repoid, # the quantized model
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_repoid)

model.to(device)

prompt = "Question: What value of y makes y + 2.9 = 11 true?\n\nOptions:\nA. 8.1\nB. 8.9\nC. 9.1\nD. 13.9\n\nAnswer:"
messages = [
    {"role": "system", "content": "You are a helpful elementary_mathematics assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("response:",response)

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# Push to hub

In [None]:
import huggingface_hub
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

safetensors_path = 'models/quantized_model_from_exploration_script/quantized_model_2024-06-05-19-23-32'

tokenizer = AutoTokenizer.from_pretrained(safetensors_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(safetensors_path,use_safetensors = True).to(device)
repoid='AlexVal/mcqa_model_full_only_mcqa-awq_newest'

access_token=""

model.push_to_hub(
 repo_id=repoid,
 commit_message="Commiting model",
 token=access_token
)

tokenizer.push_to_hub(
 repo_id=repoid,
 commit_message="Commiting tokenizer",
 token=access_token
)