In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import userdata

hf_token = key=userdata.get('Hug_TOKEN')
!hf auth login --token {hf_token}
!hf --add-to-git-credential

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llamafactory-push` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llamafactory-push`
usage: hf <command> [<args>]
hf: error: unrecognized arguments: --add-to-git-credential


In [None]:
# @title

# This code converts the JSONL conversations into the Qwen chat-template structure (not needed in our case)
from transformers import AutoTokenizer
import json

base_model_id = "Qwen/Qwen2.5-1.5B-Instruct"
input_file = "/content/drive/MyDrive/Data/all_MCQ.jsonl"
output_file = "/content/drive/MyDrive/Data/qwen_formatted_MCQ.jsonl"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Read one JSON record per line. Lines are stripped first: a "blank" line still
# contains its newline character, so it is truthy and would crash json.loads.
conversations = []
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            conversations.append(json.loads(line))

formatted_conversations = []

for conv in conversations:
    # Record is a dict carrying the turns under "messages"
    if isinstance(conv, dict) and "messages" in conv:
        messages = conv["messages"]
    # Record is already the list of messages
    elif isinstance(conv, list):
        messages = conv
    # Anything else is reported and skipped
    else:
        print(f"Unexpected format: {conv.keys() if isinstance(conv, dict) else type(conv)}")
        continue

    try:
        # add_generation_prompt=False: the conversation is rendered as-is. Adding the
        # generation prompt would append an empty assistant header after the
        # conversation, which is only wanted when prompting for a new reply.
        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
    except Exception as e:
        # Best effort: report the offending record and keep going.
        print(f"Error processing: {e}")
        print(f"Data: {conv}")
        continue

    formatted_conversations.append({
        "text": formatted_text,
        "original": conv
    })

# Save the formatted data, one JSON object per line
with open(output_file, 'w', encoding='utf-8') as f:
    for conv in formatted_conversations:
        f.write(json.dumps(conv, ensure_ascii=False) + '\n')

print(f"Successfully formatted {len(formatted_conversations)} out of {len(conversations)} conversations")

## Convert to Alpaca format

In [None]:
# @title
import json

# System prompt stored in the "system" field of every converted row.
# The individual lines are kept as a tuple and joined with newlines below.
_PROMPT_LINES = (
    "You are a quiz parser. Output JSON for questions, \"SKIP\" for non-questions.",
    "",
    "Format: {\"question\": \"text\", \"answers\": {\"1\": \"opt1\", \"2\": \"opt2\"}, \"correct\": \"1\"}",
    "",
    "Key Rules:",
    "1. Clean: remove numbering (Q1:, 1., etc.) and formatting markers",
    "2. Answers: always use keys \"1\", \"2\", \"3\"",
    "3. Correct: find from bold/**text**, [highlight], ✅, [colored], or trailing indicators",
    "4. CRITICAL: All options same format → \"0\"",
    "5. CRITICAL: 2+ marked OR no marking → \"0\"",
    "6. Output ONLY JSON or \"SKIP\"",
    "",
    "Examples:",
    "Input: Q1: What is 2+2? **A) 4** B) 5",
    "Output: {\"question\": \"What is 2+2?\", \"answers\": {\"1\": \"4\", \"2\": \"5\"}, \"correct\": \"1\"}",
    "",
    "Input: Capital of Spain? a) Madrid b) Barcelona c) Valencia. Answer: a",
    "Output: {\"question\": \"Capital of Spain?\", \"answers\": {\"1\": \"Madrid\", \"2\": \"Barcelona\", \"3\": \"Valencia\"}, \"correct\": \"1\"}",
    "",
    "Input: Which is correct? **a) X** **b) Y** **c) Z**",
    "Output: {\"question\": \"Which is correct?\", \"answers\": {\"1\": \"X\", \"2\": \"Y\", \"3\": \"Z\"}, \"correct\": \"0\"}",
)
prompt = "\n".join(_PROMPT_LINES)

# Instruction text stored in the "instruction" field of every converted row.
_TASK_LINES = (
    "Convert the following multiple-choice question into JSON.",
    "If it is not a valid question, output SKIP.",
)
TASK = "\n".join(_TASK_LINES)

def convert_to_alpaca(input_file, output_file, system_prompt=None, task=None):
    """Convert a chat-style JSONL file into Alpaca-style JSONL rows.

    Each non-blank input line must be a JSON object; its turns are read from
    the "messages" key (a list of {"role", "content"} dicts). For every record
    one row is written with the keys "system", "instruction", "input",
    "output" and "history".

    If a record contains several user or assistant messages, only the last
    one of each role is kept, and "history" is always an empty list.

    Args:
        input_file: Path of the source JSONL file.
        output_file: Path of the JSONL file to (over)write.
        system_prompt: Text for the "system" field. Defaults to the
            module-level ``prompt``.
        task: Text for the "instruction" field. Defaults to the module-level
            ``TASK``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Resolve the defaults at call time so later edits to the globals are honoured.
    if system_prompt is None:
        system_prompt = prompt
    if task is None:
        task = TASK

    with open(input_file, 'r', encoding='utf-8') as f_in, \
         open(output_file, 'w', encoding='utf-8') as f_out:

        for line in f_in:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; json.loads would raise on them.
            line = line.strip()
            if not line:
                continue

            data = json.loads(line)
            messages = data.get("messages", [])

            user = ""
            assistant = ""

            for msg in messages:
                role = msg.get("role")
                content = msg.get("content")

                if role == "user":
                    user = content
                elif role == "assistant":
                    assistant = content

            alpaca_row = {
                "system": system_prompt,
                "instruction": task,
                "input": user,
                "output": assistant,
                "history": []
            }

            f_out.write(json.dumps(alpaca_row, ensure_ascii=False) + "\n")

    print(f"✅ Converted to Alpaca format: {output_file}")

# Convert the training and evaluation files (source path -> Alpaca-format path)
for source_path, target_path in (
    ('/content/drive/MyDrive/data/all_MCQ.jsonl',
     '/content/drive/MyDrive/data/all_MCQs.jsonl'),
    ('/content/drive/MyDrive/data/evaluation.jsonl',
     '/content/drive/MyDrive/data/evaluations.jsonl'),
):
    convert_to_alpaca(source_path, target_path)

✅ Converted to Alpaca format: /content/drive/MyDrive/data/all_MCQs.jsonl
✅ Converted to Alpaca format: /content/drive/MyDrive/data/evaluations.jsonl


## Fine-tuning

In [None]:
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
!cd LLaMA-Factory && pip install -e .

In [None]:
# Keep NumPy on the 1.x series.
!pip install -U "numpy<2.0" --quiet

# Refresh the packaging tool chain before installing the ML stack.
!python -m pip install --upgrade pip setuptools wheel
# PyTorch packages taken from the CUDA 11.8 wheel index.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers accelerate datasets peft trl
# Editable install of the cloned LLaMA-Factory checkout; --no-deps leaves the
# packages installed above untouched.
!pip install -e LLaMA-Factory/. --no-deps
!pip install gradio fire tyro shtab
# Run last so datasets/transformers end up inside these version ranges,
# regardless of what the earlier installs pulled in.
!pip install -U "datasets>=2.16.0,<4.1.0"
!pip install -U "transformers>=4.49.0,<=4.57.1"

In [None]:
!pip install -U bitsandbytes

In [None]:
# LLaMA-Factory
# Register the converted train/eval files in LLaMA-Factory's dataset registry
# (/content/LLaMA-Factory/data/dataset_info.json). Doing this in code instead
# of editing the file by hand keeps the notebook runnable top to bottom, and
# re-running the cell is harmless because the same two keys are overwritten.
import json

DATASET_INFO_PATH = "/content/LLaMA-Factory/data/dataset_info.json"

# Maps LLaMA-Factory's column roles to the keys of the converted Alpaca rows.
ALPACA_COLUMNS = {
    "prompt": "instruction",
    "query": "input",
    "response": "output",
    "system": "system",
    "history": "history"
}

quiz_datasets = {
    "quiz_parser_train": {
        "file_name": "/content/drive/MyDrive/data/all_MCQs.jsonl",
        "columns": dict(ALPACA_COLUMNS)
    },
    "quiz_parser_eval": {
        "file_name": "/content/drive/MyDrive/data/evaluations.jsonl",
        "columns": dict(ALPACA_COLUMNS)
    }
}

with open(DATASET_INFO_PATH, "r", encoding="utf-8") as f:
    dataset_info = json.load(f)

# Existing registry entries are kept; only the two quiz datasets are added or replaced.
dataset_info.update(quiz_datasets)

with open(DATASET_INFO_PATH, "w", encoding="utf-8") as f:
    json.dump(dataset_info, f, indent=2, ensure_ascii=False)

print(f"Registered {sorted(quiz_datasets)} in {DATASET_INFO_PATH}")

In [None]:
# WARNING: removes the entire Hugging Face cache directory. The login cell above
# reported saving its token under /root/.cache/huggingface, so that token is
# deleted as well — hence the fresh login in the next cell.
!rm -rf ~/.cache/huggingface

In [None]:
!git config --global credential.helper store
!hf auth logout
!hf auth login --add-to-git-credential

Not logged in!

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `llamafactory-push` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved in your configured git credential helpers (store).
Your token has b

In [None]:
%%writefile /content/LLaMA-Factory/examples/train_lora/quiz_finetune.yaml

### Model
model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
trust_remote_code: true

### Method
# Supervised fine-tuning with LoRA adapters; both training and evaluation are enabled.
stage: sft
do_train: true
do_eval: true
finetuning_type: lora
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target: all

### Dataset
# Dataset names refer to the entries registered in LLaMA-Factory's data/dataset_info.json.
dataset: quiz_parser_train
eval_dataset: quiz_parser_eval
template: qwen
cutoff_len: 2048
overwrite_cache: true
preprocessing_num_workers: 4

### Training (T4 Optimized)
# Batch size 1 with 16 accumulation steps gives 16 samples per optimizer step.
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
learning_rate: 1.0e-4
num_train_epochs: 3.0
#max_steps: -1
lr_scheduler_type: cosine
warmup_ratio: 0.1
# NOTE(review): the 8-bit optimizer presumably relies on the bitsandbytes install cell — confirm.
optim: adamw_8bit
fp16: true
gradient_checkpointing: true
logging_steps: 10
# A checkpoint is written every 200 steps; save_total_limit caps how many are kept.
save_steps: 200
save_total_limit: 2

### Evaluation
# A separate eval_dataset is configured above, so no validation split is carved from the training data.
#val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 100

### Output
# Written to Google Drive; overwrite_output_dir allows reusing an existing directory.
output_dir: /content/drive/MyDrive/llm-finetuning/quiz_parser
overwrite_output_dir: true

### hf
# Pushes to a private Hub repository; requires a logged-in token with write access.
push_to_hub: true
hub_model_id: "Fadhl0/quiz-parser-qwen-1.5b"
hub_private_repo: true
hub_strategy: checkpoint

### System
ddp_timeout: 180000000
dataloader_num_workers: 2
report_to: "none"

Overwriting /content/LLaMA-Factory/examples/train_lora/quiz_finetune.yaml


In [None]:
!cd LLaMA-Factory/ && llamafactory-cli train /content/LLaMA-Factory/examples/train_lora/quiz_finetune.yaml

2025-12-22 11:42:33.670692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766403753.709342   25457 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766403753.720085   25457 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766403753.745167   25457 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766403753.745198   25457 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766403753.745202   25457 computation_placer.cc:177] computation placer alr

# Run the Model with Our Layer

In [None]:
!pip install -qU transformers
!pip install -qU json-repair
import json
import re
import json_repair
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model_id = "Qwen/Qwen2.5-1.5B-Instruct"
device = "cuda"
torch_dtype = torch.float16

def parse_json(text):
    """Parse model output into a Python object with ``json_repair``.

    Args:
        text: Raw text produced by the model.

    Returns:
        Whatever ``json_repair.loads`` returns for ``text``, or ``None`` if
        parsing raises an ordinary exception.
    """
    try:
        return json_repair.loads(text)
    except Exception:
        # Best effort: an unparsable reply is reported as None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        return None

def trim_all(text):
    """Normalize raw quiz text and split it into blank-line separated chunks.

    Normalization steps, in order: collapse runs of spaces, strip spaces
    around newlines, collapse three or more newlines into one blank line,
    replace curly quotes and en/em dashes with ASCII equivalents, and shorten
    long runs of underscores/hyphens, commas and dots.

    Args:
        text: The raw text to clean.

    Returns:
        A list of non-empty chunks (split on blank lines), or ``None`` when
        ``text`` is not a string.
    """
    try:
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r"[ ]+\n|\n[ ]+", "\n", text)
        text = re.sub(r"\n{3,}|[ ]*\n[ ]+\n", "\n\n", text)
        text = re.sub(r"‘|’", "'", text)          # ‘ ’
        text = re.sub(r"“|”", "\"", text)         # “ ”
        text = re.sub(r"[–—]", "-", text)         # – —
        text = re.sub(r"[_-]{6,}", "_____", text) # _ -
        text = re.sub(r"[‥…]", "..", text)        # ‥ …
        text = re.sub(r",{6,}", ".....", text)    # ,
        text = re.sub(r"\.{6,}", ".....", text)   # .
    except TypeError:
        # re.sub raises TypeError for non-string input (e.g. None); that is the
        # only failure the original catch-all was guarding against.
        return None
    # Drop empty chunks (empty input, leading blank lines) so callers never
    # receive a chunk with nothing in it.
    return [chunk for chunk in text.split("\n\n") if chunk.strip()]

In [None]:
# Load the base model using the dtype chosen in the setup cell (torch.float16);
# device_map="auto" leaves device placement to the library.
# NOTE(review): the cell output reports that `torch_dtype` is deprecated in favour
# of `dtype` — switch once the minimum transformers version is settled.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype = torch_dtype
)

# Tokenizer that belongs to the same base model id.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


In [None]:
# Attach the adapter stored in the training output directory to the loaded base model.
# NOTE(review): this uses the output directory root, while the merge step further down
# points at the `checkpoint-414` subfolder — confirm both hold the intended weights.
finetuned_model_id = "/content/drive/MyDrive/llm-finetuning/quiz_parser"
model.load_adapter(finetuned_model_id)

In [None]:
messages = """
1. What is the value of π (pi) approximately equal to?
A) 2.14
B) 3.14
C) 4.13
D) 3.41
Correct is 3.14.

2. Who wrote "Romeo and Juliet"?
1) Charles Dickens
2) **William Shakespeare**
3) George Orwell
4) Mark Twain



3. The square root of 144 is:
A) 10
B) 11
C) 12
D) 14
Correct Option → c

This is an insane amount of knowledge shared for free on the internet, thank you !

35. After World War I, in European countries, in the Soviet Union and Japan powerfuldictatorships emerged.
a. [highlight]True[/highlight]
b. False
36........... promised a better future to Germans that were feeling humiliated by losing theWorld War I.
a. Benito Musoolini
b. [highlight]Adolph Hitler[/highlight]
c. Nicholas II
d. Peter the Great

19. What did Voltaire not support? A) freedom of religion B) freedom of expression C) separation of church and state. **D) State control over the economy**

GEP1006 FINAL GROUP A

30) ……… was the leader during the radical phase of the Fench Revelation. [/colored]a. louis 14[/colored] b. Danton  c. henry 8 d. Maximilian Robespierre

39. which of the  following was among the consequences of World War III
A Germany was divided into four regions of administration.
B East Germany became under the Soviet control.
C the world was divided in two a bi-polar system as Eastern and Western Bloc
[highlight]D  All of answers: A , B& C[/highlight]

1. Which of the following was the capital of the Lydians?
A) Olympia ✅
B) Miletus
C) Aspendos
D) Sardis



1- Seismic survey use ...C.. sound waves to produce a a "CATScan" of Earth's subsurface.
a- refraction
b-isolated.
c-reflected.
d-refracted

من هو النبي الذي كان يملك ملك لم يملكه بقية البشر؟
أ- **سليمان عليه السلام**
ب- عيسى عليه السلام
ج- يحيى عليه السلام
د- داود عليه السلام

3-Types of seismic waves...(c)
a-Body waves
b-surface waves
c-A&B

3-Types of seismic waves... (c)
a-Body waves
b-surface waves
c-A&B

4. What is HTML means in techniqal acpact?
A- it means CSS
B- it actually HTML
C- it means HyperText Markup Language
D- there is no such thing as HTML

109) what is <b> tag refare in HTML? (A)
A) bold
B) Italic

423. What is HTML means in techniqal acpact?
A- it means CSS
B- it actually HTML
D- there is no such thing as HTML

1. Which of the following was the capital of the Lydians?
A) **Olympia**
B) **Miletus**
C) **Aspendos**
D) **Sardis**

The color of tree is green and brown. True

1. Which of the following was the capital of the Lydians?
A) **Olympia**
B) [highlight]Miletus[/highlight]
C) **Aspendos**
D) Sardis

4. What is HTML means in techniqal acpact?
A- it means CSS
B- it actually HTML
C- it means HyperText Markup Language
D- there is no such thing as HTML
Answer: B and C are the correct answer
"""

In [None]:
# Parsed questions collected by generate_resp(); reset on every run of this cell.
MCQ = []

# System prompt sent with every chunk. Built once instead of on every loop iteration.
# NOTE(review): this text differs from the `prompt` written into the training rows by
# convert_to_alpaca — confirm that the mismatch is intentional.
SYSTEM_PROMPT = "\n".join([
    "You are a quiz parser. Output JSON for questions, \"SKIP\" for non-questions.",
    "",
    "**For valid multiple-choice questions:**",
    "- Output JSON with format: {\"question\": \"text\", \"answers\": {\"1\": \"opt1\", \"2\": \"opt2\"}, \"correct\": \"1\"}",
    "- \"correct\" must be a string number matching an answer key, or \"0\" if uncertain.",
    "",
    "**Rules for parsing:**",
    "1. Extract only the question stem and options. Remove answer indicators (stars, bold, 'correct', etc.)",
    "2. Number answer options sequentially from \"1\" regardless of original labeling (a, b, 1, 2, etc.)",
    "3. If ANY uncertainty about correct answer → \"correct\": \"0\"",
    "4. If question indicates correct answer within text → remove indication but parse as normal",
    "5. If no answer is clearly marked correct → \"correct\": \"0\"",
    "6. If multiple answers appear marked → \"correct\": \"0\"",
    "7. If format is not multiple-choice (no discrete options) → \"SKIP\"",
    "8. If it's not a question (statement, explanation, etc.) → \"SKIP\"",
    "9. If the input does not contain any multiple-choice options, output \"SKIP\".",
    "10. Preserve all original formatting including tabs (\\t), spaces, and line breaks. Do not modify whitespace.",
    "",
    "**Critical:** If it is not a valid question, output \"SKIP\".",
    "**Critical:** NEVER guess or infer correct answers. Only use explicit markings from the input.",
    ""
])


def generate_resp(messages):
    """Run the model on each text chunk and collect parsed results in ``MCQ``.

    Args:
        messages: Iterable of text chunks (as produced by ``trim_all``). Each
            chunk is sent as the user turn of a separate chat.

    Side effects:
        Appends every truthy parsed reply to the module-level ``MCQ`` list;
        a reply that parses to a list is spliced in element by element.
    """
    for message in messages:
        # Nothing to parse in an empty chunk; skip the model call.
        if not message.strip():
            continue

        chat = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": message},
        ]
        text = tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True
        )

        # Follow the model's own device rather than a hard-coded one.
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # Unpacking passes input_ids together with attention_mask; passing
        # input_ids alone triggers the "attention mask is not set" warning.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=False, top_k=None, temperature=None, top_p=None,
        )

        # generate() returns prompt + completion; keep only the new tokens.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        response = parse_json(response)
        if response and response != "SKIP":
            if isinstance(response, list):
                MCQ.extend(response)
            else:
                MCQ.append(response)


# Keep the raw `messages` string intact: overwriting it with the chunk list made a
# second run of this cell feed a list to trim_all, which then returned None and
# silently skipped generation.
message_chunks = trim_all(messages)
if message_chunks is not None:
    generate_resp(message_chunks)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
json_output = json.dumps(MCQ, indent=2, ensure_ascii=False)
print(json_output)

[
  {
    "question": "What is the value of π (pi) approximately equal to?",
    "answers": {
      "1": "2.14",
      "2": "3.14",
      "3": "4.13",
      "4": "3.41"
    },
    "correct": "2"
  },
  {
    "question": "Who wrote \"Romeo and Juliet\"?",
    "answers": {
      "1": "Charles Dickens",
      "2": "William Shakespeare",
      "3": "George Orwell",
      "4": "Mark Twain"
    },
    "correct": "1"
  },
  {
    "question": "The square root of 144 is:",
    "answers": {
      "1": "10",
      "2": "11",
      "3": "12",
      "4": "14"
    },
    "correct": "3"
  },
  {
    "question": "After World War I, in European countries, in the Soviet Union and Japan powerfuldictatorships emerged.",
    "answers": {
      "1": "True",
      "2": "False"
    },
    "correct": "1"
  },
  {
    "question": "promised a better future to Germans that were feeling humiliated by losing theWorld War I.",
    "answers": {
      "1": "Benito Musoolini",
      "2": "Adolph Hitler",
      "3": "Ni

# Convert PyTorch to GGUF

### Step 1: Merging

In [None]:
!pip install transformers peft accelerate bitsandbytes

from google.colab import drive
drive.mount('/content/drive')

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Adapter checkpoint to merge (a specific checkpoint folder inside the training output directory).
adapter_path = "/content/drive/MyDrive/llm-finetuning/quiz_parser/checkpoint-414"
# Destination of the merged model and tokenizer; also the input of the GGUF conversion.
output_dir = "/content/drive/MyDrive/llm-finetuning/quiz_parser_merged"
# GGUF outputs: the f16 conversion and the Q4_K_M quantized file derived from it.
GGUF_F16 = "/content/drive/MyDrive/llm-finetuning/quiz_parser_f16.gguf"
GGUF_Q4 = "/content/drive/MyDrive/llm-finetuning/quiz_parser_q4.gguf"

# Hub id of the base model the adapter is merged into.
base_model = "Qwen/Qwen2.5-1.5B-Instruct"


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0
Mounted at /content/drive


In [None]:
fp16 = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
model = PeftModel.from_pretrained(fp16, adapter_path)
model = model.merge_and_unload()

In [None]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/tokenizer_config.json',
 '/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/special_tokens_map.json',
 '/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/chat_template.jinja',
 '/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/vocab.json',
 '/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/merges.txt',
 '/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/added_tokens.json',
 '/content/drive/MyDrive/llm-finetuning/quiz_parser_merged/tokenizer.json')

### Step 2: GGUF Process

In [None]:
import gc

# Release the model before building and running llama.cpp.
# `fp16` holds a reference to the loaded model too, so deleting only `model`
# would keep the weights alive. `base_model` is just the model-id string;
# deleting it frees nothing, so it is left in place for re-runs.
del model, fp16
gc.collect()
torch.cuda.empty_cache()

In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 74399, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 74399 (delta 71), reused 35 (delta 34), pack-reused 74279 (from 3)[K
Receiving objects: 100% (74399/74399), 270.00 MiB | 27.53 MiB/s, done.
Resolving deltas: 100% (53907/53907), done.


In [None]:
#For GPU (T4)
!cd /content/llama.cpp && rm -rf build && cmake -B build -S . -DGGML_CUDA=ON && cmake --build build -j4

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.34.1")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found ve

In [None]:
#For cpu
!apt-get update
!apt-get install -y build-essential cmake python3-pip

!cd /content/llama.cpp && \
rm -rf build && \
cmake -B build -S . && \
cmake --build build -j2

In [None]:
!pip install -q -r /content/llama.cpp/requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m129.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.6/178.6 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m112.0 MB/s[0m eta [

In [None]:
!python /content/llama.cpp/convert_hf_to_gguf.py \
    {output_dir} \
    --outfile {GGUF_F16} \
    --outtype f16

INFO:hf-to-gguf:Loading model: quiz_parser_merged
INFO:hf-to-gguf:Model architecture: Qwen2ForCausalLM
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> F16, shape = {1536, 151936}
INFO:hf-to-gguf:blk.0.attn_norm.weight,    torch.float16 --> F32, shape = {1536}
INFO:hf-to-gguf:blk.0.ffn_down.weight,     torch.float16 --> F16, shape = {8960, 1536}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,     torch.float16 --> F16, shape = {1536, 8960}
INFO:hf-to-gguf:blk.0.ffn_up.weight,       torch.float16 --> F16, shape = {1536, 8960}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,     torch.float16 --> F32, shape = {1536}
INFO:hf-to-gguf:blk.0.attn_k.bias,         torch.float16 --> F32, shape = {256}
INFO:hf-to-gguf:blk.0.attn_k.weight,       torch.float16 --> F16, shape = {1536, 256}
INFO:hf-to-gguf:blk.0.attn_output.weight,  torch.floa

### Q4_K_M (optional)

In [None]:
!cd llama.cpp && \
./build/bin/llama-quantize {GGUF_F16} {GGUF_Q4} Q4_K_M

main: build = 7579 (d77d7c5c0)
main: built with GNU 11.4.0 for Linux x86_64
main: quantizing '/content/drive/MyDrive/llm-finetuning/quiz_parser_f16.gguf' to '/content/drive/MyDrive/llm-finetuning/quiz_parser_q4.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 27 key-value pairs and 338 tensors from /content/drive/MyDrive/llm-finetuning/quiz_parser_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                     general.sampling.top_k i32              = 20
llama_model_loader: - kv   3:                     general.sampling.top_p f32              = 0.800000
llama_model_loader: - kv   4:                      general.sampling.temp f32              = 0.700000
llama_model_lo