In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [6]:
import torch

In [7]:
model_path = "/content/drive/MyDrive/HuggingFace_Model/Qwen1.5-0.5B-Chat"

In [8]:
import os
print("Files in folder:")
for file in sorted(os.listdir(model_path)):
    print("  ", file)

Files in folder:
   config.json
   generation_config.json
   merges.txt
   model.safetensors
   special_tokens_map.json
   tokenizer.json
   tokenizer_config.json
   vocab.json


In [9]:
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
model

`torch_dtype` is deprecated! Use `dtype` instead!


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer

Qwen2TokenizerFast(name_or_path='/content/drive/MyDrive/HuggingFace_Model/Qwen1.5-0.5B-Chat', vocab_size=151643, model_max_length=32768, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [11]:
# Use the Qwen chat template correctly
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Hello, who are you?"}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate response
outputs = model.generate(**inputs, max_new_tokens=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)

system
You are a helpful AI assistant.
user
Hello, who are you?
assistant
I am a large language model created by Alibaba Cloud. I can answer questions, provide information and engage in conversations. How may I assist you today?


In [12]:
from datasets import load_dataset

In [13]:
data_set_path = "/content/drive/MyDrive/Fine_Tuning_Data/people_data.json"
raw_data = load_dataset("json", data_files=data_set_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [14]:
def flatten(example):
    resp = example["response"]
    completion = f"name: {resp['name']}, age: {resp['age']}, job: {resp['job']}, gender: {resp['gender']}"
    return {
        "prompt": example["prompt"],
        "completion": completion
    }

processed = raw_data["train"].map(flatten, remove_columns=raw_data["train"].column_names)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [15]:
dataset = processed.train_test_split(test_size=0.25, seed=42)

In [16]:
print("Train example:")
print(dataset["train"][1])

print("\nTest example:")
print(dataset["test"][1])

Train example:
{'prompt': 'Within an echoing cathedral, Zoey, currently 39 years old builds a career as a lawyer. She finds peace in practicing Japanese calligraphy in quiet solitude.', 'completion': 'name: Zoey, age: 39, job: lawyer, gender: female'}

Test example:
{'prompt': 'Within an echoing cathedral, Chloe spends weekdays serving as a architect. She dedicates evenings to writing interactive fiction in quiet solitude.', 'completion': 'name: Chloe, age: , job: architect, gender: female'}


In [17]:
print("Train size:", len(dataset["train"]))
print("Test size:", len(dataset["test"]))

Train size: 225
Test size: 75


In [18]:
dataset['train']['prompt'][1]

'Within an echoing cathedral, Zoey, currently 39 years old builds a career as a lawyer. She finds peace in practicing Japanese calligraphy in quiet solitude.'

In [19]:
text = dataset['train']['prompt'][1] + "\n" + dataset['train']['completion'][1]
print(text)


Within an echoing cathedral, Zoey, currently 39 years old builds a career as a lawyer. She finds peace in practicing Japanese calligraphy in quiet solitude.
name: Zoey, age: 39, job: lawyer, gender: female


In [20]:
tokens = tokenizer(
    text,
    max_length=128,
    truncation=True,
    padding="max_length"
)
tokens

{'input_ids': [41961, 458, 93724, 79150, 11, 44803, 1195, 11, 5023, 220, 18, 24, 1635, 2310, 22111, 264, 6931, 438, 264, 15417, 13, 2932, 13719, 8919, 304, 35566, 10769, 1618, 92070, 304, 11340, 98444, 624, 606, 25, 44803, 1195, 11, 4231, 25, 220, 18, 24, 11, 2618, 25, 15417, 11, 9825, 25, 8778, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

def preprocess(sample):
    sample = sample["prompt"] + "\n" + sample["completion"]

    tokenized = tokenizer(
        sample,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

data = dataset.map(preprocess)

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

In [22]:
print(data["train"][0])

{'prompt': 'While sipping coffee at a corner café, Yael spends weekdays serving as a coach. In recent months, she has taken up studying distant galaxies as a hobby.', 'completion': 'name: Yael, age: , job: coach, gender: female', 'input_ids': [7983, 274, 5654, 10799, 518, 264, 9131, 51950, 11, 809, 5891, 37102, 71995, 13480, 438, 264, 7247, 13, 758, 3213, 3951, 11, 1340, 702, 4429, 705, 20956, 28727, 64917, 438, 264, 31528, 624, 606, 25, 809, 5891, 11, 4231, 25, 1154, 2618, 25, 7247, 11, 9825, 25, 8778, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151

In [47]:
print(data["test"][15])

{'prompt': 'On a ferry crossing the bay, earns a living as a economist. A surprising fact: she has been meditatinging on the beach at sunrise since last year.', 'completion': 'name: , age: , job: economist, gender: female', 'input_ids': [1925, 264, 51550, 26638, 279, 22708, 11, 63759, 264, 5382, 438, 264, 45115, 13, 362, 14861, 2097, 25, 1340, 702, 1012, 1774, 49544, 287, 389, 279, 11321, 518, 63819, 2474, 1537, 1042, 624, 606, 25, 1154, 4231, 25, 1154, 2618, 25, 45115, 11, 9825, 25, 8778, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 1

In [23]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

In [24]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16
)


In [25]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.01
)


In [26]:
model = get_peft_model(model, lora_config)

In [27]:
model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 465,167,360 || trainable%: 0.2536


In [30]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [31]:
from transformers import TrainingArguments, Trainer
import evaluate

In [42]:
training_args = TrainingArguments(
    num_train_epochs=7,
    learning_rate=0.001,
    #per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,
    #evaluation_strategy="epoch",
    #save_strategy="epoch",
    logging_steps=25
    #output_dir="./results",
    #fp16=True
)

In [43]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    true = labels[labels != -100]
    pred = preds[labels != -100]
    return metric.compute(predictions=pred, references=true)

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    compute_metrics=compute_metrics
)

trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdoragabsh10[0m ([33mabdoragabsh10-colab[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
25,1.3943
50,0.4536
75,0.3454
100,0.2764
125,0.2434
150,0.2287
175,0.2115
200,0.1957


TrainOutput(global_step=203, training_loss=0.41528716198916504, metrics={'train_runtime': 57.8258, 'train_samples_per_second': 27.237, 'train_steps_per_second': 3.511, 'total_flos': 374473890201600.0, 'train_loss': 0.41528716198916504, 'epoch': 7.0})

In [45]:
eval_results = trainer.evaluate()
print("Eval results:", eval_results)


Eval results: {'eval_loss': 0.2685545086860657, 'eval_accuracy': 0.61, 'eval_runtime': 11.4339, 'eval_samples_per_second': 6.559, 'eval_steps_per_second': 0.875, 'epoch': 7.0}


In [52]:
prompt = "Under the scorching summer sun, Maya works as a biologist. She is known among friends for learning sign language in quiet solitude."

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128)
print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))


Generated: Under the scorching summer sun, Maya works as a biologist. She is known among friends for learning sign language in quiet solitude. Whenever possible, she can be found meditating on the beach at sunrise.
name: Maya, age: , job: biologist, gender: female


In [51]:
print(data["test"][17])

{'prompt': 'Under the scorching summer sun, Maya works as a biologist. She is known among friends for learning sign language in quiet solitude.', 'completion': 'name: Maya, age: , job: biologist, gender: female', 'input_ids': [16250, 279, 1136, 21584, 287, 7324, 7015, 11, 50344, 4278, 438, 264, 87604, 13, 2932, 374, 3881, 4221, 4780, 369, 6832, 1841, 4128, 304, 11340, 98444, 624, 606, 25, 50344, 11, 4231, 25, 1154, 2618, 25, 87604, 11, 9825, 25, 8778, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 

In [55]:
from tqdm import tqdm

def evaluate_model(model, tokenizer, dataset, num_samples=20, max_new_tokens=64):
    correct = 0
    total = 0

    for i in tqdm(range(num_samples)):
        example = dataset[i]
        prompt = example["prompt"]
        true_completion = example["completion"]

        # توليد من الموديل
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # نجيب آخر جزء (اللي فيه completion)
        if "name:" in pred:
            pred_completion = pred.split("name:")[-1].strip()
            pred_completion = "name: " + pred_completion
        else:
            pred_completion = pred.strip()

        # مقارنة (بسيطة: string match)
        if pred_completion == true_completion:
            correct += 1
        total += 1

        print(f"\nPrompt: {prompt}")
        print(f"True: {true_completion}")
        print(f"Pred: {pred_completion}")

    acc = correct / total if total > 0 else 0
    print(f"\n✅ Accuracy on {num_samples} samples = {acc:.2f}")
    return acc


In [56]:
acc = evaluate_model(model, tokenizer, dataset["test"], num_samples=10)


 10%|█         | 1/10 [00:01<00:14,  1.66s/it]


Prompt: Beneath the starlit sky, Frida, at the age of 57 spends weekdays serving as a pharmacist. She dedicates evenings to coding retro video games in quiet solitude.
True: name: Frida, age: 57, job: pharmacist, gender: female
Pred: name: Frida, age: 57, job:药师, gender: female


 20%|██        | 2/10 [00:03<00:11,  1.50s/it]


Prompt: Within an echoing cathedral, Chloe spends weekdays serving as a architect. She dedicates evenings to writing interactive fiction in quiet solitude.
True: name: Chloe, age: , job: architect, gender: female
Pred: name: Chloe, age: , job: architect, gender: female


 30%|███       | 3/10 [00:04<00:10,  1.48s/it]


Prompt: Within the concrete jungle, Jamie, aged 45 works as a pilot. Loves brewinging artisanal coffee at home during free moments.
True: name: Jamie, age: 45, job: pilot, gender: 
Pred: name: Jamie, age: 45, job: pilot, gender: male


 40%|████      | 4/10 [00:06<00:09,  1.50s/it]


Prompt: In a neon-lit arcade, Rosa, currently 65 years old has been employed as a art curator. In recent months, she has taken up volunteering as a math tutor as a hobby.
True: name: Rosa, age: 65, job: art curator, gender: female
Pred: name: Rosa, age: 65, job: art curator, gender: female


 50%|█████     | 5/10 [00:07<00:07,  1.45s/it]


Prompt: During a long-haul flight, Paula spends weekdays serving as a biologist. She dedicates evenings to designing family board games in quiet solitude.
True: name: Paula, age: , job: biologist, gender: female
Pred: name: Paula, age: , job: biologist, gender: female


 60%|██████    | 6/10 [00:08<00:05,  1.43s/it]


Prompt: By a roaring fireplace, Viktor, a 73-year-old. He enjoys collecting rare postage stamps.
True: name: Viktor, age: 73, job: , gender: male
Pred: name: Viktor, age: 73, job: , gender: male


 70%|███████   | 7/10 [00:10<00:04,  1.54s/it]


Prompt: Inside the old library, Delphine, currently 71 years old makes ends meet working as a policy analyst. She finds peace in collecting rare postage stamps in quiet solitude.
True: name: Delphine, age: 71, job: policy analyst, gender: female
Pred: name: Delphine, age: 71, job: policy analyst, gender: female


 80%|████████  | 8/10 [00:11<00:03,  1.51s/it]


Prompt: Inside a futuristic lab, Charlie, at the age of 77 has been employed as a musician. Loves exploringing virtual reality art galleries during free moments.
True: name: Charlie, age: 77, job: musician, gender: 
Pred: name: Charlie, age: 77, job: artist, gender: male


 90%|█████████ | 9/10 [00:13<00:01,  1.45s/it]


Prompt: Inside the old library, currently 31 years old makes ends meet working as a public relations officer. He enjoys writing interactive fiction.
True: name: , age: 31, job: public relations officer, gender: male
Pred: name: , age: 31, job: public relations officer, gender: male


100%|██████████| 10/10 [00:14<00:00,  1.42s/it]


Prompt: On the crowded subway platform, Maya, aged 28 earns a living as a plumber. She often spends weekends buildinging model airplanes.
True: name: Maya, age: 28, job: plumber, gender: female
Pred: name: Maya, age: 28, job: plumber, gender: female

✅ Accuracy on 10 samples = 0.70



