<a href="https://colab.research.google.com/github/Avichay3/final_project_eyal/blob/main/basic_model_try.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers accelerate peft datasets evaluate
!pip install -q scikit-learn pandas numpy


In [2]:
import os
os.environ["WANDB_PROJECT"] = "cmv-lora"
os.environ["WANDB_DISABLED"] = "false"

import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split

device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

In [3]:
from google.colab import files
uploaded = files.upload()

file_name = list(uploaded.keys())[0]

import json
data = [json.loads(line) for line in open(file_name, 'r')]

pairs = []
for sample in data:
    op = sample.get("op_text", "")
    # positive
    if isinstance(sample.get("positive"), dict):
        for c in sample["positive"]["comments"]:
            body = c.get("body", "")
            if body:
                pairs.append({"text": op + " [SEP] " + body, "label": 1})
    # negative
    if isinstance(sample.get("negative"), dict):
        for c in sample["negative"]["comments"]:
            body = c.get("body", "")
            if body:
                pairs.append({"text": op + " [SEP] " + body, "label": 0})

df = pd.DataFrame(pairs).dropna().reset_index(drop=True)
print("Total samples:", len(df))
df.head()


Saving heldout_pair_data.jsonlist to heldout_pair_data (1).jsonlist
Total samples: 2142


Unnamed: 0,text,label
0,I'll start off by saying I'm a vegetarian and ...,1
1,I'll start off by saying I'm a vegetarian and ...,0
2,I'll start off by saying I'm a vegetarian and ...,1
3,I'll start off by saying I'm a vegetarian and ...,0
4,"Hello, I'm Luke and for the longest time a sma...",1


In [4]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)


In [5]:
from transformers import AutoTokenizer

model_name = "Qwen/Qwen3-Embedding-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 4096
tokenizer.model_max_length = max_length

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding=False
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1713 [00:00<?, ? examples/s]

Map:   0%|          | 0/429 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModel

base_model = AutoModel.from_pretrained(
    model_name,
    device_map="cpu",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

print("Model loaded on CPU safely.")


`torch_dtype` is deprecated! Use `dtype` instead!


Model loaded on CPU safely.


In [7]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    task_type="FEATURE_EXTRACTION"
)

model = get_peft_model(base_model, lora_config)


model = model.to("cuda")

model.print_trainable_parameters()


trainable params: 4,587,520 || all params: 600,364,032 || trainable%: 0.7641


In [8]:
import torch.nn as nn

hidden_size = base_model.config.hidden_size

class QwenForCMV(nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        last_hidden = out.last_hidden_state

        mask = attention_mask.unsqueeze(-1)
        pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)

        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return {"loss": loss, "logits": logits}

full_model = QwenForCMV(model).to("cuda")


In [9]:
from transformers import DataCollatorWithPadding
import evaluate

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric_acc = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return metric_acc.compute(predictions=preds, references=labels)


In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./cmv_qwen_lora",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb",
)


In [11]:
from transformers import Trainer

trainer = Trainer(
    model=full_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33maviaimzina123456[0m ([33maviaimzina123456-ariel-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 298.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 16.12 MiB is free. Process 89432 has 14.72 GiB memory in use. Of the allocated memory 14.15 GiB is allocated by PyTorch, and 443.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)