In [1]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.4


In [None]:
import wandb

# Initialize with team/entity
wandb.init(
    project="unlp-clf-task",
    entity="shah1st-work-ua-igor-sikorsky-kyiv-polytechnic-institute", 
    name='gemma2-2b-baseline-translated',
    #init_timeout=240
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshah1st-work-ua[0m ([33mshah1st-work-ua-igor-sikorsky-kyiv-polytechnic-institute[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
import pandas as pd

df = pd.read_parquet('/kaggle/input/translated-train-unlp-2025/translated_train.parquet')
cv = pd.read_csv('/kaggle/input/translated-train-unlp-2025/cv_split.csv')
df = df.merge(cv, on='id', how='left')

test = pd.read_csv('/kaggle/input/translated-train-unlp-2025/translated_test.csv')

In [4]:
import numpy as np

df['is_valid'] = df.fold == 4

In [5]:
import json

def prompt_generator(text):
    conversation = f"""<start_of_turn>user
You are an expert in analyzing social media posts. You need to determine the extent to which posts contain manipulation techniques.
The text of the article: {text}
<end_of_turn>"""
    return conversation

In [6]:
df.loc[:, 'prompt'] = df.translated_content.apply(prompt_generator)
test.loc[:, 'prompt'] = test.translated_content.apply(prompt_generator)

In [7]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from tqdm.autonotebook import tqdm

tqdm.pandas()
from transformers import pipeline, AutoTokenizer

PRETRAINED_MODEL = 'google/gemma-2-2b-it'
MAX_LENGTH = 2048

tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True

df['full_text'] = df.prompt.progress_apply(
    lambda x: tokenizer.decode(tokenizer(x, add_special_tokens=False)['input_ids'][:MAX_LENGTH])
)
test['full_text'] = test.prompt.progress_apply(
    lambda x: tokenizer.decode(tokenizer(x, add_special_tokens=False)['input_ids'][:MAX_LENGTH])
)

def tokenize(sample):
    tokenized = tokenizer(sample['full_text'])
    return tokenized

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

  0%|          | 0/3822 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [10]:
ssubmission = pd.read_csv('/kaggle/input/translated-train-unlp-2025/sample_submission.csv')
targets = ssubmission.set_index('id').columns

from collections.abc import Iterable

for col in targets:
    df[col] = 0

import numpy as np
for ind, row in df.iterrows():
    if isinstance(row['techniques'], Iterable):
        for t in row['techniques']:
            df.loc[ind, t] = 1

In [11]:
df['labels'] = list(df[targets].values)

In [12]:
from datasets import Dataset

ds_train = Dataset.from_pandas(df[df.is_valid == 0][['full_text', 'labels']].copy())
ds_eval = Dataset.from_pandas(df[df.is_valid == 1][['full_text', 'labels']].copy())
ds_test = Dataset.from_pandas(test[['full_text']].copy())

ds_train = ds_train.map(tokenize)
remove_columns = [c for c in ds_train.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_train = ds_train.remove_columns(remove_columns)

ds_eval = ds_eval.map(tokenize)
remove_columns = [c for c in ds_eval.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_eval = ds_eval.remove_columns(remove_columns)

ds_test = ds_test.map(tokenize)
remove_columns = [c for c in ds_test.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_test = ds_test.remove_columns(remove_columns)

Map:   0%|          | 0/3058 [00:00<?, ? examples/s]

Map:   0%|          | 0/764 [00:00<?, ? examples/s]

Map:   0%|          | 0/5735 [00:00<?, ? examples/s]

In [13]:
import torch
from transformers import Gemma2ForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.float16
)

model = Gemma2ForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=10,
    torch_dtype=torch.float16, ## nobf
    device_map="cuda:0",
    quantization_config=nf4_config
)

lora_config = LoraConfig(
    r=32,  # the dimension of the low-rank matrices
    lora_alpha=16, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj"]
) 

model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()



config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 22,387,200 || all params: 2,636,752,128 || trainable%: 0.8490


In [14]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [15]:
import numpy as np
from scipy.optimize import minimize_scalar

def find_thresholds_for_distribution(preds, desired_distribution):
    """
    Find thresholds for each class to achieve the desired class distribution.

    Args:
        preds (ndarray): Array of shape (num_samples, num_classes) with probabilities (after sigmoid).
        desired_distribution (list): Desired proportion of positive samples for each class.

    Returns:
        thresholds (list): List of thresholds for each class.
    """
    num_classes = preds.shape[1]
    thresholds = []

    for class_idx in range(num_classes):
        probs = preds[:, class_idx]
        desired_ratio = desired_distribution[class_idx]

        # Function to minimize the difference between actual and desired positive ratios
        def objective(threshold):
            predicted_ratio = (probs >= threshold).mean()
            return abs(predicted_ratio - desired_ratio)

        # Find the threshold using optimization
        result = minimize_scalar(objective, bounds=(0, 1), method="bounded")
        thresholds.append(result.x)

    return thresholds
df[targets].mean().values

array([0.03610675, 0.07849294, 0.1007326 , 0.04107797, 0.04133961,
       0.51622187, 0.12637363, 0.12087912, 0.13396128, 0.12114076])

In [16]:
import os
from transformers import (AutoTokenizer, TrainingArguments, Trainer,
                          AutoModelForSequenceClassification, DataCollatorWithPadding)
from sklearn.metrics import f1_score
import numpy as np


TARGET_DISTRIBUTION = df[targets].mean().values


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    proba = torch.nn.functional.sigmoid(torch.tensor(logits)).numpy()
    optimal_thresholds = find_thresholds_for_distribution(
        proba, desired_distribution=TARGET_DISTRIBUTION
    )
    binarized_preds = (proba >= np.array(optimal_thresholds)).astype(int)

    return {"f1": f1_score(labels, binarized_preds, average="macro")}


train_args = TrainingArguments(
    output_dir='model_checkpoints_gemma2_qlora_tholds',
    logging_dir='./model_logs_gemma2_qlora_tholds',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    #bf16=False, ## NO SUPPORT 
    #gradient_checkpointing=True,
    report_to="wandb",
    optim='adamw_8bit',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    save_steps=200,
    save_total_limit=10,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

In [17]:
from torch.nn import BCEWithLogitsLoss

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor((1/df[targets].mean()).tolist()).cuda()

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        logits = outputs.logits

        # Initialize BCEWithLogitsLoss with class weights
        #loss_fn = BCEWithLogitsLoss(weight=self.class_weights)
        loss_fn = BCEWithLogitsLoss()
        #print(logits[:2])
        #print(inputs['labels'][:2])
        loss = loss_fn(logits, inputs['labels'].float())

        return (loss, outputs) if return_outputs else loss

In [18]:
trainer = CustomTrainer(
    model=model, 
    args=train_args, 
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)
trainer.train()

  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss,F1
200,0.2777,0.297964,0.241201
400,0.2347,0.273063,0.339056
600,0.2697,0.262036,0.313778
800,0.2329,0.253994,0.361396
1000,0.2612,0.252486,0.413346


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


TrainOutput(global_step=1146, training_loss=0.26924921070301927, metrics={'train_runtime': 4495.3411, 'train_samples_per_second': 2.041, 'train_steps_per_second': 0.255, 'total_flos': 2.045182098306816e+16, 'train_loss': 0.26924921070301927, 'epoch': 2.9941137998691953})

In [19]:
preds_val = trainer.predict(ds_eval)

In [20]:
import numpy as np
from sklearn.metrics import f1_score

def optimize_thresholds(preds, targets, num_thresholds=100):
    """
    Find optimal thresholds for each class to maximize average F1 score.
    
    Args:
        preds (ndarray): Array of shape (num_samples, num_classes) with logits.
        targets (ndarray): Array of shape (num_samples, num_classes) with binary ground-truth labels.
        num_thresholds (int): Number of thresholds to evaluate (default: 100).

    Returns:
        optimal_thresholds (list): List of optimal thresholds for each class.
        best_avg_f1 (float): Best average F1 score achieved.
    """
    num_classes = preds.shape[1]
    thresholds = np.linspace(0, 1, num_thresholds)
    optimal_thresholds = []
    best_avg_f1 = 0

    for class_idx in tqdm(range(num_classes)):
        best_f1 = 0
        best_threshold = 0
        for threshold in thresholds:
            # Binarize predictions for this class
            binarized_preds = (preds[:, class_idx] >= threshold).astype(int)
            # Calculate F1 score for this class
            f1 = f1_score(targets[:, class_idx], binarized_preds, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        optimal_thresholds.append(best_threshold)
    
    # Calculate average F1 score across all classes using optimal thresholds
    binarized_preds = (preds >= np.array(optimal_thresholds)).astype(int)
    avg_f1 = f1_score(targets, binarized_preds, average='macro', zero_division=0)

    return optimal_thresholds, avg_f1

In [21]:
optimal_thresholds, best_avg_f1 = optimize_thresholds(
    torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy(), preds_val.label_ids)

  0%|          | 0/10 [00:00<?, ?it/s]

In [22]:
best_avg_f1

0.46218882500187747

In [23]:
optimal_thresholds_notnull = np.array(optimal_thresholds) + 1e-6

In [24]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy() >= np.array(optimal_thresholds_notnull)).astype(int)

In [25]:
f1_score(preds_val.label_ids, binarized_preds, average='macro', zero_division=0)

0.46218882500187747

In [26]:
preds_test = trainer.predict(ds_test)

In [27]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_test.predictions)).numpy() >= np.array(optimal_thresholds_notnull)).astype(int)

In [28]:
for ind, col in enumerate(ssubmission.columns[1:]):
    ssubmission[col] = binarized_preds[:, ind]

In [31]:
ssubmission.to_csv('/kaggle/working/gemma2-2b-cv0.462.csv', index=False)

In [32]:
import numpy as np
from scipy.optimize import minimize_scalar

def find_thresholds_for_distribution(preds, desired_distribution):
    """
    Find thresholds for each class to achieve the desired class distribution.

    Args:
        preds (ndarray): Array of shape (num_samples, num_classes) with probabilities (after sigmoid).
        desired_distribution (list): Desired proportion of positive samples for each class.

    Returns:
        thresholds (list): List of thresholds for each class.
    """
    num_classes = preds.shape[1]
    thresholds = []

    for class_idx in range(num_classes):
        probs = preds[:, class_idx]
        desired_ratio = desired_distribution[class_idx]

        # Function to minimize the difference between actual and desired positive ratios
        def objective(threshold):
            predicted_ratio = (probs >= threshold).mean()
            return abs(predicted_ratio - desired_ratio)

        # Find the threshold using optimization
        result = minimize_scalar(objective, bounds=(0, 1), method="bounded")
        thresholds.append(result.x)

    return thresholds

In [33]:
df[targets].mean().values

array([0.03610675, 0.07849294, 0.1007326 , 0.04107797, 0.04133961,
       0.51622187, 0.12637363, 0.12087912, 0.13396128, 0.12114076])

In [34]:
optimal_thresholds = find_thresholds_for_distribution(
    torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy(), desired_distribution=df[targets].mean().values)

In [35]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy() >= np.array(optimal_thresholds)).astype(int)

In [36]:
f1_score(preds_val.label_ids, binarized_preds, average='macro', zero_division=0)

0.41334614420016724

In [37]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_test.predictions)).numpy() >= np.array(optimal_thresholds)).astype(int)

In [38]:
binarized_preds.mean(axis=0)

array([0.03156059, 0.08387097, 0.08666085, 0.        , 0.05858762,
       0.52850915, 0.12327812, 0.12258065, 0.13077594, 0.10549259])

In [39]:
for ind, col in enumerate(ssubmission.columns[1:]):
    ssubmission[col] = binarized_preds[:, ind]

In [40]:
ssubmission.to_csv('/kaggle/working/gemma2-2b-cv0.413.csv', index=False)

In [42]:
import shutil
shutil.make_archive('Gemma2b-translation-ckpt1000', 'zip', '/kaggle/working/model_checkpoints_gemma2_qlora_tholds/checkpoint-1000')


'/kaggle/working/Gemma2b-translation-ckpt1000.zip'