In [1]:
import wandb

# Initialize with team/entity
wandb.init(
    project="unlp-clf-task",
    entity="bazdyrev99-igor-sikorsky-kyiv-polytechnic-institute", 
    name='gemma2-9b-baseline'
)

[34m[1mwandb[0m: Currently logged in as: [33mbazdyrev99[0m ([33mbazdyrev99-igor-sikorsky-kyiv-polytechnic-institute[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [2]:
import pandas as pd

df = pd.read_parquet('train.parquet')

ssubmission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')

In [3]:
import numpy as np


df['is_valid'] = np.random.binomial(1, 0.2, df.shape[0])

In [4]:
import json

def prompt_generator(text):
    conversation = f"""<start_of_turn>user
Ти експерт в аналізу постів в соцмережах. Тобі необхідно визначити наскільки пости містять в собі техніки маніпуляції.
Текст статті: {text}
<end_of_turn>"""
    return conversation

In [5]:
df.loc[:, 'prompt'] = df.content.apply(prompt_generator)
test.loc[:, 'prompt'] = test.content.apply(prompt_generator)

In [6]:
import torch
from tqdm.autonotebook import tqdm

tqdm.pandas()
from transformers import pipeline, AutoTokenizer

PRETRAINED_MODEL = 'google/gemma-2-9b-it'
MAX_LENGTH = 1024

tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True

df['full_text'] = df.prompt.progress_apply(
    lambda x: tokenizer.decode(tokenizer(x, add_special_tokens=False)['input_ids'][:MAX_LENGTH])
)
test['full_text'] = test.prompt.progress_apply(
    lambda x: tokenizer.decode(tokenizer(x, add_special_tokens=False)['input_ids'][:MAX_LENGTH])
)

def tokenize(sample):
    tokenized = tokenizer(sample['full_text'])
    return tokenized

  from tqdm.autonotebook import tqdm


  0%|          | 0/3822 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [7]:
ssubmission = pd.read_csv('sample_submission.csv')
targets = ssubmission.set_index('id').columns

from collections.abc import Iterable

for col in targets:
    df[col] = 0

import numpy as np
for ind, row in df.iterrows():
    if isinstance(row['techniques'], Iterable):
        for t in row['techniques']:
            df.loc[ind, t] = 1

In [8]:
df['labels'] = list(df[targets].values)

In [9]:
from datasets import Dataset

ds_train = Dataset.from_pandas(df[df.is_valid == 0][['full_text', 'labels']].copy())
ds_eval = Dataset.from_pandas(df[df.is_valid == 1][['full_text', 'labels']].copy())
ds_test = Dataset.from_pandas(test[['full_text']].copy())

ds_train = ds_train.map(tokenize)
remove_columns = [c for c in ds_train.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_train = ds_train.remove_columns(remove_columns)

ds_eval = ds_eval.map(tokenize)
remove_columns = [c for c in ds_eval.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_eval = ds_eval.remove_columns(remove_columns)

ds_test = ds_test.map(tokenize)
remove_columns = [c for c in ds_test.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_test = ds_test.remove_columns(remove_columns)

Map:   0%|          | 0/3088 [00:00<?, ? examples/s]

Map:   0%|          | 0/734 [00:00<?, ? examples/s]

Map:   0%|          | 0/5735 [00:00<?, ? examples/s]

In [10]:
from transformers import Gemma2ForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType



model = Gemma2ForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    num_labels=10,
    torch_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r=32,  # the dimension of the low-rank matrices
    lora_alpha=16, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj"]
) 

model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-9b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 59,902,976 || all params: 9,301,644,800 || trainable%: 0.6440


In [11]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [12]:
import os
from transformers import (AutoTokenizer, TrainingArguments, Trainer,
                          AutoModelForSequenceClassification, DataCollatorWithPadding)
from sklearn.metrics import f1_score
import numpy as np


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits >= 0.0
    return {"f1": f1_score(labels, predictions, average="macro")}


train_args = TrainingArguments(
    output_dir='model_checkpoints_gemma2_qlora',
    logging_dir='./model_logs_gemma2_qlora',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

In [13]:
from torch.nn import BCEWithLogitsLoss

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor((1/df[targets].mean()).tolist()).cuda()

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        logits = outputs.logits

        # Initialize BCEWithLogitsLoss with class weights
        #loss_fn = BCEWithLogitsLoss(weight=self.class_weights)
        loss_fn = BCEWithLogitsLoss()
        #print(logits[:2])
        #print(inputs['labels'][:2])
        loss = loss_fn(logits, inputs['labels'].float())

        return (loss, outputs) if return_outputs else loss

In [14]:
trainer = CustomTrainer(
    model=model, 
    args=train_args, 
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)
trainer.train()

  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss,F1
200,0.291,0.26893,0.229957
400,0.2375,0.249081,0.301925
600,0.235,0.24327,0.285104
800,0.2061,0.237464,0.348672
1000,0.2023,0.238576,0.362517


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.

KeyboardInterrupt



In [15]:
preds_val = trainer.predict(ds_eval)

In [24]:
import numpy as np
from sklearn.metrics import f1_score

def optimize_thresholds(preds, targets, num_thresholds=100):
    """
    Find optimal thresholds for each class to maximize average F1 score.
    
    Args:
        preds (ndarray): Array of shape (num_samples, num_classes) with logits.
        targets (ndarray): Array of shape (num_samples, num_classes) with binary ground-truth labels.
        num_thresholds (int): Number of thresholds to evaluate (default: 100).

    Returns:
        optimal_thresholds (list): List of optimal thresholds for each class.
        best_avg_f1 (float): Best average F1 score achieved.
    """
    num_classes = preds.shape[1]
    thresholds = np.linspace(0, 1, num_thresholds)
    optimal_thresholds = []
    best_avg_f1 = 0

    for class_idx in tqdm(range(num_classes)):
        best_f1 = 0
        best_threshold = 0
        for threshold in thresholds:
            # Binarize predictions for this class
            binarized_preds = (preds[:, class_idx] >= threshold).astype(int)
            # Calculate F1 score for this class
            f1 = f1_score(targets[:, class_idx], binarized_preds, zero_division=0)
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        optimal_thresholds.append(best_threshold)
    
    # Calculate average F1 score across all classes using optimal thresholds
    binarized_preds = (preds >= np.array(optimal_thresholds)).astype(int)
    avg_f1 = f1_score(targets, binarized_preds, average='macro', zero_division=0)

    return optimal_thresholds, avg_f1

In [56]:
optimal_thresholds, best_avg_f1 = optimize_thresholds(
    torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy(), preds_val.label_ids)

  0%|          | 0/10 [00:00<?, ?it/s]

In [57]:
best_avg_f1

0.5161763391803001

In [61]:
optimal_thresholds_notnull = np.array(optimal_thresholds) + 1e-6

In [62]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy() >= np.array(optimal_thresholds_notnull)).astype(int)

In [63]:
f1_score(preds_val.label_ids, binarized_preds, average='macro', zero_division=0)

0.5161763391803001

In [35]:
preds_test = trainer.predict(ds_test)

In [36]:
binarized_preds = (preds_test.predictions >= np.array(optimal_thresholds_notnull)).astype(int)

In [46]:
for ind, col in enumerate(ssubmission.columns[1:]):
    ssubmission[col] = binarized_preds[:, ind]

In [48]:
ssubmission.to_csv('submissions/gemma2-9b-cv0.374.csv', index=False)

In [64]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_test.predictions)).numpy() >= np.array(optimal_thresholds_notnull)).astype(int)

In [65]:
for ind, col in enumerate(ssubmission.columns[1:]):
    ssubmission[col] = binarized_preds[:, ind]

In [66]:
ssubmission.to_csv('submissions/gemma2-9b-cv0.516.csv', index=False)

In [68]:
import numpy as np
from scipy.optimize import minimize_scalar

def find_thresholds_for_distribution(preds, desired_distribution):
    """
    Find thresholds for each class to achieve the desired class distribution.

    Args:
        preds (ndarray): Array of shape (num_samples, num_classes) with probabilities (after sigmoid).
        desired_distribution (list): Desired proportion of positive samples for each class.

    Returns:
        thresholds (list): List of thresholds for each class.
    """
    num_classes = preds.shape[1]
    thresholds = []

    for class_idx in range(num_classes):
        probs = preds[:, class_idx]
        desired_ratio = desired_distribution[class_idx]

        # Function to minimize the difference between actual and desired positive ratios
        def objective(threshold):
            predicted_ratio = (probs >= threshold).mean()
            return abs(predicted_ratio - desired_ratio)

        # Find the threshold using optimization
        result = minimize_scalar(objective, bounds=(0, 1), method="bounded")
        thresholds.append(result.x)

    return thresholds

In [72]:
df[targets].mean().values

array([0.03610675, 0.07849294, 0.1007326 , 0.04107797, 0.04133961,
       0.51622187, 0.12637363, 0.12087912, 0.13396128, 0.12114076])

In [73]:
optimal_thresholds = find_thresholds_for_distribution(
    torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy(), desired_distribution=df[targets].mean().values)

In [74]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_val.predictions)).numpy() >= np.array(optimal_thresholds)).astype(int)

In [75]:
f1_score(preds_val.label_ids, binarized_preds, average='macro', zero_division=0)

0.48029719522515724

In [80]:
binarized_preds = (torch.nn.functional.sigmoid(
        torch.tensor(preds_test.predictions)).numpy() >= np.array(optimal_thresholds)).astype(int)

In [81]:
binarized_preds.mean(axis=0)

array([0.03417611, 0.08212729, 0.10479512, 0.03452485, 0.04568439,
       0.52258065, 0.11647777, 0.10584133, 0.13879686, 0.09816914])

In [82]:
for ind, col in enumerate(ssubmission.columns[1:]):
    ssubmission[col] = binarized_preds[:, ind]

In [83]:
ssubmission.to_csv('submissions/gemma2-9b-cv0.48.csv', index=False)

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 5),
    lowercase=True,
    min_df=5,
    max_features=50_000
)

tfidf.fit(df.content)

In [86]:
X_train = tfidf.transform(df[df.is_valid==0].content)
X_val = tfidf.transform(df[df.is_valid==1].content)
X_test = tfidf.transform(test.content)

In [87]:
X_train

<3088x13282 sparse matrix of type '<class 'numpy.float64'>'
	with 169010 stored elements in Compressed Sparse Row format>

In [91]:
y_train = df[df.is_valid==0][targets].values
y_val = df[df.is_valid==0][targets].values

In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

model = OneVsRestClassifier(LogisticRegression(C=10.0, max_iter=1000, random_state=42))

# Fit the model
model.fit(X_train, y_train)

In [129]:
y_hat_proba = model.predict_proba(X_val)

In [130]:
optimal_thresholds = find_thresholds_for_distribution(
    y_hat_proba, desired_distribution=df[targets].mean().values)

In [131]:
binarized_preds = (y_hat_proba >= np.array(optimal_thresholds)).astype(int)

In [132]:
f1_score(preds_val.label_ids, binarized_preds, average='macro', zero_division=0)

0.3310499318036769

In [133]:
y_hat_proba = model.predict_proba(X_test)

In [134]:
binarized_preds = (y_hat_proba >= np.array(optimal_thresholds)).astype(int)

In [136]:
for ind, col in enumerate(ssubmission.columns[1:]):
    ssubmission[col] = binarized_preds[:, ind]

In [137]:
ssubmission.to_csv('submissions/logreg-baseline-cv0.33.csv', index=False)