In [1]:
import wandb

# Initialize with team/entity
wandb.init(project="unlp-clf-task", entity="bazdyrev99-igor-sikorsky-kyiv-polytechnic-institute")

[34m[1mwandb[0m: Currently logged in as: [33mbazdyrev99[0m ([33mbazdyrev99-igor-sikorsky-kyiv-polytechnic-institute[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [2]:
import pandas as pd

df = pd.read_parquet('train.parquet')

ssubmission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')

In [3]:
import numpy as np


df['is_valid'] = np.random.binomial(1, 0.2, df.shape[0])

In [4]:
import json

def prompt_generator(text):
    conversation = text
    return conversation

In [5]:
df.loc[:, 'prompt'] = df.content.apply(prompt_generator)
test.loc[:, 'prompt'] = test.content.apply(prompt_generator)

In [6]:
import torch
from tqdm.autonotebook import tqdm

tqdm.pandas()
from transformers import pipeline, AutoTokenizer

PRETRAINED_MODEL = 'bert-base-multilingual-cased'
MAX_LENGTH = 500

tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True

df['full_text'] = df.prompt.progress_apply(
    lambda x: tokenizer.decode(tokenizer(x, add_special_tokens=False)['input_ids'][:MAX_LENGTH])
)
test['full_text'] = test.prompt.progress_apply(
    lambda x: tokenizer.decode(tokenizer(x, add_special_tokens=False)['input_ids'][:MAX_LENGTH])
)

def tokenize(sample):
    tokenized = tokenizer(sample['full_text'])
    return tokenized

  from tqdm.autonotebook import tqdm


  0%|          | 0/3822 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/5735 [00:00<?, ?it/s]

In [7]:
ssubmission = pd.read_csv('sample_submission.csv')
targets = ssubmission.set_index('id').columns

from collections.abc import Iterable

for col in targets:
    df[col] = 0

import numpy as np
for ind, row in df.iterrows():
    if isinstance(row['techniques'], Iterable):
        for t in row['techniques']:
            df.loc[ind, t] = 1

In [8]:
df['labels'] = list(df[targets].values)

In [9]:
from datasets import Dataset

ds_train = Dataset.from_pandas(df[df.is_valid == 0][['full_text', 'labels']].copy())
ds_eval = Dataset.from_pandas(df[df.is_valid == 1][['full_text', 'labels']].copy())
ds_test = Dataset.from_pandas(test[['full_text']].copy())

ds_train = ds_train.map(tokenize)
remove_columns = [c for c in ds_train.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_train = ds_train.remove_columns(remove_columns)

ds_eval = ds_eval.map(tokenize)
remove_columns = [c for c in ds_eval.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_eval = ds_eval.remove_columns(remove_columns)

ds_test = ds_test.map(tokenize)
remove_columns = [c for c in ds_test.features.keys() if c not in ['input_ids', 'attention_mask', 'labels']]
ds_test = ds_test.remove_columns(remove_columns)

Map:   0%|          | 0/3062 [00:00<?, ? examples/s]

Map:   0%|          | 0/760 [00:00<?, ? examples/s]

Map:   0%|          | 0/5735 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType



model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=10
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [12]:
import os
from transformers import (AutoTokenizer, TrainingArguments, Trainer,
                          AutoModelForSequenceClassification, DataCollatorWithPadding)
from sklearn.metrics import f1_score
import numpy as np


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits >= 0.0
    return {"f1": f1_score(labels, predictions, average="macro")}

os.environ["WANDB_PROJECT"]="unlp-clf-task"
os.environ["WANDB_LOG_MODEL"]="false"
os.environ["WANDB_WATCH"]="false"

train_args = TrainingArguments(
    output_dir='model_checkpoints_bert_base',
    logging_dir='./model_logs_bert_base',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    #bf16=True,
    report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    metric_for_best_model='f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

In [13]:
from torch.nn import BCEWithLogitsLoss

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor((1/df[targets].mean()).tolist()).cuda()

    def compute_loss(self, model, inputs, return_outputs=False, *args, **kwargs):
        outputs = model(inputs['input_ids'], inputs['attention_mask'])
        logits = outputs.logits

        # Initialize BCEWithLogitsLoss with class weights
        #loss_fn = BCEWithLogitsLoss(weight=self.class_weights)
        loss_fn = BCEWithLogitsLoss()
        #print(logits[:2])
        #print(inputs['labels'][:2])
        loss = loss_fn(logits, inputs['labels'].float())

        return (loss, outputs) if return_outputs else loss

In [14]:
trainer = CustomTrainer(
    model=model, 
    args=train_args, 
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)
trainer.train()

  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss,F1
200,0.298,0.275432,0.109515
400,0.2713,0.256188,0.158918
600,0.2344,0.254837,0.210196
800,0.2189,0.254313,0.22065


TrainOutput(global_step=960, training_loss=0.2595095120370388, metrics={'train_runtime': 315.8963, 'train_samples_per_second': 48.465, 'train_steps_per_second': 3.039, 'total_flos': 3617273178101688.0, 'train_loss': 0.2595095120370388, 'epoch': 5.0})