In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the three competition CSV files from the working directory, in a fixed order.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Last expression of the cell: render the training frame.
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
from sklearn.model_selection import train_test_split


# Labels are the `target` column; features are every other column.
y = train['target']
X = train.drop(columns=['target'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import TrainingArguments



def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``,
        each computed from ``labels`` and the predicted classes.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # (metric name, scorer) pairs; the dict below keeps this order.
    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {
        metric_name: scorer(labels, predicted_classes)
        for metric_name, scorer in scorers
    }


# BertForSequenceClassification автоматически добавляет линейный слой на клс модели с выбранным количеством лейблов
#TO DO
'''
Обучите берт используя Trainer, после чего создайте сабмит с его предсказаниеями и проверьте результат на кагле
'''

# Training configuration passed to the Trainer.
training_args = TrainingArguments(
    # --- Output and reproducibility ---
    output_dir='./bert-binary-classifier',  # where checkpoints are written
    seed=42,                                # fixed seed

    # --- Optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,          # 1 sample per step x 8 accumulated steps per optimizer update
    learning_rate=2e-5,
    warmup_ratio=0.1,                       # warm up over the first 10% of training steps
    lr_scheduler_type='cosine',             # cosine schedule with warmup; overview of schedulers:
                                            # https://www.kaggle.com/code/snnclsr/learning-rate-schedulers

    # --- Logging ---
    logging_dir='./logs',
    logging_steps=20,                       # log every 20 steps

    # --- Checkpointing ---
    save_strategy='steps',
    save_steps=200,                         # same interval as eval_steps below
    save_total_limit=2,                     # keep at most two checkpoints

    # --- Validation and best-model selection ---
    eval_strategy='steps',
    eval_steps=200,                         # evaluate every 200 steps
    load_best_model_at_end=True,            # reload the best checkpoint when training ends
    metric_for_best_model='f1',             # "best" is judged by the f1 value from compute_metrics
    greater_is_better=True,                 # a higher f1 is better
)

In [5]:
from datasets import Dataset
from transformers import AutoTokenizer


# Wrap each split as a Dataset with a raw `text` column and an integer `label` column.
train_dataset, val_dataset = (
    Dataset.from_dict({"text": features['text'].tolist(), "label": labels.tolist()})
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

# Tokenizer of the base model; padding is appended on the right.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
tokenizer.padding_side = "right"

# Reuse EOS as the padding token only when the tokenizer does not define one.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

def preprocess(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: mapping with a ``"text"`` entry (a batch of strings when
            used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits in batches with the shared `preprocess` helper (train first, then validation).
train_dataset, val_dataset = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification
import os

# Base model used for a first training run, and the local checkpoint produced by an earlier run.
BASE_MODEL_PATH = "Qwen/Qwen3-Embedding-0.6B"
CHECKPOINT_PATH = "bert-binary-classifier/checkpoint-2283"

# Use the fine-tuned checkpoint when it exists on disk; otherwise fall back to the
# base model so that a fresh "Restart & Run All" does not fail on a missing directory.
model_path = CHECKPOINT_PATH if os.path.isdir(CHECKPOINT_PATH) else BASE_MODEL_PATH
print(f"Loading model from: {model_path}")

# Two output labels: binary classification over the `target` column.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2
)

In [None]:
# Point the tokenizer's padding token at EOS (unconditionally, unlike the guarded
# fallback in the tokenizer-setup cell) and copy the resulting id into the model config.
# NOTE(review): train/val datasets were already tokenized and padded before this cell runs.
# If the tokenizer's original pad token differs from EOS, model.config.pad_token_id will
# not match the padding ids stored in those datasets — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [8]:
from transformers import Trainer


def train():
    """Fine-tune the module-level ``model`` and return the ``Trainer`` that ran it.

    Uses the module-level ``training_args``, ``train_dataset``, ``val_dataset``
    and ``compute_metrics``.

    Note: defining this function rebinds the module-level name ``train``, which
    the data-loading cell bound to the training DataFrame. Cells that read
    ``train`` as a DataFrame must therefore run before this one.

    Returns:
        The Trainer after ``trainer.train()`` has finished, so that callers can
        still evaluate, predict or save with it (previously it was discarded).
    """
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer


# Keep a handle on the trainer instead of throwing it away.
trainer = train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.6182,0.428719,0.815496,0.739574,0.927907,0.614792
400,0.4407,0.653211,0.782666,0.771251,0.699248,0.859784
600,0.5075,0.455304,0.797111,0.776573,0.731608,0.827427
800,0.4819,0.480926,0.819435,0.778047,0.816949,0.742681
1000,0.1915,0.714095,0.831254,0.780529,0.875479,0.70416
1200,0.342,0.577508,0.826658,0.786062,0.82906,0.747304
1400,0.2607,0.6,0.795798,0.772827,0.734722,0.8151
1600,0.2983,0.783607,0.837163,0.793333,0.863884,0.733436
1800,0.3032,0.916678,0.820749,0.787879,0.794671,0.781202
2000,0.1664,0.867684,0.833224,0.797448,0.826446,0.770416


In [9]:
# Wrap the raw test tweets as a Dataset with a single `text` column (no labels).
test_dataset = Dataset.from_dict({"text": test["text"].tolist()})

# Tokenize with the same `preprocess` helper used for train/validation, so the
# truncation/padding settings cannot drift apart between training and inference.
test_dataset = test_dataset.map(preprocess, batched=True)


Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [10]:
# Drop the raw string column before prediction. The guard makes the cell safe to
# re-run: once the column is gone, it is simply skipped.
if "text" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns("text")

In [None]:
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
from datetime import datetime
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

def predict_argmax(test_dataset, model, tokenizer,
                   device="cuda", batch_size=16) -> list[float]:
    """Return the class-1 probability for every example in ``test_dataset``.

    Despite the name, no argmax is taken: the result is the softmax probability
    of class 1, meant to be thresholded by the caller.

    Args:
        test_dataset: indexable dataset whose items are dicts of tokenized
            features. Extra columns (raw text, labels) are ignored when the
            tokenizer exposes ``model_input_names``.
        model: classifier whose output has a ``.logits`` attribute.
        tokenizer: tokenizer handed to ``DataCollatorWithPadding``.
        device: target device. A CUDA device is replaced by ``"cpu"`` (with a
            warning) when CUDA is not available.
        batch_size: number of examples per forward pass.

    Returns:
        A list of Python floats, one per example, in dataset order.
    """
    import warnings

    # Do not fail on machines without a GPU just because of the default argument.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        warnings.warn("CUDA is not available; running prediction on CPU instead.")
        device = "cpu"

    model.to(device)
    model.eval()

    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding="max_length",  # always pad up to max_length
        max_length=512
    )

    # Names of the features the model consumes, when the tokenizer exposes them.
    input_names = getattr(tokenizer, "model_input_names", None)

    def collate(features):
        # Keep only model inputs so that raw columns such as `text` never reach
        # the collator; callers no longer have to strip them by hand.
        if input_names:
            features = [
                {key: value for key, value in feature.items() if key in input_names}
                for feature in features
            ]
        return data_collator(features)

    loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate)

    predictions = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting"):
            # Forward tensor fields only, and never the labels, so the model
            # does not compute a loss.
            model_inputs = {
                k: v.to(device)
                for k, v in batch.items()
                if isinstance(v, torch.Tensor) and k not in ("label", "labels")
            }

            logits = model(**model_inputs).logits
            probs = torch.softmax(logits, dim=1)[:, 1]  # probability of class 1
            # Plain Python floats, matching the declared return type.
            predictions.extend(probs.float().cpu().tolist())

    return predictions


In [42]:
# Build an inference-only view of the validation split instead of mutating
# `val_dataset` in place: the raw `text` strings and the `label` column are not
# model inputs (the test split drops `text` the same way before prediction).
# A new name keeps this cell idempotent and leaves `val_dataset` intact for the Trainer.
_non_input_columns = [
    name for name in ("text", "label") if name in val_dataset.column_names
]
val_inputs = (
    val_dataset.remove_columns(_non_input_columns)
    if _non_input_columns
    else val_dataset
)

val_preds = predict_argmax(test_dataset=val_inputs, model=model, tokenizer=tokenizer)

Predicting: 100%|██████████| 96/96 [1:11:00<00:00, 44.38s/it] 


In [43]:
from sklearn.metrics import f1_score
import numpy as np

# Convert once to an array so the comparison below is an explicit NumPy operation.
# `val_preds` is a Python list; comparing it to a threshold only worked through
# NumPy's reflected scalar comparison and would raise for a plain Python float.
val_probs = np.asarray(val_preds)
assert len(val_probs) == len(y_val), "predictions and validation labels differ in length"

best_threshold = 0
best_f1 = 0

# Round the grid to two decimals: np.arange accumulates floating-point error and
# otherwise yields candidates such as 0.8299999999999996.
for t in np.round(np.arange(0.1, 0.9, 0.01), 2):
    preds_bin = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, preds_bin)
    # Strict '>' keeps the lowest threshold among ties.
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = float(t)

print("Best threshold:", best_threshold, "with F1:", best_f1)

Best threshold: 0.8299999999999996 with F1: 0.7986906710310966


In [12]:
def save_submission_with_threshold(predictions, test_df, threshold=0.65):
    """Binarize scores, write them to a timestamped CSV and return the frame.

    Args:
        predictions: sequence of scores; a score ``>= threshold`` becomes 1,
            anything else 0.
        test_df: object whose ``"id"`` entry supplies the ``id`` column.
        threshold: decision cut-off applied to ``predictions``.

    Returns:
        DataFrame with the columns ``id`` and ``target``; the same frame is
        saved without its index as ``Submission_<YYYYmmdd_HHMMSS>.csv`` in the
        current working directory.
    """
    scores = np.array(predictions)
    labels = (scores >= threshold).astype(int)

    columns = {
        "id": test_df["id"],
        "target": labels,
    }
    submission_df = pd.DataFrame(columns)

    # The file name carries the wall-clock time of the call.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    submission_df.to_csv(f"Submission_{timestamp}.csv", index=False)

    return submission_df

In [15]:
# Class-1 probabilities for the tokenized test split, in dataset order.
test_preds = predict_argmax(
    test_dataset=test_dataset,
    model=model,
    tokenizer=tokenizer,
)

Predicting: 100%|██████████| 204/204 [02:07<00:00,  1.60it/s]


In [16]:
# Binarize the test probabilities at 0.8 and write the submission file; the
# returned frame is the cell's displayed output.
# NOTE(review): the validation threshold search printed a best threshold of about
# 0.83, not 0.8 — confirm the hard-coded value is deliberate.
save_submission_with_threshold(predictions=test_preds, test_df=test, threshold=0.8)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
