In [None]:
!pip -q install kagglehub evaluate catboost

import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, Trainer,
                          TrainingArguments, DataCollatorWithPadding, AutoModel)
from scipy.optimize import minimize, differential_evolution
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import evaluate
from tqdm.auto import tqdm
import torch.nn.functional as F
import torch
import os
import gc
from torch.utils.data import Dataset
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Download (or reuse the cached copy of) the competition dataset; the trailing
# slash lets later cells build file paths by plain string concatenation.
init_path = f"{kagglehub.dataset_download('bobbyshmurda31/nlp-with-disaster-tweets-for-colab')}/"

Using Colab cache for faster access to the 'nlp-with-disaster-tweets-for-colab' dataset.


In [None]:
# Load the train/test CSVs. Only the tweet text (plus the training target) is
# kept; the test ids are saved separately for the submission file.
train_data = pd.read_csv(f'{init_path}train.csv').drop(columns=['id', 'location', 'keyword'])

test_data = pd.read_csv(f'{init_path}test.csv')
test_ids = test_data['id']
test_data = test_data.drop(columns=['id', 'location', 'keyword'])

train_texts = train_data['text'].to_numpy()
y_train = train_data['target'].to_numpy()
test_texts = test_data['text'].to_numpy()

# NOTE(review): `location` and `keyword` are dropped above and never used as
# features; the original marker here presumably flagged that as a TODO.

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, **kwargs)`` that
            returns a mutable mapping of encoded features.
        texts: Indexable collection of raw texts.
        labels: Optional indexable collection aligned with ``texts``. When
            given, each item also carries a ``'labels'`` entry.
        max_length: Truncation limit forwarded to the tokenizer.
    """

    def __init__(self, tokenizer, texts, labels=None, max_length=512):
        self.tokenizer, self.texts = tokenizer, texts
        self.labels, self.max_length = labels, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        # No padding and no tensor conversion happen here (padding=False,
        # return_tensors=None); items are returned as produced by the tokenizer.
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors=None,
        )
        sample = self.tokenizer(self.texts[index], **tokenizer_kwargs)
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[index]
        return sample

class WeightedCETrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Weights are taken from ``class_weights`` when supplied. Otherwise, if both
    ``train_labels`` and ``num_labels`` are given, they are computed as
    ``n_samples / (num_labels * class_count)`` for every class that occurs
    (classes with no samples keep weight 0). With neither, the loss is
    unweighted.
    """

    def __init__(self, *args, num_labels=None, train_labels=None, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = None

        if class_weights is not None:
            # Explicit weights win over anything derived from the labels.
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32)
            return
        if train_labels is None or num_labels is None:
            return

        label_ids = np.asarray(train_labels).astype(int)
        per_class = np.bincount(label_ids, minlength=num_labels)
        total = per_class.sum()
        present = per_class > 0
        balanced = np.zeros(num_labels, dtype=np.float32)
        balanced[present] = total / (num_labels * per_class[present].astype(np.float32))
        self.class_weights = torch.tensor(balanced, dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally weighted) cross-entropy loss for one batch.

        ``labels`` is removed from ``inputs`` before the forward pass, so the
        model itself does not compute a loss. Raises ``ValueError`` when the
        logits and labels disagree on batch size.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if logits.size(0) != labels.size(0):
            raise ValueError(f"Batch size mismatch: logits batch={logits.size(0)} vs labels batch={labels.size(0)}")

        if self.class_weights is None:
            weight = None
        else:
            weight = self.class_weights.to(logits.device)
        loss = F.cross_entropy(logits, labels.long(), weight=weight)

        if return_outputs:
            return (loss, outputs)
        return loss

class HFmodel:
    """Fine-tunes a Hugging Face binary sequence classifier.

    After ``fit`` the instance can return class-1 probabilities (``predict``)
    and first-token hidden states of the fine-tuned encoder
    (``get_embeddings``).

    Args:
        checkpoint: Hub id or local path of the pretrained checkpoint.
        max_length: Truncation limit used for every tokenizer call.
    """

    def __init__(self, checkpoint='prajjwal1/bert-tiny', max_length=512):
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.max_length = max_length

    def fit(self, train_texts, y_train, val_texts, y_val):
        """Fine-tune on the training split, evaluating on the validation split.

        The checkpoint with the best validation weighted-F1 is restored at the
        end, saved to ``./result/best_model`` and reloaded as a bare encoder
        for ``get_embeddings``.

        Returns:
            self, to allow call chaining.
        """
        f1 = evaluate.load('f1')

        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            return f1.compute(predictions=predictions, references=labels, average='weighted')

        train_dataset = CustomDataset(self.tokenizer, train_texts, y_train, self.max_length)
        val_dataset = CustomDataset(self.tokenizer, val_texts, y_val, self.max_length)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Discriminative learning rates: the classification head learns
        # three times faster than the pretrained encoder.
        encoder_params = []
        head_params = []
        for name, param in self.model.named_parameters():
            if 'classifier' in name or 'head' in name or 'projection' in name:
                head_params.append(param)
            else:
                encoder_params.append(param)
        optimizer = torch.optim.AdamW([
            {'params': encoder_params, 'lr': 2e-5},
            {'params': head_params, 'lr': 6e-5}
        ])

        train_args = TrainingArguments(
            per_device_train_batch_size=32,
            per_device_eval_batch_size=64,
            num_train_epochs=2,
            metric_for_best_model='f1',
            load_best_model_at_end=True,
            # NOTE(review): the 'constant' schedule has no warmup phase, so
            # warmup_ratio presumably has no effect; 'constant_with_warmup'
            # would apply it - confirm the intent.
            lr_scheduler_type='constant',
            warmup_ratio=0.1,
            # NOTE(review): an optimizer is passed to Trainer explicitly, so
            # this weight_decay is presumably not what AdamW uses - confirm.
            weight_decay=0.01,
            logging_strategy='steps',
            eval_strategy='steps',
            logging_steps=50,
            eval_steps=50,
            # Checkpoints must be written on the evaluation schedule, otherwise
            # load_best_model_at_end has nothing to restore: with the default
            # save interval (500 steps) a short run never saves a checkpoint
            # and the last-step weights are silently kept.
            save_strategy='steps',
            save_steps=50,
            # Keeps disk usage bounded; the best checkpoint is always retained.
            save_total_limit=1,
            report_to='none',
            output_dir='./result',
            # Mixed precision is only available on CUDA devices.
            fp16=torch.cuda.is_available()
        )
        self.trainer = Trainer(
            args=train_args,
            model=self.model,
            # `tokenizer=` is deprecated in favour of `processing_class=`.
            processing_class=self.tokenizer,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=self.data_collator,
            compute_metrics=compute_metrics,
            optimizers=(optimizer, None)
        )
        self.trainer.train()
        self.model.save_pretrained('./result/best_model')
        # Reload the fine-tuned weights without the classification head.
        self.embedding_model = AutoModel.from_pretrained('./result/best_model')

        return self

    def get_embeddings(self, texts, batch_size=32):
        """Return first-token hidden states of the fine-tuned encoder.

        Args:
            texts: Sliceable sequence of strings (NumPy array or list).
            batch_size: Number of texts encoded per forward pass.

        Returns:
            2-D NumPy array with one row per text.
        """
        embeddings = []
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.embedding_model.to(device)
        self.embedding_model.eval()

        for i in tqdm(range(0, len(texts), batch_size)):
            # list() works for both NumPy arrays and plain Python lists.
            batch = list(texts[i:i + batch_size])
            encoded = self.tokenizer(
                batch,
                truncation=True,
                max_length=self.max_length,
                padding='longest',
                return_tensors='pt'
            ).to(device)

            with torch.no_grad():
                # Position 0 is the first token ([CLS] for BERT-style models).
                embeds = self.embedding_model(**encoded).last_hidden_state[:, 0]

            embeddings.append(embeds.cpu().numpy())

        return np.concatenate(embeddings, axis=0)

    def predict(self, texts, return_probs=False):
        """Predict the positive class for each text.

        Args:
            texts: Indexable collection of strings.
            return_probs: When True return class-1 probabilities, otherwise
                0/1 labels obtained with a 0.5 threshold.
        """
        dataset = CustomDataset(self.tokenizer, texts, max_length=self.max_length)
        logits = self.trainer.predict(dataset).predictions
        preds = F.softmax(torch.tensor(logits), dim=-1).numpy()[:, 1]

        if not return_probs:
            preds = (preds > 0.5).astype(int)

        return preds

In [None]:
# Out-of-fold features from the fine-tuned HF model: for every sample the
# encoder embedding plus one column with the class-1 probability. Train rows
# are filled by the fold in which they were held out; test rows are the
# average over all folds.
N_SPLITS = 5
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Allocated after the first fold, once the feature width is known
# (hidden size + 1), so switching checkpoints needs no manual edit.
test_hf_model_data = None
train_oof_hf_model_data = None

for train_idx, val_idx in tqdm(list(kfold.split(train_texts, y_train)), desc='Получение данных от HF модели'):
    train_texts_fold, val_texts_fold = train_texts[train_idx], train_texts[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    # prajjwal1/bert-tiny google-bert/bert-base-uncased
    model = HFmodel(checkpoint='google-bert/bert-base-uncased', max_length=512)
    model.fit(train_texts_fold, y_train_fold, val_texts_fold, y_val_fold)

    test_preds = np.concatenate([
        model.get_embeddings(test_texts, batch_size=32),
        model.predict(test_texts, return_probs=True).reshape((-1, 1))
    ], axis=1)
    train_preds = np.concatenate([
        model.get_embeddings(val_texts_fold, batch_size=32),
        model.predict(val_texts_fold, return_probs=True).reshape((-1, 1))
    ], axis=1)

    if test_hf_model_data is None:
        n_hf_features = test_preds.shape[1]
        test_hf_model_data = np.zeros((len(test_data), n_hf_features))
        train_oof_hf_model_data = np.zeros((len(train_data), n_hf_features))

    test_hf_model_data += test_preds / N_SPLITS
    train_oof_hf_model_data[val_idx] = train_preds

    # Release this fold's model before the next one is created so two
    # fine-tuned models never have to fit in memory at once.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

del kfold, test_preds, train_preds, N_SPLITS
del train_texts_fold, val_texts_fold, y_train_fold, y_val_fold
gc.collect()

Получение данных от HF модели:   0%|          | 0/5 [00:00<?, ?it/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  self.trainer = Trainer(


Step,Training Loss,Validation Loss,F1
50,0.5439,0.437026,0.798924
100,0.4361,0.435414,0.812394
150,0.4199,0.389663,0.822907
200,0.4098,0.389647,0.845145
250,0.3438,0.391605,0.833031
300,0.3003,0.440156,0.831
350,0.3318,0.44502,0.823856


  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Step,Training Loss,Validation Loss,F1
50,0.5604,0.434937,0.818441
100,0.4225,0.410418,0.83043
150,0.4263,0.400489,0.838702
200,0.3845,0.382343,0.834543
250,0.3707,0.387033,0.831879
300,0.3292,0.387355,0.843598
350,0.3666,0.424689,0.821467


  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Step,Training Loss,Validation Loss,F1
50,0.5665,0.459333,0.798585
100,0.4133,0.44283,0.806861
150,0.4232,0.438799,0.807705
200,0.3832,0.428863,0.821829
250,0.3391,0.465168,0.812001
300,0.3159,0.430582,0.820502
350,0.3372,0.419742,0.823909


  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Step,Training Loss,Validation Loss,F1
50,0.54,0.414691,0.826353
100,0.4595,0.382992,0.843941
150,0.4296,0.391783,0.830722
200,0.3838,0.387522,0.845001
250,0.3461,0.388402,0.843302
300,0.3552,0.369178,0.845271
350,0.303,0.391286,0.839643


  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Step,Training Loss,Validation Loss,F1
50,0.5625,0.497921,0.782172
100,0.4397,0.412852,0.82602
150,0.4186,0.460711,0.812368
200,0.4148,0.437596,0.818818
250,0.3275,0.435574,0.840327
300,0.3494,0.415117,0.840065
350,0.3502,0.382128,0.843612


  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

4661

In [None]:
# Optional hand-crafted features (TF-IDF n-grams and simple per-text
# statistics). The whole block is disabled; only the HF model features are
# passed to the second-level models.
# vectorizers = [
#     TfidfVectorizer(max_features=5000, analyzer='char', ngram_range=(3, 5)),
#     TfidfVectorizer(max_features=5000, analyzer='word', ngram_range=(1, 2)),
#     TfidfVectorizer(max_features=5000, analyzer='char_wb', ngram_range=(3, 5))
# ]

# all_texts = np.concatenate([train_texts, test_texts], axis=0)
# encoded_all_texts = [vectorizer.fit_transform(all_texts.tolist()).toarray() for vectorizer in vectorizers]
# encoded_all_texts = np.concatenate(encoded_all_texts, axis=1)

# train_encoded_texts, test_encoded_texts = encoded_all_texts[:len(train_texts)], encoded_all_texts[len(train_texts):]

# def get_texts_statistics(text):
#     return np.array([
#         len(text.strip()),
#         sum([symbol.islower() for symbol in text]),
#         sum([symbol.isupper() for symbol in text]),
#         len(text.split(' ')),
#         text.count(','),
#         sum([s in ',.!-_=+!@#$%^&*()";:?~`' for s in text]),
#         int('@' in text),
#         sum([s in '0123456789' for s in text])
#     ])

# text_statistics = np.concatenate([get_texts_statistics(text) for text in all_texts], axis=0).reshape((-1, 8))
# train_statistics_texts, test_statistics_texts = text_statistics[:len(train_texts)], text_statistics[len(train_texts):]

# all_train_data = np.concatenate([train_encoded_texts, train_oof_hf_model_data, train_statistics_texts], axis=1)
# all_test_data = np.concatenate([test_encoded_texts, test_hf_model_data, test_statistics_texts], axis=1)
all_train_data = train_oof_hf_model_data
all_test_data = test_hf_model_data

# The intermediates below are created only by the disabled block above, so
# their cleanup is disabled with it: deleting names that were never defined
# raises NameError on a fresh kernel (Restart & Run All).
# del train_encoded_texts, test_encoded_texts  # , train_oof_hf_model_data, test_hf_model_data
# del train_statistics_texts, test_statistics_texts, encoded_all_texts
gc.collect()

0

In [None]:
# Second-level models trained on the HF features. Each model owns one column:
# 0 = LightGBM, 1 = XGBoost, 2 = CatBoost, 3 = LogisticRegression.
oof_train_preds = np.zeros((len(all_train_data), 4))
test_preds = np.zeros((len(all_test_data), 4))

N_SPLITS = 5
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)


def _store_fold_predictions(fitted_model, column, x_val, y_val, val_idx, label=None):
    """Record one fitted model's fold predictions.

    Writes class-1 probabilities for the validation rows into
    ``oof_train_preds[val_idx, column]`` and adds this fold's share of the
    test probabilities to ``test_preds[:, column]``. When ``label`` is given,
    the fold's F1 on the validation split is printed as well.
    """
    if label is not None:
        fold_f1 = f1_score(y_val, fitted_model.predict(x_val))  # (y_true, y_pred)
        print(f'{label}\'s score: {round(fold_f1, 5)}')
    oof_train_preds[val_idx, column] = fitted_model.predict_proba(x_val)[:, 1]
    test_preds[:, column] += fitted_model.predict_proba(all_test_data)[:, 1] / N_SPLITS


for train_idx, val_idx in tqdm(list(kfold.split(all_train_data, y_train)), desc='Обучение ансамбля'):
    x_train_fold, x_val_fold = all_train_data[train_idx], all_train_data[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=1500,
        learning_rate=0.03,
        verbosity=-1,
        max_depth=6,
        # The sklearn-API parameter is `class_weight`; the former
        # `class_weights` spelling is not a recognised parameter, so class
        # balancing was never applied.
        class_weight='balanced',
        device='gpu'
    )
    model.fit(
        x_train_fold, y_train_fold,
        eval_set=[(x_val_fold, y_val_fold)],
        # 'f1' is not a built-in LightGBM metric name, so early stopping was
        # effectively driven by the objective's default metric; name it
        # explicitly instead.
        eval_metric='binary_logloss',
        callbacks=[
            # lgb.log_evaluation(20),
            lgb.early_stopping(100, verbose=False)
        ]
    )
    _store_fold_predictions(model, 0, x_val_fold, y_val_fold, val_idx, label='LGBM')

    model = XGBClassifier(
        n_estimators=1500,
        max_depth=6,
        learning_rate=0.03,
        early_stopping_rounds=100,
        device='cuda'
    )
    model.fit(
        x_train_fold, y_train_fold, verbose=False,
        eval_set=[(x_val_fold, y_val_fold)]
    )
    _store_fold_predictions(model, 1, x_val_fold, y_val_fold, val_idx, label='XGBClassifier')

    model = CatBoostClassifier(
        iterations=1500,
        task_type='GPU',
        learning_rate=0.03,
        verbose=50,
        auto_class_weights='Balanced',
        eval_metric='TotalF1',
        l2_leaf_reg=2.0,
        early_stopping_rounds=100,
        max_depth=6
    )
    model.fit(
        x_train_fold, y_train_fold,
        eval_set=(x_val_fold, y_val_fold)
    )
    # No label: CatBoost already logs its own validation metric while fitting.
    _store_fold_predictions(model, 2, x_val_fold, y_val_fold, val_idx)

    model = LogisticRegression(C=1.0, max_iter=1000).fit(x_train_fold, y_train_fold)
    _store_fold_predictions(model, 3, x_val_fold, y_val_fold, val_idx, label='LogisticRegression')

def optimize_weights_f1(predictions, y_true, average='weighted', threshold=0.5):
    """Search for blend weights that maximise F1 of the thresholded ensemble.

    Args:
        predictions: 2-D array, one column of class-1 probabilities per model.
        y_true: Ground-truth binary labels aligned with the rows.
        average: ``average`` argument forwarded to ``f1_score``.
        threshold: Cut-off applied to the blended probability.

    Returns:
        Tuple ``(weights, best_f1)`` where ``weights`` sum to 1.
    """
    def objective(raw_weights):
        total = raw_weights.sum()
        if total <= 0:
            # An all-zero candidate cannot be normalised (division by zero);
            # give it the worst possible score instead.
            return 0.0
        blended = np.dot(predictions, raw_weights / total)
        return -f1_score(y_true, (blended >= threshold).astype(int), average=average)

    bounds = [(0, 1)] * predictions.shape[1]
    result = differential_evolution(objective, bounds, seed=42, maxiter=1000, polish=True)

    best_total = result.x.sum()
    if best_total <= 0:
        # Degenerate optimum: fall back to an equal-weight blend.
        return np.full(predictions.shape[1], 1.0 / predictions.shape[1]), -result.fun
    return result.x / best_total, -result.fun

# Fit the blend weights on the out-of-fold predictions and report the score.
weights, best_f1 = optimize_weights_f1(predictions=oof_train_preds, y_true=y_train)
print(f'Ensemble\'s score: {round(best_f1, 5)}')
print(weights)

Обучение ансамбля:   0%|          | 0/5 [00:00<?, ?it/s]



LGBM's score: 0.79598
XGBClassifier's score: 0.7874
0:	learn: 0.8258636	test: 0.5083016	best: 0.5083016 (0)	total: 180ms	remaining: 4m 30s
50:	learn: 0.8403568	test: 0.7844217	best: 0.8219165 (6)	total: 7.73s	remaining: 3m 39s
100:	learn: 0.8474327	test: 0.7932899	best: 0.8219165 (6)	total: 13.5s	remaining: 3m 7s
bestTest = 0.8219165019
bestIteration = 6
Shrink model to first 7 iterations.
LogisticRegression's score: 0.78249




LGBM's score: 0.7983
XGBClassifier's score: 0.80102
0:	learn: 0.8163387	test: 0.8295935	best: 0.8295935 (0)	total: 111ms	remaining: 2m 46s
50:	learn: 0.8351630	test: 0.8311893	best: 0.8344712 (9)	total: 3.9s	remaining: 1m 50s
100:	learn: 0.8433669	test: 0.8305361	best: 0.8344712 (9)	total: 6.88s	remaining: 1m 35s
bestTest = 0.8344712396
bestIteration = 9
Shrink model to first 10 iterations.
LogisticRegression's score: 0.78469




LGBM's score: 0.73224
XGBClassifier's score: 0.73126
0:	learn: 0.8292841	test: 0.7826255	best: 0.7826255 (0)	total: 111ms	remaining: 2m 46s
50:	learn: 0.8391105	test: 0.7764283	best: 0.7826255 (0)	total: 3.63s	remaining: 1m 43s
100:	learn: 0.8458693	test: 0.7806360	best: 0.7826255 (0)	total: 6.69s	remaining: 1m 32s
bestTest = 0.7826254728
bestIteration = 0
Shrink model to first 1 iterations.
LogisticRegression's score: 0.76466




LGBM's score: 0.79419
XGBClassifier's score: 0.79385
0:	learn: 0.8200127	test: 0.8203106	best: 0.8203106 (0)	total: 113ms	remaining: 2m 49s
50:	learn: 0.8374081	test: 0.8191274	best: 0.8231812 (20)	total: 3.22s	remaining: 1m 31s
100:	learn: 0.8459551	test: 0.8194955	best: 0.8231812 (20)	total: 6.21s	remaining: 1m 26s
bestTest = 0.8231812433
bestIteration = 20
Shrink model to first 21 iterations.
LogisticRegression's score: 0.72785




LGBM's score: 0.77408
XGBClassifier's score: 0.76678
0:	learn: 0.8217287	test: 0.8150527	best: 0.8150527 (0)	total: 109ms	remaining: 2m 43s
50:	learn: 0.8376882	test: 0.8240949	best: 0.8265822 (13)	total: 3.21s	remaining: 1m 31s
100:	learn: 0.8454067	test: 0.8248329	best: 0.8265822 (13)	total: 6.18s	remaining: 1m 25s
bestTest = 0.8265822192
bestIteration = 13
Shrink model to first 14 iterations.
LogisticRegression's score: 0.75766
Ensemble's score: 0.82656
[0.16169444 0.14295392 0.62680142 0.06855022]


In [None]:
# Repeats the weight optimisation on the same out-of-fold predictions
# (the optimiser is called with a fixed seed).
weights, best_f1 = optimize_weights_f1(predictions=oof_train_preds, y_true=y_train)
print(f'Ensemble\'s score: {round(best_f1, 5)}')
print(weights)

Ensemble's score: 0.82656
[0.16169444 0.14295392 0.62680142 0.06855022]


In [None]:
# Blend the per-model test probabilities with the fitted weights, threshold
# at 0.5 and write the submission file.
preds = (np.asarray(test_preds) @ weights >= 0.5).astype(int)
submission = pd.DataFrame({'id': test_ids, 'target': preds})
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0
