In [None]:
import pandas as pd
import torch
from data_preprocessing_utils import preprocess_data

In [None]:
# Locations of the train / test CSV files. The paths are relative, so they are
# resolved against the notebook's working directory when pd.read_csv opens them.
TRAIN_DATA_PATH = 'HeadHunter_train.csv'
TEST_DATA_PATH = 'HeadHunter_test.csv'

In [None]:
# Read both CSV files first (train, then test) and afterwards push each raw
# frame through the shared preprocessing helper, in the same order.
train, test = map(
    preprocess_data,
    [pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)],
)

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained.
BASE_MODEL = "DeepPavlov/rubert-base-cased"

In [None]:
# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# One marker token per (letter, rating) pair: '[A1]', '[A2]', ..., '[F5]'
# (6 letters x ratings 1..5 = 30 tokens).
spec_toks = [f'[{group}{rating}]' for group in 'ABCDEF' for rating in range(1, 6)]

# Register the markers as special tokens; the value returned by add_tokens is
# left as the last expression so it is shown as the cell output.
tokenizer.add_tokens(spec_toks, special_tokens=True)

In [3]:
from data_preprocessing_utils import make_text_features_with_rating_tokens, collate_fn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler

In [None]:
# Seed for the train/validation split, so the held-out subset (and therefore
# the reported validation metrics) is the same after every kernel restart.
SPLIT_SEED = 42

# Tensors produced by the project's feature builder for the training frame.
input_ids, attention_mask, token_types = make_text_features_with_rating_tokens(train)

# `target` holds comma-separated labels; binarize them into a multi-hot matrix.
# The tensor is created as float32 because the loss configured for training
# (BCEWithLogitsLoss) works on floating-point targets.
mlb = MultiLabelBinarizer()
labels = torch.tensor(
    mlb.fit_transform(train.target.apply(lambda x: x.split(','))),
    dtype=torch.float32,
)
dataset_train = TensorDataset(input_ids, attention_mask, token_types, labels)

batch_size = 16

# Keep 95% of the samples for training; whatever remains is the validation set.
train_size = int(0.95 * len(dataset_train))
val_size = len(dataset_train) - train_size
train_dataset, val_dataset = random_split(
    dataset_train,
    [train_size, val_size],
    # A dedicated, seeded generator makes the split reproducible without
    # touching the global RNG state.
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are drawn in random order; validation keeps dataset order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
    collate_fn=collate_fn,
)
valid_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)


In [None]:
from train_utils import get_optimizer_grouped_parameters, train_loop, model_eval, collate_fn_pred, get_predict

In [None]:

# Keyword arguments for BertMeanMaxPooling(**bert_config).
bert_config = {
    # Reuse BASE_MODEL so the encoder is always loaded from the same checkpoint
    # the tokenizer came from (the name used to be duplicated here).
    'from_pretrained': BASE_MODEL,
    # NOTE(review): presumably the encoder's hidden size -- confirm in models.py.
    'cls_dim': 768,
    # NOTE(review): has to agree with the number of label columns produced by
    # the MultiLabelBinarizer (len(mlb.classes_)) -- confirm for this dataset.
    'n_classes': 9,
    'h_dim': None,
    'p': 0.5
}

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at model.to(DEVICE).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Settings consumed by train_loop. The nested 'bert_lr' dict and
# 'opt_param_fn' are project-specific; see train_utils for their exact meaning.
train_config = {
    'bert_lr': {'bert_lr': 5e-5, 'embed_lr': 1e-6, 'n_layers': 9, 'lr_decay': 0.9, 'n_reinit': 0},
    'task_lr': 1e-4,
    'epoch': 2,
    # Multi-label objective: an independent sigmoid/BCE term per class.
    'loss_fn': torch.nn.BCEWithLogitsLoss(),
    'opt_param_fn': get_optimizer_grouped_parameters
}

In [None]:
from models import BertMeanMaxPooling

In [None]:
# Instantiate the classifier from the bert_config keyword arguments.
model = BertMeanMaxPooling(**bert_config)
# The tokenizer was extended with extra special tokens, so the embedding matrix
# of the wrapped encoder is resized to the tokenizer's current length.
model.bert.resize_token_embeddings(len(tokenizer))
# Move the model to the target device. `t` is not used in the visible cells;
# presumably the assignment only keeps the notebook from echoing the module
# repr as the cell output.
t = model.to(DEVICE)

In [None]:
# Launch training. model_eval and valid_dataloader are handed over as well --
# presumably for evaluation on the validation split; confirm in train_utils.
train_loop(model, train_dataloader, train_config, model_eval, valid_dataloader)

In [None]:
from utils import save_submit

In [None]:
# The same feature builder is unpacked into three tensors
# (input_ids, attention_mask, token_types) for the training frame. Only the
# first two are needed for prediction, so any surplus outputs are collected
# into a throwaway list instead of breaking the unpacking with
# "too many values to unpack".
input_ids, attention_mask, *_unused_features = make_text_features_with_rating_tokens(test)

# Unlabelled dataset for inference; batches are assembled by collate_fn_pred.
dataset_test = TensorDataset(input_ids, attention_mask)
test_dataloader = DataLoader(dataset_test, batch_size=batch_size, collate_fn=collate_fn_pred)

# binary=False is forwarded to get_predict as-is.
# NOTE(review): presumably this requests raw scores rather than thresholded
# labels -- confirm in train_utils.
pred = get_predict(model, test_dataloader, binary=False)

# Hand the predictions to the project's submission writer.
save_submit(pred)