In [3]:
import json
from pprint import pprint
import re
import nltk
import string
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from transformers import AutoModel, AutoTokenizer
from sklearn import svm
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [52]:
# Load the pretrained tokenizer for the "cointegrated/rubert-tiny2" checkpoint
# (the files are downloaded when not available locally, hence the progress bars below).
rubert_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

Downloading:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [53]:
# The tokenizer accepts a single example as well as a whole batch.
texts = [
    "продавец мила (шевченко 17)",
    "кассир в пиццерию г витебск",
]

# Tokenize the batch (padded to a common length, returned as PyTorch tensors)
# and print every field of the result.
encoded_texts = rubert_tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
for field_name, field_tensor in encoded_texts.items():
    print(field_name, "=>\n", field_tensor)

input_ids =>
 tensor([[    2, 50848, 11951,   971,    12,   336,  2535, 10497,   685,    13,
             3,     0,     0],
        [    2, 60045,   870,   314, 48762, 25312,  2686,   315, 25040,   988,
         52710,   865,     3]])
token_type_ids =>
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask =>
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [54]:
# Decode the token ids of the first example back into text to see what the
# data looks like after tokenization (special tokens and padding included).
first_example_ids = encoded_texts["input_ids"][0]
rubert_tokenizer.decode(first_example_ids)

'[CLS] продавец мила ( шевченко 17 ) [SEP] [PAD] [PAD]'

In [55]:
# Load the encoder weights of the same "cointegrated/rubert-tiny2" checkpoint.
# AutoModel builds a bare BertModel, so the checkpoint's pretraining heads
# (the `cls.*` weights) are left unused - that is what the warning below reports.
rubert_model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
# Inference only: run the forward pass under no_grad so that no autograd graph
# is recorded (same approach as BertEmbedder.transform).
with torch.no_grad():
    model_output = rubert_model(**encoded_texts)

# Use the hidden state of the first token ([CLS]) as the embedding of each text,
# then L2-normalise every row.
embeddings = model_output.last_hidden_state[:, 0, :]
embeddings = torch.nn.functional.normalize(embeddings)

In [57]:
# Display the normalised [CLS] embeddings (one row per input text).
embeddings

tensor([[ 2.8943e-02, -5.0244e-02, -7.8985e-02, -8.4074e-02, -5.9612e-03,
         -1.7475e-02, -1.5316e-02, -2.8070e-02, -1.7053e-03,  2.8987e-02,
          2.6499e-02, -3.9807e-02,  4.0313e-02,  4.9844e-02,  5.9612e-02,
         -5.7806e-02,  5.2571e-02,  2.3915e-02,  1.2160e-02,  4.8263e-02,
         -1.7403e-03, -4.8625e-04,  1.0781e-02, -1.5538e-02,  1.0777e-01,
          1.4003e-02,  1.5912e-02,  7.4606e-03, -4.0181e-02,  1.6072e-02,
         -4.1244e-03,  1.9664e-02,  1.9901e-02, -6.5723e-02, -5.0434e-02,
         -1.7168e-02,  7.0580e-02, -8.0662e-03, -4.3247e-02, -3.2058e-02,
         -1.4544e-01,  1.0295e-01,  1.2049e-01, -4.6864e-02, -9.7900e-03,
         -9.5861e-02,  9.8281e-02, -5.3196e-02,  8.9461e-03,  4.4318e-03,
         -2.8308e-02, -1.7248e-02, -1.8647e-02,  1.1467e-02,  7.9162e-04,
          8.3070e-02,  6.7830e-02,  5.3903e-03, -4.1022e-02,  2.1465e-03,
         -1.3058e-02,  2.1448e-02,  7.2634e-02,  6.0540e-02, -1.3626e-02,
          1.0140e-01, -2.5507e-02,  6.

## Создание и обучение модели

In [10]:
class FullDescriptionCreator(BaseEstimator, TransformerMixin):
    """Add ``responsibilities`` and ``full_description`` columns to a vacancy frame.

    ``full_description`` is the vacancy ``name`` joined by a single space with
    the responsibilities text (an empty string when missing), lower-cased, with
    every character that is neither whitespace nor a word character replaced
    by a space.

    Parameters
    ----------
    responsibilities : pandas.Series
        Responsibilities text assigned to ``X`` as a new column. When it is a
        Series, pandas aligns it with ``X`` on the index - presumably the
        vacancy id; verify against the caller.
    """

    # Raw string: "\s" and "\w" are invalid escape sequences in a plain literal
    # and trigger a warning on recent Python versions.
    patt = re.compile(r"[^\s\w]")

    def __init__(self, responsibilities):
        self.responsibilities = responsibilities

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the two extra columns; ``X`` itself is untouched."""
        X = X.copy()
        X["responsibilities"] = self.responsibilities
        combined = X["name"] + " " + X["responsibilities"].fillna("")
        # `.str.lower()` propagates missing names as NaN, whereas
        # `.map(str.lower)` raised TypeError on them.
        X["full_description"] = combined.str.lower().str.replace(
            self.patt, " ", regex=True
        )
        return X

In [11]:
class BertEmbedder(BaseEstimator, TransformerMixin):
    """Turn texts into L2-normalised embeddings of the first ([CLS]) token.

    Parameters
    ----------
    bert_tokenizer : callable
        Called as ``bert_tokenizer(list_of_str, padding=True, truncation=True,
        return_tensors="pt")``; its result is passed to the model as keyword
        arguments.
    bert_model : callable
        Called with the tokenizer output; the result must expose
        ``last_hidden_state``.
    batch_size : int, default=64
        Number of texts sent through the model per forward pass. Processing
        the input in chunks keeps memory bounded by the chunk instead of by
        the whole data set.
    """

    def __init__(self, bert_tokenizer, bert_model, batch_size=64):
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Embed every text in ``X`` (anything with ``.tolist()``) and return a NumPy array."""
        texts = X.tolist()
        chunks = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                encoded = self.bert_tokenizer(
                    texts[start : start + self.batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                )
                model_output = self.bert_model(**encoded)
                # Hidden state of the first token of every text in the chunk.
                cls_vectors = model_output.last_hidden_state[:, 0, :]
                chunks.append(torch.nn.functional.normalize(cls_vectors))
        # `.cpu()` is a no-op for CPU tensors and makes `.numpy()` valid otherwise.
        return torch.cat(chunks).cpu().numpy()

In [12]:
# Fetch the NLTK stop-word corpus; stopwords.words("russian") in the
# data-preparation cell needs it to be available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [58]:
train = pd.read_csv("train.csv", index_col="index", sep=",")
# Keep only rows whose target is not -1. `.copy()` makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the frame that was read (avoids SettingWithCopyWarning).
train = train.query("target != -1").copy()

with open("3_parsed.json", "r", encoding="utf8") as fp:
    descriptions = json.load(fp)

# Vacancy ID -> first entry of its "Обязанности" section, or None when the
# section is absent or empty (an empty list used to raise IndexError).
responsibilities = pd.Series(
    {
        description["ID"]: r[0]
        if (r := description["Content"].get("Обязанности"))
        else None
        for description in descriptions
    },
    name="responsibilities",
)

russian_stopwords = stopwords.words("russian")
# Set for constant-time membership checks in the per-word filter below.
stopword_lookup = set(russian_stopwords)

# Drop parenthesised fragments from the vacancy name.
train["name"] = train["name"].str.replace(r"\([^()]*\)", " ", regex=True)
# Description: strip <...> tags, then punctuation, then parenthesised fragments.
train["description"] = (
    train["description"]
    .str.replace(r"\<[^<>]*\>", " ", regex=True)
    .str.replace(r"[^\w\s]+", " ", regex=True)
    .str.replace(r"\([^()]*\)", "", regex=True)
)
# NOTE(review): "/[^а-я]+/i" is JavaScript regex-literal syntax. In Python the
# slashes and the trailing "i" are matched literally, so these two replacements
# almost never match anything. Kept unchanged on purpose: removing every
# non-Cyrillic character would change the features the model is trained on -
# confirm the intent before fixing.
train.loc[:, "name"] = train["name"].str.replace("/[^а-я]+/i", " ", regex=True)
train.loc[:, "description"] = train["description"].str.replace("/[^а-я]+/i", " ", regex=True)
# Remove Russian stop words from the name (case-sensitive comparison).
train["name"] = train["name"].apply(
    lambda name: " ".join(
        word for word in name.split() if word not in stopword_lookup
    )
)
X_train, y_train = train.drop(columns=["target"]), train["target"]

print(train["name"].shape)

(15000,)


In [59]:
# Working pipeline
clf_bert = Pipeline(
    [
        ("add_full_description", FullDescriptionCreator(responsibilities)),
        (
            "bert",
            # Texts are vectorised with BERT embeddings.
            # NOTE(review): only the "name" column is embedded; the
            # "full_description" column built by the previous step is not
            # selected here - confirm this is intended.
            ColumnTransformer(
                [("vectorize", BertEmbedder(rubert_tokenizer, rubert_model), "name")]
            ),
        ),
        (
            "clf",
            # MLP classifier on top of the embeddings; random_state is fixed so
            # that weight initialisation and shuffling are reproducible.
            MLPClassifier(max_iter=300, random_state=42),
        ),
    ]
)

clf_bert.fit(X_train, y_train)
# Score on the training data itself: it shows how well the model fits the
# training set, not how well it generalises.
print(clf_bert.score(X_train, y_train))

1.0


In [60]:
test = pd.read_csv("test_new.csv", index_col="index")
test_int = pd.DataFrame(test.index)

# Same cleaning steps as applied to the training frame.
test["name"] = test["name"].str.replace(r"\([^()]*\)", " ", regex=True)
test["description"] = (
    test["description"]
    .str.replace(r"\<[^<>]*\>", " ", regex=True)
    .str.replace(r"[^\w\s]+", " ", regex=True)
    .str.replace(r"\([^()]*\)", "", regex=True)
)
# NOTE(review): "/[^а-я]+/i" is JavaScript regex-literal syntax and almost
# never matches in Python; kept unchanged to stay identical to the training
# preprocessing - confirm the intent before fixing both places together.
test.loc[:, "name"] = test["name"].str.replace("/[^а-я]+/i", " ", regex=True)
test.loc[:, "description"] = test["description"].str.replace("/[^а-я]+/i", " ", regex=True)
# The training names had Russian stop words removed; apply the same filter
# here so the model sees test names preprocessed the same way.
test_stopword_lookup = set(russian_stopwords)
test["name"] = test["name"].apply(
    lambda name: " ".join(
        word for word in name.split() if word not in test_stopword_lookup
    )
)

# Same rows with "index" as a regular column (used to build the submission).
# Derived from `test` instead of re-reading and re-cleaning the CSV.
test2 = test.reset_index()

y_pred = clf_bert.predict(test)

In [61]:
# Pair every test vacancy id with its predicted class.
submission = test2[["index"]].copy()
submission["target"] = y_pred

# Preview the first rows, then write the file without the positional row index.
display(submission.head())
submission.to_csv("result_new.csv", index=False)

Unnamed: 0,index,target
0,26461447,9621
1,26464220,3331
2,26467473,5223
3,26468989,9334
4,26471705,4222
