In [1]:
import torch
import warnings
import numpy as np
import transformers
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from tqdm import notebook
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Add, dot, LSTM
warnings.filterwarnings("ignore")

Google Research BERT repository: https://github.com/google-research/bert

Hugging Face Transformers `BertModel` documentation: https://huggingface.co/transformers/model_doc/bert.html#bertmodel

In [2]:
# Read the train and test datasets that hold the normalized (stemmed) tender descriptions.
train_data_desc_stemmed = pd.read_csv(
    '../data/intermid/train_data_desc_stemmed.csv'
)
test_data_desc_stemmed = pd.read_csv(
    '../data/intermid/test_data_desc_stemmed.csv'
)

# Stack both datasets into one frame; ignore_index=True renumbers the rows 0..N-1.
data = pd.concat(
    [train_data_desc_stemmed, test_data_desc_stemmed],
    ignore_index=True,
)

In [3]:
# Keep only the first 1000 rows (positional slice) for the rest of the notebook.
data = data.iloc[:1000]

In [4]:
# Model initialisation.
# The tokenizer vocabulary and the config are read from the local
# uncased_L-12_H-768_A-12 directory; the model weights are loaded by the
# name 'bert-base-uncased'.
# NOTE(review): `config` is not passed to the model in this cell — confirm
# whether it is needed elsewhere.
tokenizer = transformers.BertTokenizer(
    vocab_file='../download_models/uncased_L-12_H-768_A-12/vocab.txt',
)
config = transformers.BertConfig.from_json_file(
    '../download_models/uncased_L-12_H-768_A-12/bert_config.json',
)
model = transformers.BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Encode every description into a list of token ids (special tokens included).
# truncation=True makes the 500-token cap explicit; without it the tokenizer
# falls back to the same 'longest_first' strategy but logs a warning per run.
tokenized = data['text_description_tender_stemmed'].apply(
    lambda x: tokenizer.encode(
        x, max_length=500, truncation=True, add_special_tokens=True
    )
)
print("Tokenized head:", tokenized.head())

# Padding: bring all sequences to the length of the longest one by filling the
# tail with 0 (presumably the [PAD] id of this vocabulary — confirm against vocab.txt).
# The rows are filled positionally, so this does not depend on the Series index
# labels being exactly 0..N-1.
n = max(map(len, tokenized))
padded = np.zeros((len(tokenized), n), dtype=np.int64)
for row, ids in enumerate(tokenized):
    padded[row, :len(ids)] = ids
tokenized = padded

# Attention mask: 1 where the token id is non-zero, 0 where it is zero (padding).
attention_mask = np.where(tokenized != 0, 1, 0)
print("Attention mask:", attention_mask.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Tokenized head: 0    [101, 1198, 29747, 29436, 29748, 29741, 10260,...
1    [101, 1194, 16856, 10325, 14150, 29740, 16856,...
2    [101, 1194, 16856, 10325, 14150, 29740, 16856,...
3    [101, 1194, 16856, 14150, 29742, 29436, 15290,...
4    [101, 1194, 16856, 10325, 14150, 29740, 16856,...
Name: text_description_tender_stemmed, dtype: object
Attention mask: (1000, 500)


In [6]:
# Run the model over the padded token matrix batch by batch and keep, for every
# sequence, the hidden vector at position 0 of the first model output.
batch_size = 100
embeddings = []
# Iterate over row offsets instead of "number of full batches": this way a
# final, shorter batch is still processed when the number of rows is not a
# multiple of batch_size (previously those rows were silently dropped).
for start in notebook.tqdm(range(0, tokenized.shape[0], batch_size)):
    stop = start + batch_size
    # Convert the token ids and the matching mask rows to tensors.
    batch = torch.LongTensor(tokenized[start:stop])
    attention_mask_batch = torch.LongTensor(attention_mask[start:stop])
    # Gradients are not needed: the model is only used for inference here.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)
    # First output, first token of each sequence -> one vector per row.
    embeddings.append(batch_embeddings[0][:, 0, :].numpy())

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# Display the list of per-batch embedding arrays collected by the batching loop.
embeddings

[array([[-0.4706191 ,  0.12665111, -0.13658667, ..., -0.11170372,
          0.3024224 ,  0.3312525 ],
        [-0.23084061,  0.34706017,  0.09129888, ..., -0.02850064,
          0.5029464 ,  0.2785562 ],
        [-0.37292403,  0.20395312, -0.11883457, ..., -0.03590769,
          0.41513634,  0.3958849 ],
        ...,
        [-0.3669382 ,  0.32654774,  0.00282824, ..., -0.02264736,
          0.45594034,  0.30381262],
        [-0.31806135,  0.2796135 ,  0.00537187, ..., -0.05984727,
          0.39598483,  0.31441912],
        [-0.5517641 ,  0.19259205, -0.31420934, ..., -0.02630982,
          0.5279942 ,  0.36230895]], dtype=float32),
 array([[-0.26976913,  0.33320117, -0.04327675, ...,  0.00703785,
          0.47228822,  0.24412946],
        [-0.29748487,  0.3178135 ,  0.03755035, ..., -0.13640223,
          0.33707598,  0.32772842],
        [-0.19031663,  0.3059399 ,  0.03691196, ...,  0.04728355,
          0.4600698 ,  0.21634683],
        ...,
        [-0.2665012 ,  0.4299869 , -0.0

In [8]:
# Join the per-batch arrays along the first axis into a single matrix
# and report its shape.
vectors = np.concatenate(embeddings, axis=0)
print(vectors.shape)

(1000, 768)


In [9]:
# Reduce the embedding matrix to 100 components with truncated SVD
# (fixed random_state for repeatable results).
X_reduced = (
    TruncatedSVD(n_components=100, random_state=0)
    .fit_transform(vectors)
)
# Attach the reduced components to the lot ids; the join aligns rows on the index.
bert_vectors_redused = data[['pn_lot_id']].join(pd.DataFrame(X_reduced))

In [10]:
# Display the frame with pn_lot_id followed by the reduced components.
bert_vectors_redused

Unnamed: 0,pn_lot_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,7031618,13.501331,3.237092,0.807673,0.404966,-0.608116,0.231575,0.400840,0.482247,-0.176830,...,-0.211987,0.090788,-0.105622,0.064085,-0.020659,0.105589,0.019773,0.014205,-0.241538,-0.019701
1,7808247,13.664414,-1.028028,-0.967501,-0.708337,0.151037,0.106892,0.307951,-0.008613,-0.559414,...,0.072642,-0.200774,-0.179768,0.009964,-0.066842,-0.054728,-0.063379,-0.000343,-0.037718,0.016285
2,7009496,13.553486,0.633898,-1.478396,-1.369439,0.757321,0.090601,0.221957,-0.206632,-0.395499,...,0.038435,0.022619,-0.127503,-0.078644,-0.020889,0.016643,-0.000573,-0.070886,0.052449,-0.052902
3,5938735,13.667197,1.453499,-0.643730,-1.013408,-0.046867,0.220003,0.138429,-0.544248,0.108580,...,0.081264,0.110336,0.037457,0.030405,-0.092684,0.045929,0.023358,0.021979,0.014179,0.095570
4,9327348,13.549007,-0.552627,-0.998366,0.032463,-0.305187,-0.602238,-0.100203,0.113755,-0.626121,...,0.005484,-0.008763,-0.079150,-0.048439,-0.016995,-0.041554,-0.128636,0.074861,-0.096709,0.057576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,8691563,13.832704,-0.307220,0.282596,-0.347251,0.236381,-0.268431,0.173621,-0.141698,0.246494,...,0.091687,0.010458,-0.050586,-0.063858,0.036700,0.052563,-0.093054,-0.030072,0.000586,0.074759
996,1951794,13.724087,-0.238426,-0.147065,-0.403062,-0.012487,0.430822,0.930970,0.031606,-0.273080,...,0.000838,0.002463,0.102267,0.055150,0.067703,0.046478,0.102742,0.083731,0.021148,0.157411
997,3464251,13.824816,0.430473,-0.131659,0.130567,0.206427,0.280987,0.936840,-0.142142,0.022244,...,0.008341,-0.148073,0.043334,-0.042757,0.011266,0.070051,0.022419,0.070192,0.049736,-0.012234
998,4093085,13.370934,-1.619873,-0.727181,-0.110802,0.524483,0.256816,0.862570,-0.115578,-0.375329,...,-0.179954,-0.036437,0.056522,0.111130,0.068333,0.024813,0.029127,0.005879,0.034918,0.054619


In [11]:
# Write the reduced vectors to CSV; the DataFrame index is not written.
bert_vectors_redused.to_csv(
    '../data/intermid/bert_vectors_redused.csv',
    encoding='utf-8',
    index=False,
)