# Text feature extraction

### setup

##### install

In [None]:
!pip install -q transformers datasets
!pip install -q wandb

# navec embeddings
!pip install -q navec
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# repo

# develop setup
# !git config --global user.email "SECRET_MAIL@mail.ru"
# !git config --global user.name "SECRET NAME"
# !git clone https://abletobetable:SECRET_TOKEN@github.com/Abletobetable/smart-product.git

!git clone https://github.com/Abletobetable/smart-product.git

Cloning into 'smart-product'...
remote: Enumerating objects: 285, done.[K
remote: Counting objects: 100% (101/101), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 285 (delta 67), reused 59 (delta 31), pack-reused 184[K
Receiving objects: 100% (285/285), 74.95 MiB | 17.58 MiB/s, done.
Resolving deltas: 100% (175/175), done.
Updating files: 100% (15/15), done.


##### import

In [None]:
# Standard library and numeric stack.
# NOTE(review): re, json and shutil are not used in the cells visible here —
# confirm whether they are still needed.
import os
import re
import json
import shutil
import pandas as pd
import numpy as np

# Deep-learning stack: torch is used below to pick the device, transformers
# provides the tokenizer / model classes.
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification

# init run for efficient logging
# `run` is the handle later cells use to download (use_artifact) and
# upload (log_artifact) datasets.
import wandb
run = wandb.init(project="kazan_internship2023")

# load navec pretrained embeddings
# `path` is the archive fetched by wget in the install cell; the relative
# name is resolved against the current working directory.
from navec import Navec
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

# imports from repo
# Change into the cloned repository for the duration of the imports
# (presumably so that the `src` package is importable), then go back to /content.
%cd /content/smart-product
from src.text_features import create_average_navec_embed, \
                              preprocess_attributes, \
                              filter_description, \
                              concatenate_text_fields, \
                              create_model_and_trainer, \
                              get_text_features
from src.dataset_utils import create_text_datasets
%cd /content

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


/content/smart-product


  sentences: pd.Series(),
  category_ids: pd.Series(),
  product_ids: pd.Series() = None,
  def preprocess_attributes(attributes: pd.Series()) -> pd.Series():
  def preprocess_attributes(attributes: pd.Series()) -> pd.Series():
  def filter_description(descriptions: pd.Series()) -> pd.Series():
  def filter_description(descriptions: pd.Series()) -> pd.Series():
  def filter_characteristics(df: pd.DataFrame()) -> pd.Series():
  prep_title: pd.Series(),
  prep_attrib: pd.Series(),
  prep_descrip: pd.Series()) -> pd.DataFrame():


/content
cuda


### Text feature extraction

load dataset

In [None]:
# Name and version of the W&B artifact that holds the preprocessed tables.
table_dataset_version = 'processed_table:v0'

artifact = run.use_artifact(
    f'abletobetable/kazan_internship2023/{table_dataset_version}',
    type='preprocessed_data',
)
# download() returns the local directory the artifact files were written to.
# Build the file paths from it instead of repeating the artifact name by hand,
# so that `table_dataset_version` is the only value that ever has to change.
artifact_dir = artifact.download()

train_df = pd.read_csv(os.path.join(artifact_dir, 'train_processed.tsv'),
                       sep='\t')
predict_df = pd.read_csv(os.path.join(artifact_dir, 'predict_processed.tsv'),
                         sep='\t')

[34m[1mwandb[0m: Downloading large artifact processed_table:v0, 390.46MB. 2 files... 
[34m[1mwandb[0m:   2 of 2 files downloaded.  
Done. 0:0:17.1


#### shop title

average navec embeddings for shop title

In [None]:
# Output directory for the averaged navec embeddings of the shop titles.
folder_path = '/content/navec_shop_features'
# exist_ok=True keeps the cell re-runnable without a separate exists() check
# (and without the gap between checking and creating).
os.makedirs(folder_path, exist_ok=True)

X_train = create_average_navec_embed(
    navec,
    sentences=train_df['shop_title'],
    category_ids=train_df['category_id'],
    product_ids=train_df['product_id'],
    split='train',
)

# save features in .npy for further logging
np.save(f'{folder_path}/train_shop_navec_features.npy', X_train)

# NOTE(review): product ids are passed as `category_ids` for the predict
# split — presumably a placeholder because this call has no category labels
# to pass; confirm against create_average_navec_embed.
X_predict = create_average_navec_embed(
    navec,
    sentences=predict_df['shop_title'],
    category_ids=predict_df['product_id'],
    product_ids=predict_df['product_id'],
    split='test',
)

# save features in .npy for further logging
np.save(f'{folder_path}/predict_shop_navec_features.npy', X_predict)

100%|██████████| 91120/91120 [00:08<00:00, 11212.51it/s]
100%|██████████| 16860/16860 [00:01<00:00, 11471.43it/s]


log features

In [None]:
# Bundle the .npy files saved above into a W&B artifact and attach it to the run.
shop_dataset = wandb.Artifact(name="shop_features", type="preprocessed_data")
shop_dataset.add_dir(local_path=folder_path)
run.log_artifact(shop_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/navec_shop_features)... Done. 1.1s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f24288e9dc0>

#### informative text fields

bert feature extractor

##### load dataset

In [None]:
# W&B artifact (name:version) with the preprocessed text fields; used by the
# next cells both to download the artifact and to locate its .tsv files.
folder = 'text_fields_dataset:v0'

In [None]:
# Declare the text-fields artifact as an input of this run and fetch its files.
artifact = run.use_artifact(
    f'abletobetable/kazan_internship2023/{folder}',
    type='preprocessed_data',
)
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact text_fields_dataset:v0, 130.82MB. 2 files... 
[34m[1mwandb[0m:   2 of 2 files downloaded.  
Done. 0:0:6.7


In [None]:
# Read the tables from the directory returned by artifact.download() in the
# previous cell instead of a hand-built path relative to the current working
# directory, so the location is not guessed separately from the download.
prep_train_df = pd.read_csv(os.path.join(artifact_dir, 'text_fields_train.tsv'),
                            sep='\t')
prep_predict_df = pd.read_csv(os.path.join(artifact_dir, 'text_fields_predict.tsv'),
                              sep='\t')

# Build the datasets plus the label <-> id mappings; the third argument is
# the Hugging Face repository id of the feature extractor.
unsplitted_dataset, train_dataset, valid_dataset, \
    predict_dataset, label2id, id2label = create_text_datasets(
        prep_train_df,
        prep_predict_df,
        "abletobetable/text_feature_extractor"
        )

Number of labels: 874
rare categories: [12836, 11875, 11549, 12901]


Downloading (…)okenizer_config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/72899 [00:00<?, ? examples/s]

Map:   0%|          | 0/18225 [00:00<?, ? examples/s]

Map:   0%|          | 0/91124 [00:00<?, ? examples/s]

Map:   0%|          | 0/16860 [00:00<?, ? examples/s]

##### want to train BERT? (optional)

init model

In [None]:
# login for pushing to huggingface model hub
# The login command is interactive: it prompts for an access token generated
# at https://huggingface.co/settings/tokens (see the cell output below).
!pip install -q huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


I will use a pretrained BERT feature extractor from 🤗 transformers: https://huggingface.co/DeepPavlov/rubert-base-cased

In [None]:
# Build the classifier and its trainer from the DeepPavlov/rubert-base-cased
# checkpoint, using the train/validation datasets created above.
# num_labels=874 matches the "Number of labels: 874" printed when the
# datasets were built.
# NOTE(review): freeze=False presumably leaves all weights trainable and
# push_to_hub=True presumably relies on the Hugging Face login from the
# previous cell — confirm against create_model_and_trainer.
model, trainer = create_model_and_trainer(
    model_checkpoint="DeepPavlov/rubert-base-cased", 
    train_dataset=train_dataset, 
    valid_dataset=valid_dataset,
    num_epochs=3, 
    batch_size=10, 
    freeze=False, 
    num_labels=874,
    label2id=label2id, 
    id2label=id2label,
    report_to="wandb",
    push_to_hub=True,
    )

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Download file pytorch_model.bin:   0%|          | 16.5k/681M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Clean file training_args.bin:  29%|##9       | 1.00k/3.43k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/681M [00:00<?, ?B/s]

train and evaluate

In [None]:
# Run the fine-tuning; per the logged output below, checkpoints are written
# under text_feature_extractor/checkpoint-*.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, category_id, product_id. If text, __index_level_0__, category_id, product_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 72899
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 21870
  Number of trainable parameters = 178525546
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.2041,1.010948,0.752143


Saving model checkpoint to text_feature_extractor/checkpoint-1000
Configuration saved in text_feature_extractor/checkpoint-1000/config.json
Model weights saved in text_feature_extractor/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in text_feature_extractor/checkpoint-1000/tokenizer_config.json
Special tokens file saved in text_feature_extractor/checkpoint-1000/special_tokens_map.json
tokenizer config file saved in text_feature_extractor/tokenizer_config.json
Special tokens file saved in text_feature_extractor/special_tokens_map.json
Saving model checkpoint to text_feature_extractor/checkpoint-2000
Configuration saved in text_feature_extractor/checkpoint-2000/config.json
Model weights saved in text_feature_extractor/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in text_feature_extractor/checkpoint-2000/tokenizer_config.json
Special tokens file saved in text_feature_extractor/checkpoint-2000/special_tokens_map.json
tokenizer config file saved in text_feat

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1
1,1.2041,1.010948,0.752143


Saving model checkpoint to text_feature_extractor/checkpoint-8000
Configuration saved in text_feature_extractor/checkpoint-8000/config.json
Model weights saved in text_feature_extractor/checkpoint-8000/pytorch_model.bin
tokenizer config file saved in text_feature_extractor/checkpoint-8000/tokenizer_config.json
Special tokens file saved in text_feature_extractor/checkpoint-8000/special_tokens_map.json
tokenizer config file saved in text_feature_extractor/tokenizer_config.json
Special tokens file saved in text_feature_extractor/special_tokens_map.json
Saving model checkpoint to text_feature_extractor/checkpoint-9000
Configuration saved in text_feature_extractor/checkpoint-9000/config.json
Model weights saved in text_feature_extractor/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in text_feature_extractor/checkpoint-9000/tokenizer_config.json
Special tokens file saved in text_feature_extractor/checkpoint-9000/special_tokens_map.json
tokenizer config file saved in text_feat

In [None]:
# Evaluate the trained model (presumably on the valid_dataset handed to
# create_model_and_trainer — confirm).
trainer.evaluate()

##### get features

load model

Here I have already fine-tuned my BERT for classification, so I load the model from my Hugging Face model repository.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hub repository that stores the fine-tuned classifier and its tokenizer.
feature_extractor_repo = "abletobetable/text_feature_extractor"

tokenizer = AutoTokenizer.from_pretrained(feature_extractor_repo)

# Rebinds `model`, which the training cell above also assigns.
model = AutoModelForSequenceClassification.from_pretrained(feature_extractor_repo)

Downloading (…)lve/main/config.json:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

get features

In [None]:
# Output directory for the BERT features.
folder_path = '/content/bert_text_features'
# exist_ok=True keeps the cell re-runnable without a separate exists() check
# (and without the gap between checking and creating).
os.makedirs(folder_path, exist_ok=True)

# Features for the full (unsplit) training dataset.
X_train = get_text_features(unsplitted_dataset, model, DEVICE)

# save features in .npy for further logging
np.save(f'{folder_path}/train_bert_text_features.npy', X_train)

# Features for the prediction dataset.
X_predict = get_text_features(predict_dataset, model, DEVICE)

# save features in .npy for further logging
np.save(f'{folder_path}/predict_bert_text_features.npy', X_predict)

 35%|███▌      | 31917/91124 [20:35<37:57, 25.99it/s]

log features for further use

In [None]:
# Bundle the saved BERT feature files into a W&B artifact and attach it to the run.
text_dataset = wandb.Artifact(name="bert_text_features", type="preprocessed_data")
text_dataset.add_dir(local_path=folder_path)
run.log_artifact(text_dataset)

#### other fields

As with the shop title, in this case I also compute average embeddings.

In [None]:
# Declare the dict-text artifact as an input of this run and fetch its files.
artifact = run.use_artifact(
    'abletobetable/kazan_internship2023/dict_text_dataset:v0',
    type='preprocessed_data',
)
artifact_dir = artifact.download()

[34m[1mwandb[0m: \ 1 of 2 files downloaded...[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [None]:
# Read the tables from the directory returned by artifact.download() in the
# previous cell rather than from a hard-coded absolute path that repeats the
# artifact name and version.
train_dtext = pd.read_csv(os.path.join(artifact_dir, 'dict_text_train.tsv'),
                          sep='\t')
predict_dtext = pd.read_csv(os.path.join(artifact_dir, 'dict_text_predict.tsv'),
                            sep='\t')

# Output directory for the averaged navec embeddings of the characteristics.
folder_path = '/content/navec_dtext_features'
# exist_ok=True keeps the cell re-runnable without a separate exists() check.
os.makedirs(folder_path, exist_ok=True)

# NOTE(review): ids and labels are taken from train_df / predict_df while the
# sentences come from train_dtext / predict_dtext — this assumes both tables
# list the same rows in the same order; confirm.
# Missing values are replaced only in the column that is used, instead of
# filling every column of the frame first.
X_train = create_average_navec_embed(
    navec,
    sentences=train_dtext['characteristics'].fillna(''),
    category_ids=train_df['category_id'],
    product_ids=train_df['product_id'],
    split='train',
)

# save features in .npy for further logging
np.save(f'{folder_path}/train_dtext_navec_features.npy', X_train)

# NOTE(review): product ids are passed as `category_ids` for the predict
# split — presumably a placeholder because this call has no category labels
# to pass; confirm against create_average_navec_embed.
X_predict = create_average_navec_embed(
    navec,
    sentences=predict_dtext['characteristics'].fillna(''),
    category_ids=predict_df['product_id'],
    product_ids=predict_df['product_id'],
    split='test',
)

# save features in .npy for further logging
np.save(f'{folder_path}/predict_dtext_navec_features.npy', X_predict)

100%|██████████| 91120/91120 [00:05<00:00, 15978.35it/s]
100%|██████████| 16860/16860 [00:00<00:00, 19871.02it/s]


And, as usual, log the features to wandb.

In [None]:
# Bundle the saved dtext feature files into a W&B artifact and attach it to the run.
dtext_dataset = wandb.Artifact(name="dtext_features", type="preprocessed_data")
dtext_dataset.add_dir(local_path=folder_path)
run.log_artifact(dtext_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/navec_dtext_features)... Done. 1.3s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f39b3293af0>