# Text feature extraction

### setup

###### install

In [1]:
!pip install -q transformers datasets
!pip install -q wandb

# navec embeddings
!pip install -q navec
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# repo
!git config --global user.email "abletobetable@mail.ru"
!git config --global user.name "Aleksandr Lokis"
!git clone https://abletobetable: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@github.com/Abletobetable/smart-product.git

Cloning into 'smart-product'...
remote: Enumerating objects: 253, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 253 (delta 46), reused 45 (delta 22), pack-reused 184[K
Receiving objects: 100% (253/253), 74.92 MiB | 34.67 MiB/s, done.
Resolving deltas: 100% (154/154), done.


###### import

In [3]:
import os
import re
import json
import shutil
import pandas as pd
import numpy as np

import torch
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification

# init run for efficient logging
import wandb
run = wandb.init(project="kazan_internship2023")

# load navec pretrained embeddings
from navec import Navec
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

# imports from repo
%cd /content/smart-product
from src.text_features import create_average_navec_embed, \
                              preprocess_attributes, \
                              filter_description, \
                              concatenate_text_fields, \
                              create_model_and_trainer, \
                              get_text_features
from src.dataset_utils import create_text_datasets
# from src.features_for_images import create_model_and_trainer, get_image_features
%cd /content

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


/content/smart-product


  sentences: pd.Series(),
  category_ids: pd.Series(),
  product_ids: pd.Series() = None,
  def preprocess_attributes(attributes: pd.Series()) -> pd.Series():
  def preprocess_attributes(attributes: pd.Series()) -> pd.Series():
  def filter_description(descriptions: pd.Series()) -> pd.Series():
  def filter_description(descriptions: pd.Series()) -> pd.Series():
  prep_title: pd.Series(),
  prep_attrib: pd.Series(),
  prep_descrip: pd.Series()) -> pd.DataFrame():


/content
cpu


### Text feature extraction

load dataset

In [None]:
# Versioned W&B artifact that holds the preprocessed train/predict tables.
table_dataset_version = 'processed_table:v0'

artifact = run.use_artifact(
    f'abletobetable/kazan_internship2023/{table_dataset_version}',
    type='preprocessed_data',
)
# download() returns the local directory the files were written to. Build the file
# paths from it so they follow `table_dataset_version` instead of a hard-coded
# '/content/artifacts/processed_table:v0' location.
artifact_dir = artifact.download()

train_df = pd.read_csv(os.path.join(artifact_dir, 'train_processed.tsv'), sep='\t')
predict_df = pd.read_csv(os.path.join(artifact_dir, 'predict_processed.tsv'), sep='\t')

#### shop title

navec approach

In [None]:
# Output folder for the navec shop-title features.
folder_path = '/content/navec_shop_features'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(folder_path, exist_ok=True)

X_train = create_average_navec_embed(navec,
                                     sentences=train_df['shop_title'],
                                     category_ids=train_df['category_id'],
                                     product_ids=train_df['product_id'],
                                     split='train')

# save features in .npy for further logging
np.save(f'{folder_path}/train_shop_navec_features.npy', X_train)

# NOTE(review): `category_ids` receives the product_id column here. Presumably this
# is a placeholder because the predict table carries no labels — confirm against
# create_average_navec_embed in src/text_features.py before relying on that column.
X_predict = create_average_navec_embed(navec,
                                       sentences=predict_df['shop_title'],
                                       category_ids=predict_df['product_id'],
                                       product_ids=predict_df['product_id'],
                                       split='test')

# save features in .npy for further logging
np.save(f'{folder_path}/predict_shop_navec_features.npy', X_predict)

100%|██████████| 91120/91120 [00:08<00:00, 11212.51it/s]
100%|██████████| 16860/16860 [00:01<00:00, 11471.43it/s]


log features

In [None]:
# Wrap the folder with the saved .npy files in a W&B artifact and log it on the run.
shop_dataset = wandb.Artifact(name="shop_features", type="preprocessed_data")
shop_dataset.add_dir(folder_path)
run.log_artifact(shop_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/navec_shop_features)... Done. 1.1s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f24288e9dc0>

#### informative text fields

bert feature extractor

###### load dataset

In [None]:
# W&B artifact reference (name:version) for the preprocessed text fields;
# used below both in the artifact lookup and in the local file paths.
folder = 'text_fields_dataset:v0'

In [None]:
# Look up the text-fields artifact on the current run and download its files locally.
artifact = run.use_artifact(
    f'abletobetable/kazan_internship2023/{folder}',
    type='preprocessed_data',
)
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact text_fields_dataset:v0, 130.82MB. 2 files... 
[34m[1mwandb[0m:   2 of 2 files downloaded.  
Done. 0:0:2.2


In [None]:
# Read the preprocessed text fields from the directory returned by
# artifact.download() rather than from a path relative to the current directory.
prep_train_df = pd.read_csv(os.path.join(artifact_dir, 'text_fields_train.tsv'), sep='\t')
prep_predict_df = pd.read_csv(os.path.join(artifact_dir, 'text_fields_predict.tsv'), sep='\t')

# The third positional argument is a Hugging Face checkpoint name — presumably the
# tokenizer source (tokenizer files are downloaded when this runs); confirm in
# src/dataset_utils.py.
(
    unsplitted_dataset,
    train_dataset,
    valid_dataset,
    predict_dataset,
    label2id,
    id2label,
) = create_text_datasets(
    prep_train_df,
    prep_predict_df,
    "abletobetable/text_feature_extractor",
)

Number of labels: 874
rare categories: [12836, 11875, 11549, 12901]


Downloading (…)okenizer_config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/72899 [00:00<?, ? examples/s]

Map:   0%|          | 0/18225 [00:00<?, ? examples/s]

Map:   0%|          | 0/91124 [00:00<?, ? examples/s]

Map:   0%|          | 0/16860 [00:00<?, ? examples/s]

###### optional: fine-tune BERT

init model

In [None]:
# for pushing to huggingface model hub
!pip install -q huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Fine-tuning settings gathered in one mapping so they can be scanned at a glance.
trainer_config = dict(
    model_checkpoint="DeepPavlov/rubert-base-cased",
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    num_epochs=10,
    batch_size=8,
    freeze=False,
    # NOTE(review): 874 matches the "Number of labels" printed when the datasets were
    # built; presumably equal to len(label2id) — confirm before deriving it.
    num_labels=874,
    label2id=label2id,
    id2label=id2label,
    report_to="wandb",
    push_to_hub=True,
)
model, trainer = create_model_and_trainer(**trainer_config)

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

train and evaluate

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: category_id, product_id, text, __index_level_0__. If category_id, product_id, text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 72899
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 91130
  Number of trainable parameters = 178525546
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.3285,1.329662,0.687235


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: category_id, product_id, text, __index_level_0__. If category_id, product_id, text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 18225
  Batch size = 8
  metric = load_metric('f1')


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Saving model checkpoint to text_feature_extractor/checkpoint-9113
Configuration saved in text_feature_extractor/checkpoint-9113/config.json
Model weights saved in text_feature_extractor/checkpoint-9113/pytorch_model.bin
tokenizer config file saved in text_feature_extractor/checkpoint-9113/tokenizer_config.json
Special tokens file saved in text_feature_extractor/checkpoint-9113/special_tokens_map.json
tokenizer config file saved in text_feature_extractor/tokenizer_config.json
Special tokens file saved in text_feature_extractor/special_tokens_map.json


Epoch,Training Loss,Validation Loss,F1
1,1.3285,1.329662,0.687235


In [None]:
# Evaluate the trained model — presumably on the `valid_dataset` the trainer was
# created with; confirm in create_model_and_trainer.
trainer.evaluate()

###### get features

load model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and classifier weights are both pulled from the same Hub repository.
feature_extractor_checkpoint = "abletobetable/text_feature_extractor"
tokenizer = AutoTokenizer.from_pretrained(feature_extractor_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(feature_extractor_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

get features

In [None]:
# Output folder for the BERT text features.
folder_path = '/content/bert_text_features'
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(folder_path, exist_ok=True)

# Features for the full (unsplit) training data.
X_train = get_text_features(unsplitted_dataset, model, DEVICE)

# save features in .npy for further logging
np.save(f'{folder_path}/train_bert_text_features.npy', X_train)

# Features for the prediction data.
X_predict = get_text_features(predict_dataset, model, DEVICE)

# save features in .npy for further logging
np.save(f'{folder_path}/predict_bert_text_features.npy', X_predict)

 35%|███▌      | 31917/91124 [20:35<37:57, 25.99it/s]

log features

In [None]:
# Wrap the folder with the saved .npy files in a W&B artifact and log it on the run.
text_dataset = wandb.Artifact(name="bert_text_features", type="preprocessed_data")
text_dataset.add_dir(folder_path)
run.log_artifact(text_dataset)

#### other fields

# save in git

In [None]:
cd smart-product

/content/smart-product


In [None]:
# Show which files in the working tree differ from the last commit.
!git status

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   src/text_features.py[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Stage only the edited helper module.
!git add src/text_features.py 

In [None]:
# Commit the staged change.
!git commit -m "add get_text_features"

[main 0eba3e9] add get_text_features
 1 file changed, 56 insertions(+)


In [None]:
!git push https://abletobetable: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@github.com/Abletobetable/smart-product.git

Enumerating objects: 7, done.
Counting objects:  14% (1/7)Counting objects:  28% (2/7)Counting objects:  42% (3/7)Counting objects:  57% (4/7)Counting objects:  71% (5/7)Counting objects:  85% (6/7)Counting objects: 100% (7/7)Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects:  25% (1/4)Compressing objects:  50% (2/4)Compressing objects:  75% (3/4)Compressing objects: 100% (4/4)Compressing objects: 100% (4/4), done.
Writing objects:  25% (1/4)Writing objects:  50% (2/4)Writing objects:  75% (3/4)Writing objects: 100% (4/4)Writing objects: 100% (4/4), 955 bytes | 955.00 KiB/s, done.
Total 4 (delta 3), reused 0 (delta 0)
remote: Resolving deltas:   0% (0/3)[Kremote: Resolving deltas:  33% (1/3)[Kremote: Resolving deltas:  66% (2/3)[Kremote: Resolving deltas: 100% (3/3)[Kremote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/Abletobetable/smart-product.git
   722b4b4..0eba3e9  main 