# Text Processing

### setup

###### install

In [1]:
!pip install -q transformers datasets
!pip install -q wandb

# navec embeddings
!pip install -q navec
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# repo: set the global git user identity, then clone the smart-product
# repository into the current working directory.
!git config --global user.email "SECRET_NAME@mail.ru"
!git config --global user.name "SECRET NAME"
!git clone https://github.com/Abletobetable/smart-product.git

Cloning into 'smart-product'...
remote: Enumerating objects: 257, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 257 (delta 47), reused 44 (delta 22), pack-reused 184[K
Receiving objects: 100% (257/257), 74.94 MiB | 27.07 MiB/s, done.
Resolving deltas: 100% (155/155), done.


###### import

In [3]:
import os
import re
import json
import shutil
import pandas as pd
import numpy as np

import torch
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Start a Weights & Biases run; `run` is used further down to download the
# input artifact and to log the produced datasets.
import wandb
run = wandb.init(project="kazan_internship2023")

# Load the pretrained navec embeddings from the archive; the path is relative,
# so the file must be in the current working directory.
from navec import Navec
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

# Import the text helpers from the cloned repo. The working directory is
# switched to the repo root just for the import (presumably so that the `src`
# package is importable) and then switched back to /content.
%cd /content/smart-product
from src.text_features import create_average_navec_embed, \
                              preprocess_attributes, \
                              filter_description, \
                              concatenate_text_fields, \
                              filter_characteristics
%cd /content

# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


/content/smart-product
/content
cpu


  sentences: pd.Series(),
  category_ids: pd.Series(),
  product_ids: pd.Series() = None,
  def preprocess_attributes(attributes: pd.Series()) -> pd.Series():
  def preprocess_attributes(attributes: pd.Series()) -> pd.Series():
  def filter_description(descriptions: pd.Series()) -> pd.Series():
  def filter_description(descriptions: pd.Series()) -> pd.Series():
  prep_title: pd.Series(),
  prep_attrib: pd.Series(),
  prep_descrip: pd.Series()) -> pd.DataFrame():


### text processing

##### load preprocessed dataset

In [4]:
# W&B artifact reference ("name:version") of the preprocessed table to load;
# it is appended to the entity/project prefix when the artifact is requested.
table_dataset_version = 'processed_table:v0'

In [5]:
# Register the preprocessed table as an input of this run, then download its
# files; `artifact_dir` receives whatever download() returns.
artifact = run.use_artifact(
    f'abletobetable/kazan_internship2023/{table_dataset_version}',
    type='preprocessed_data',
)
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact processed_table:v0, 390.46MB. 2 files... 
[34m[1mwandb[0m:   2 of 2 files downloaded.  
Done. 0:0:6.2


In [6]:
# Read the train / predict tables from the directory returned by
# artifact.download(). Building the paths from `artifact_dir` keeps them in
# sync with `table_dataset_version`; the previous hard-coded
# "/content/artifacts/processed_table:v0" ignored a changed version.
train_df = pd.read_csv(os.path.join(artifact_dir, 'train_processed.tsv'),
                       sep='\t')
predict_df = pd.read_csv(os.path.join(artifact_dir, 'predict_processed.tsv'),
                         sep='\t')

In [None]:
# Show which columns the preprocessed train table provides.
train_df.columns

Index(['product_id', 'category_id', 'sale', 'shop_id', 'shop_title', 'rating',
       'text_fields', 'title', 'description', 'attributes',
       'custom_characteristics', 'defined_characteristics', 'filters', 'path'],
      dtype='object')

#### work with data

##### informative text fields (title, description and attributes)

processing

In [None]:
# Titles are taken as they are; no cleaning helper is applied to them here.
prep_title_train = train_df['title']
prep_title_predict = predict_df['title']

# Attributes are passed through the repo helper preprocess_attributes.
prep_attrib_train = preprocess_attributes(train_df['attributes'])
prep_attrib_predict = preprocess_attributes(predict_df['attributes'])

# Descriptions are passed through the repo helper filter_description.
prep_descrip_train = filter_description(train_df['description'])
prep_descrip_predict = filter_description(predict_df['description'])

# Combine the three prepared fields per split. The train call passes both
# category_id and product_id; the predict call passes product_id only.
concat_text_fields_train = concatenate_text_fields(
    train_df[['category_id', 'product_id']],
    prep_title_train,
    prep_attrib_train,
    prep_descrip_train,
)

concat_text_fields_predict = concatenate_text_fields(
    predict_df['product_id'],
    prep_title_predict,
    prep_attrib_predict,
    prep_descrip_predict,
)

100%|██████████| 91120/91120 [00:01<00:00, 69236.40it/s]
100%|██████████| 16860/16860 [00:00<00:00, 69843.48it/s]
100%|██████████| 91120/91120 [00:02<00:00, 43824.51it/s]
100%|██████████| 16860/16860 [00:00<00:00, 64112.51it/s]


save dataset

In [None]:
# Output folder for the concatenated text-field tables.
folder_path = '/content/text_fields_dataset'
# exist_ok=True makes the cell safe to re-run: an already existing folder is
# accepted, so no separate existence check is needed.
os.makedirs(folder_path, exist_ok=True)

# Write both splits as tab-separated files without the pandas index column.
concat_text_fields_train.to_csv(f'{folder_path}/text_fields_train.tsv',
                                sep='\t', index=False)

concat_text_fields_predict.to_csv(f'{folder_path}/text_fields_predict.tsv',
                                  sep='\t', index=False)

log processed text dataset

In [None]:
# Package the output folder as a W&B artifact named "text_fields_dataset" and
# log it to the current run.
# NOTE(review): relies on `folder_path` still holding the text-fields folder;
# a later cell reassigns that name, so run the cells in order.
text_dataset = wandb.Artifact("text_fields_dataset", type="preprocessed_data")
text_dataset.add_dir(folder_path)
run.log_artifact(text_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/text_fields_dataset)... Done. 0.9s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f571e24b0d0>

##### other information

Every field (if it is not empty) is a dict,
so I take every key from these fields and concatenate them into one string.

In [54]:
# Build one table per split from the output of filter_characteristics, then
# attach the identifier columns. The train table gets category_id and
# product_id; the predict table gets product_id only.
dict_text_train = pd.DataFrame(filter_characteristics(train_df))
dict_text_train['category_id'] = train_df['category_id']
dict_text_train['product_id'] = train_df['product_id']

dict_text_predict = pd.DataFrame(filter_characteristics(predict_df))
dict_text_predict['product_id'] = predict_df['product_id']

# Output folder for the dict-text tables. exist_ok=True makes the cell safe to
# re-run: an already existing folder is accepted without a separate check.
folder_path = '/content/dict_text_dataset'
os.makedirs(folder_path, exist_ok=True)

# Write both splits as tab-separated files without the pandas index column.
dict_text_train.to_csv(f'{folder_path}/dict_text_train.tsv',
                       sep='\t', index=False)

dict_text_predict.to_csv(f'{folder_path}/dict_text_predict.tsv',
                         sep='\t', index=False)

100%|██████████| 91120/91120 [00:02<00:00, 40735.77it/s]
100%|██████████| 16860/16860 [00:00<00:00, 41482.43it/s]


log processed dict text dataset

In [55]:
# Package the output folder as a W&B artifact named "dict_text_dataset" and
# log it to the current run.
# NOTE(review): relies on `folder_path` holding the dict-text folder set in the
# preceding cell; run the cells in order.
text_dataset = wandb.Artifact("dict_text_dataset", type="preprocessed_data")
text_dataset.add_dir(folder_path)
run.log_artifact(text_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/dict_text_dataset)... Done. 0.1s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f22cef2f400>