<b> Данный блокнот производит обработку текстовой информации. Мы хотим получить текстовые эмбеддинги с помощью трансформера "DeepPavlov/rubert-base-cased" 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

df_train = pd.read_csv('train.csv', index_col=0)

print(f"Train shape: {df_train.shape}")


Train shape: (197198, 44)


In [3]:
import transformers
import html
from bs4 import BeautifulSoup
import re
import torch
from tqdm import tqdm
from sklearn.preprocessing import OrdinalEncoder




def full_preprocess_data(df, encoder = None):

    def preprocess_text(text):
        # Декодировать HTML-сущности
        text = html.unescape(text)
        # Удалить HTML-теги
        text = BeautifulSoup(text, "html.parser").get_text()
        # Удалить специальные символы и нормализовать пробелы
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?]', '', text)
        # Удалить лишние пробелы
        text = text.strip()
        return text

    def preprocess_function(text):
        
        text = preprocess_text(text)
        
        encoded_input = tokenizer(
        text,
        return_tensors='pt',           # Возвращать PyTorch тензоры
        padding=True,                   # Добивать до максимальной длины в батче
        truncation=True,                # Обрезать слишком длинные тексты
        max_length=512,                 # Максимальная длина
        add_special_tokens=True)
        return encoded_input


    df["description"].fillna('[UNK]', inplace=True)
    df["name_rus"].fillna("[UNK]", inplace= True)
        
    model_name = "DeepPavlov/rubert-base-cased"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModel.from_pretrained(model_name)

    df_text = df["name_rus"] + " " + df["description"]
    text = df_text.to_list() 
    
    preprocess_data = list(map(preprocess_function, text))

    df["rating_1_count"].fillna(0, inplace = True) 
    df["rating_2_count"].fillna(0, inplace = True)
    df["rating_3_count"].fillna(0, inplace = True)
    df["rating_4_count"].fillna(0, inplace = True)
    df["rating_5_count"].fillna(0, inplace = True)
    df["comments_published_count"].fillna(0 , inplace = True)
    df["photos_published_count"].fillna(0 , inplace = True)
    df["videos_published_count"].fillna(0 , inplace = True)


    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    model.to(device)
    print(f"Using device: {device}")


    embeddings = []
    for item in tqdm(preprocess_data, desc="Processing embeddings"):
        item = {key: value.to(device) for key, value in item.items()}
        with torch.no_grad():
            outputs = model(item["input_ids"], item["token_type_ids"], item["attention_mask"])
        embeddings.append(outputs.pooler_output.cpu().numpy().squeeze())


    
    cols = ['brand_name', 'CommercialTypeName4']

    if encoder is not None:
        df[cols] = encoder.transform(df[cols])
    else:
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1)
        df[cols] = encoder.fit_transform(df[cols])


    df = df.drop(columns=['name_rus', 'description'], inplace=False)
    embedding_df = pd.DataFrame(embeddings, index=df.index)
    df_combined = pd.concat([df, embedding_df], axis=1)
    df_combined.to_csv('ready_df.csv', index=False)

    return df_combined, encoder

In [5]:
df_train["description"].fillna('unknown', inplace=True)
df_train["name_rus"].fillna("unknown", inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["description"].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["name_rus"].fillna("unknown", inplace= True)


<b> Объединяем колонки "name_rus"  и "description" ,чтобы все вместе подать на вход языковой модели. 

In [12]:
df_text = df_train["name_rus"] + " " + df_train["description"] 

In [14]:
text = df_text.to_list() 

In [16]:
import html
from bs4 import BeautifulSoup
import re
import torch
from tqdm import tqdm


def preprocess_text(text):
    # Декодировать HTML-сущности
    text = html.unescape(text)
    # Удалить HTML-теги
    text = BeautifulSoup(text, "html.parser").get_text()
    # Удалить специальные символы и нормализовать пробелы
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?]', '', text)
    # Удалить лишние пробелы
    text = text.strip()
    return text

def preprocess_function(text):
    
    text = preprocess_text(text)
    
    encoded_input = tokenizer(
    text,
    return_tensors='pt',           # Возвращать PyTorch тензоры
    padding=True,                   # Добивать до максимальной длины в батче
    truncation=True,                # Обрезать слишком длинные тексты
    max_length=512,                 # Максимальная длина
    add_special_tokens=True)
    return encoded_input
    

In [88]:
preprocess_data = list(map(preprocess_function, text))

<b> Заполняем NANы 

In [19]:
df_train["rating_1_count"].fillna(0, inplace = True) 
df_train["rating_2_count"].fillna(0, inplace = True)
df_train["rating_3_count"].fillna(0, inplace = True)
df_train["rating_4_count"].fillna(0, inplace = True)
df_train["rating_5_count"].fillna(0, inplace = True)
df_train["comments_published_count"].fillna(0 , inplace = True)
df_train["photos_published_count"].fillna(0 , inplace = True)
df_train["videos_published_count"].fillna(0 , inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["rating_1_count"].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["rating_2_count"].fillna(0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [None]:
import torch
from tqdm import tqdm

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)
print(f"Using device: {device}")

embeddings = []
for item in tqdm(preprocess_data, desc="Processing embeddings"):
    item = {key: value.to(device) for key, value in item.items()}
    with torch.no_grad():
        outputs = model(item["input_ids"], item["token_type_ids"], item["attention_mask"])
    embeddings.append(outputs.pooler_output.cpu().numpy())

Using device: mps


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing embeddings: 100%|██████████| 197198/197198 [39:06<00:00, 84.04it/s] 


<b> Выделяем категориальные признаки 

In [22]:
cols = ['brand_name', 'CommercialTypeName4']
df_text1= df_train[cols]

In [24]:
df_text1.head()

Unnamed: 0_level_0,brand_name,CommercialTypeName4
id,Unnamed: 1_level_1,Unnamed: 2_level_1
159385,ACTRUM,Пылесборник
288616,Red Line,Крышка для объектива
108090,Talwar Brothers,Аксессуар для музыкального инструмента
415607,,Видеоигра
332391,,Видеоигра


In [47]:
df_train.to_csv("data_text.csv")

In [28]:
from sklearn.preprocessing import OrdinalEncoder

cols = ['brand_name', 'CommercialTypeName4']
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1 , encoded_missing_value= -1)
df_train[cols] = encoder.fit_transform(df_train[cols])

print(df_train[cols].head())

        brand_name  CommercialTypeName4
id                                     
159385        73.0                449.0
288616      2615.0                306.0
108090      3095.0                 42.0
415607        -1.0                115.0
332391        -1.0                115.0


In [53]:
embeddings = [emb.squeeze() for emb in embeddings]

In [32]:
df_train_new = df_train.drop(columns=['name_rus', 'description'], inplace=False)


In [None]:
embedding_df = pd.DataFrame(embeddings, index=df_train_new.index)
df_combined = pd.concat([df_train_new, embedding_df], axis=1)
print(df_combined.head())

In [61]:
df_combined.to_csv('train_with_textembeddings.csv', index=False)

In [66]:
#test pipeline
df_test = pd.read_csv('test.csv', index_col=0)
print(f"Test shape: {df_test.shape}")

df_test_processed_with_text_embeddings, encoder = full_preprocess_data(df_test, encoder)

Test shape: (22760, 43)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["description"].fillna('[UNK]', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["name_rus"].fillna("[UNK]", inplace= True)
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', '

Using device: mps


Processing embeddings: 100%|██████████| 22760/22760 [04:36<00:00, 82.38it/s]


Unnamed: 0_level_0,brand_name,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,...,758,759,760,761,762,763,764,765,766,767
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17384,-1.0,347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.536627,0.476969,0.65909,0.985664,0.565392,-0.301035,0.454583,-0.261645,0.671193,0.321908
260316,415.0,242.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.256805,0.66666,0.329477,0.989858,0.573546,0.160044,0.571014,-0.36965,0.722612,0.62553
10610,566.0,479.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.297294,0.698494,0.308386,0.990717,0.564226,0.218557,0.451757,-0.347273,0.741989,0.590884
205236,819.0,330.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.338254,0.435517,0.55227,0.971449,0.498337,-0.316467,0.401764,-0.237847,0.626943,0.165919
308655,1386.0,558.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.282907,0.269583,0.643121,0.869446,0.53867,-0.154961,0.404107,-0.152066,0.542762,0.146707
