# Классификация на эмбеддингах

In [1]:
import numpy as np
import pandas as pd
import torch
import random
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the tweets dataset from the remote CSV into a DataFrame.
# Later cells read its 'text' column (tweet body) and 'positive' column (target label).
df_tweets = pd.read_csv('https://code.s3.yandex.net/datasets/tweets.csv')

In [3]:
# Optional subsample for quick experiments (disabled): keep 400 randomly chosen rows
# and renumber the index from zero.
# ind_400 = random.sample(list(df_tweets.index), 400)
# df_tweets = df_tweets[df_tweets.index.isin(ind_400)].reset_index(drop=True)
# Show the (rows, columns) shape of the dataset as the cell output.
df_tweets.shape

(5000, 2)

In [4]:
# Tokenizer for the DeepPavlov RuBERT sentence checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-sentence')

# Encode every tweet into a list of token ids with the special tokens added.
# Sequences are truncated to 512 tokens: the model's position-embedding table has
# 512 entries, so a longer input would fail inside the forward pass.
tokenized = df_tweets['text'].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
)

# Length of the longest encoded tweet (0 when the dataset has no rows).
max_len = max(map(len, tokenized.values), default=0)

# Right-pad every sequence with id 0 so all rows share one length and form a 2-D array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

# 1 marks a real token, 0 marks padding; this relies on 0 being used only as the pad id.
attention_mask = np.where(padded != 0, 1, 0)

In [5]:
# Alternative (disabled): read the model configuration from a local JSON file.
# config = transformers.BertConfig.from_json_file(
#     '/datasets/ds_bert/bert_config.json')
# Load the pretrained BERT encoder; the checkpoint name is the same one the tokenizer uses.
model = transformers.BertModel.from_pretrained('DeepPavlov/rubert-base-cased-sentence')

In [6]:
# Automatic GPU selection is disabled below; inference is pinned to the CPU.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
device = torch.device("cpu")
# Move the model to the chosen device; as the cell's last expression, its repr is
# displayed as the cell output.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [None]:
# Number of tweets pushed through the encoder per forward pass.
batch_size = 2500
# One array of sentence embeddings per batch; concatenated by a later cell.
embeddings = []

# Ceil division: a trailing partial batch must be embedded too, otherwise the
# feature matrix ends up with fewer rows than the label column.
n_batches = (padded.shape[0] + batch_size - 1) // batch_size

for i in notebook.tqdm(range(n_batches)):
    # Row window of the current batch; slicing past the end just yields a shorter batch.
    rows = slice(batch_size * i, batch_size * (i + 1))
    batch = torch.LongTensor(padded[rows])
    attention_mask_batch = torch.LongTensor(attention_mask[rows])

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        batch_embeddings = model(batch.to(device), attention_mask=attention_mask_batch.to(device))

    # Take the first model output and keep the vector at sequence position 0 of every
    # row (the leading special token added by the tokenizer). Move it to the CPU
    # before converting to NumPy so this also works when `device` is a GPU.
    embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Stack the per-batch embedding arrays into a single feature matrix.
features = np.concatenate(embeddings)

display(features.shape)

# Feature matrix X and target vector y.
X, y = features, df_tweets['positive']

# Hold out half of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# NOTE: this rebinds the name `model`, which earlier cells use for the BERT encoder;
# re-run the model-loading cell before computing embeddings again.
# Create the logistic-regression classifier and fit it on the training half.
model = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict labels for the held-out half and report accuracy.
predictions = model.predict(X_test)

print(accuracy_score(y_test, predictions))