In [None]:
!pip install transformers

In [None]:
!pip install -U sentence-transformers

In [3]:
import numpy as np
import pandas as pd
import sklearn
import torch

from tqdm.notebook import tqdm
# Register tqdm's pandas integration; the `progress_apply` calls below depend on it.
tqdm.pandas()

# Данные

In [4]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training table. Later cells read its `discourse_text`, `discourse_type`
# and `discourse_effectiveness` columns.
df_train = pd.read_csv('train.csv')

# Векторизация текстов

В качестве энкодера для текстов взял BERT, дообученный на датасете CoLA. Датасет создан для оценки грамматической приемлемости ("грамматичности") текста, что кажется довольно близкой задачей.

In [None]:
# The tokenizer and the sequence-classification model are loaded from the same
# CoLA fine-tuned checkpoint, so the name is kept in one place.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

COLA_CHECKPOINT = "textattack/bert-base-uncased-CoLA"

tokenizer = AutoTokenizer.from_pretrained(COLA_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(COLA_CHECKPOINT)

In [None]:
def get_cola_embedding(text, tokenizer, model):
  """Return the pooled encoder embedding of ``text`` as a NumPy array.

  Args:
    text: the string to embed.
    tokenizer: Hugging Face-style tokenizer; it is called with a one-element
      list and ``return_tensors="pt"``.
    model: model exposing ``base_model``, whose output contains a
      ``'pooler_output'`` entry.

  Returns:
    ``numpy.ndarray`` holding the first (and only) row of ``pooler_output``.
  """
  # Let the tokenizer truncate to 512 tokens. Slicing `input_ids` by hand
  # would cut off the special token the tokenizer appends at the end of long texts.
  encoded = tokenizer([text], return_tensors="pt", truncation=True, max_length=512)

  # Inference only: without no_grad the output tracks gradients and `.numpy()`
  # raises, so the function must not depend on the caller disabling autograd.
  with torch.no_grad():
    outputs = model.base_model(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
    )

  return outputs['pooler_output'][0].numpy()

In [None]:
# Embed every discourse text with the CoLA encoder; autograd is switched off
# because this is inference only.
with torch.no_grad():
  df_train['text_embedded'] = df_train['discourse_text'].progress_apply(
      lambda discourse_text: get_cola_embedding(discourse_text, tokenizer, model)
  )

In [None]:
# Plain Python list of the per-row text embeddings, used later to build the feature vectors.
train_texts_embeddings = df_train['text_embedded'].tolist()

In [None]:
# Preview the first five rows, including the new embedding column.
df_train.head()

Unnamed: 0.1,Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,text_embedded
0,0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,[-0.17674124 -0.14251244 -0.7324696 0.016722...
1,1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,[-2.43735537e-01 -7.61663690e-02 -9.62651372e-...
2,2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,[-7.33554721e-01 -9.37814564e-02 -2.42979348e-...
3,3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,[ 2.82963723e-01 1.14817008e-01 6.51097298e-...
4,4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,[-7.25191891e-01 -1.71787202e-01 -5.41366756e-...


In [None]:
# Disabled code kept as a bare string literal, so none of it executes.
# It mounts Drive, reloads a previously saved CSV with embeddings, and parses each
# stringified vector back into a float NumPy array. Embedding is slow, so the
# result was saved once and reloaded on later runs.
'''
#Векторизуется довольно долго, поэтому сохранял в отдельный csv и потом подгружал с постобработкой
from google.colab import drive
drive.mount('/content/drive')

df_train = pd.read_csv('/content/drive/MyDrive/train_dataset_embedded.csv')

train_texts_embedded = df_train['text_embedded']

train_texts_embeddings = [np.asarray(text.replace('[', '').replace(']', '').replace('\n', '').split(), dtype=float) for text in train_texts_embedded]
'''

# Векторизация типов

Вместо того чтобы напрямую векторизовать тип (через W2V или хотя бы one-hot), я взял описания каждого типа со страницы соревнования, векторизовал каждое описание через XLM-RoBERTa (обучена на парафразах, поэтому неплохо улавливает семантику) и использовал эти векторы в качестве эмбеддингов типов.

In [8]:
from sentence_transformers import SentenceTransformer, util

In [9]:
# Sentence encoder used below to embed the textual description of each discourse type.
embedder = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

Downloading:   0%|          | 0.00/574 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/731 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [10]:
# Maps each discourse type label to its one-sentence description; these
# descriptions are embedded below in place of the raw labels.
type_desc_dict = {
    "Lead": (
        "an introduction that begins with a statistic, a quotation, a description, "
        "or some other device to grab the reader’s attention and point toward the thesis"
    ),
    "Position": "an opinion or conclusion on the main question",
    "Claim": "a claim that supports the position",
    "Counterclaim": "a claim that refutes another claim or gives an opposing reason to the position",
    "Rebuttal": "a claim that refutes a counterclaim",
    "Evidence": "ideas or examples that support claims, counterclaims, or rebuttals.",
    "Concluding Statement": "a concluding statement that restates the claims",
}

In [11]:
# One embedding per discourse type: each description is encoded on its own and
# the single resulting vector is stored under the type label.
type_embeddings_dict = {
    type_label: embedder.encode([description], convert_to_tensor=False)[0]
    for type_label, description in type_desc_dict.items()
}

In [12]:
# Attach to every row the embedding that belongs to its discourse type.
df_train['discourse_type_embedding'] = df_train['discourse_type'].progress_apply(
    lambda type_label: type_embeddings_dict[type_label]
)

  0%|          | 0/36765 [00:00<?, ?it/s]

In [13]:
# Plain Python list of the per-row type embeddings, aligned with the rows of df_train.
discourse_type_embeddings = df_train['discourse_type_embedding'].tolist()

# Классификация

In [279]:
from sklearn.model_selection import train_test_split

In [280]:
from sklearn import preprocessing

In [281]:
# Encoder that turns the effectiveness labels into integer class ids.
le = preprocessing.LabelEncoder()

In [282]:
# Raw target labels taken from the `discourse_effectiveness` column.
y_train_raw = df_train['discourse_effectiveness'].tolist()

In [283]:
# Fit the encoder on the raw labels and encode them in one step.
y_train_ = le.fit_transform(y_train_raw)

В качестве X для обучения используем сконкатенированные эмбеддинги текстов и типов

In [284]:
# Feature vector per row: the text embedding followed by the type embedding.
X_train_ = [
    np.concatenate(embedding_pair)
    for embedding_pair in zip(train_texts_embeddings, discourse_type_embeddings)
]

In [285]:
# Sanity check: shape of a single concatenated feature vector.
X_train_[0].shape

(1536,)

In [290]:
# Hold out 10% of the samples for validation; the fixed seed keeps the shuffle repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_,
    y_train_,
    test_size=0.1,
    shuffle=True,
    random_state=6,
)

In [291]:
from sklearn.linear_model import SGDClassifier

In [292]:
# Linear classifier trained with SGD and an L1 penalty.
# random_state is fixed (same seed as the train/validation split) because SGD
# shuffles the data each epoch and early stopping sets aside a random hold-out;
# without a seed the fitted model and its score change from run to run.
sgd = SGDClassifier(
    learning_rate='adaptive',
    eta0=0.003,
    penalty='l1',
    early_stopping=True,
    n_iter_no_change=5,
    random_state=6,
    verbose=1,
)

sgd.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 4.58, NNZs: 1034, Bias: 0.006000, T: 29779, Avg. loss: 0.918160
Total training time: 0.79 seconds.
-- Epoch 2
Norm: 7.55, NNZs: 879, Bias: 0.009000, T: 59558, Avg. loss: 0.853097
Total training time: 1.92 seconds.
-- Epoch 3
Norm: 10.05, NNZs: 758, Bias: 0.006000, T: 89337, Avg. loss: 0.817021
Total training time: 3.11 seconds.
-- Epoch 4
Norm: 12.16, NNZs: 771, Bias: 0.036000, T: 119116, Avg. loss: 0.806781
Total training time: 3.73 seconds.
-- Epoch 5
Norm: 14.03, NNZs: 713, Bias: 0.021000, T: 148895, Avg. loss: 0.795222
Total training time: 4.33 seconds.
-- Epoch 6
Norm: 15.75, NNZs: 677, Bias: 0.024000, T: 178674, Avg. loss: 0.789538
Total training time: 4.93 seconds.
-- Epoch 7
Norm: 17.32, NNZs: 635, Bias: 0.045000, T: 208453, Avg. loss: 0.783974
Total training time: 5.54 seconds.
-- Epoch 8
Norm: 17.39, NNZs: 651, Bias: 0.054000, T: 238232, Avg. loss: 0.693603
Total training time: 6.14 seconds.
-- Epoch 9
Norm: 17.46, NNZs: 643, Bias: 0.054000, T: 268011, Avg. l

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   54.0s finished


SGDClassifier(early_stopping=True, eta0=0.003, learning_rate='adaptive',
              penalty='l1', verbose=1)

In [293]:
# Score of the fitted classifier on the held-out validation split.
sgd.score(X_val, y_val)

0.6565134620614631