In [10]:
from transformers import BertConfig, BertModel, BertTokenizer
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegressionCV

In [4]:
def _load_reviews(path, label):
    """Read one review per line from `path` into a labelled DataFrame.

    Parameters
    ----------
    path : str
        Text file containing one review per line.
    label : str
        Value stored in the 'type' column for every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'review' (line with trailing whitespace stripped) and 'type'.
    """
    # The context manager closes the file handle as soon as the lines are read;
    # a bare open(...).readlines() leaves closing to the garbage collector.
    with open(path) as handle:
        lines = [line.rstrip() for line in handle]
    frame = pd.DataFrame(lines, columns=['review'])
    frame['type'] = label
    return frame


pos = _load_reviews("../data/rt-polaritydata/rt-polarity.pos", 'positive')
neg = _load_reviews("../data/rt-polaritydata/rt-polarity.neg", 'negative')

In [5]:
# Fraction of each class that goes into the training set.
TRAIN_PROP = 0.8
# Fixed seed so the train/test split (and therefore every score computed from
# it) is the same on each run of the notebook.
SPLIT_SEED = 42

# Sample each class separately so both splits keep the original class balance.
train_pos = pos.sample(frac=TRAIN_PROP, random_state=SPLIT_SEED)
train_neg = neg.sample(frac=TRAIN_PROP, random_state=SPLIT_SEED)
train_reviews = pd.concat([train_pos, train_neg])

# Whatever was not sampled for training becomes the test set.
test_pos = pos.drop(train_pos.index)
test_neg = neg.drop(train_neg.index)
test_reviews = pd.concat([test_pos, test_neg])

# Sanity check: the two splits together account for every review exactly once.
assert len(train_reviews) + len(test_reviews) == len(pos) + len(neg)

In [6]:
# Single name for the checkpoint so tokenizer, config and weights always match.
PRETRAINED_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
# output_hidden_states=True is requested because the classifier cell reads the
# per-layer hidden states from the last element of the model output.
config = BertConfig.from_pretrained(PRETRAINED_NAME, output_hidden_states=True)
model = BertModel.from_pretrained(PRETRAINED_NAME, config=config)
# Switch to evaluation mode; kept as the final expression so the cell still
# displays the module structure.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Use BERT to create sentence representations (13 hidden-state layers × n tokens per review)

In [7]:
# Run every review through BERT once and cache the raw model output.
# Each cached entry is a tuple: (model output, input ids tensor, review text).
train_vectors, test_vectors = [], []
with torch.no_grad():  # inference only, no gradients are needed
    for reviews, destination in (
        (train_reviews['review'], train_vectors),
        (test_reviews['review'], test_vectors),
    ):
        for x in reviews:
            # Wrap the id list in an outer list so the tensor has a batch axis of size 1.
            input_ids = torch.tensor([tokenizer.encode(x, add_special_tokens=True)])
            # NOTE: `output` is intentionally a module-level name; the
            # classifier cell reads it after this loop finishes.
            output = model(input_ids)
            destination.append((output, input_ids, x))

In [14]:
# Inspect the last cached training entry: an (output, input_ids, review text) tuple.
train_vectors[-1]

((tensor([[[-0.1197, -0.5103,  0.1755,  ..., -0.4528,  0.5111,  0.3569],
           [-0.0253, -0.4410,  0.0922,  ..., -0.4358,  0.9313,  0.2552],
           [ 0.1569, -0.8916,  0.9990,  ..., -0.0488,  0.7035, -0.1455],
           ...,
           [ 0.0419,  0.0642,  0.2044,  ..., -0.0439,  0.3478, -0.2916],
           [-0.0450, -1.2345, -0.0363,  ...,  0.6918,  0.7434, -0.7393],
           [ 0.8274,  0.0998, -0.2487,  ..., -0.0129, -0.5028, -0.2075]]]),
  tensor([[-8.3739e-01, -3.9210e-01, -8.1751e-01,  7.1130e-01,  7.4057e-01,
           -5.4031e-02,  6.3976e-01,  1.4254e-01, -6.7329e-01, -9.9995e-01,
           -5.5910e-01,  9.0345e-01,  9.6192e-01,  2.5095e-01,  8.4693e-01,
           -6.0248e-01, -1.7685e-01, -5.2174e-01,  2.3617e-01, -2.0786e-01,
            5.9117e-01,  9.9995e-01,  1.1312e-01,  2.6169e-01,  4.2185e-01,
            9.8173e-01, -5.6391e-01,  8.8605e-01,  9.0437e-01,  7.4216e-01,
           -3.2492e-01,  2.2916e-01, -9.8062e-01, -6.5900e-02, -7.5941e-01,
           

## Train classifier using each level of representation in BERT

In [12]:
def _pooled_layer_features(cached, layer):
    """Build one flat feature vector per review from a single hidden-state layer.

    Parameters
    ----------
    cached : list of tuple
        (model output, input_ids, text) entries as produced by the embedding
        cell; the last element of each model output holds the per-layer hidden
        states.
    layer : int
        Index of the hidden-state layer to pool.

    Returns
    -------
    list of numpy.ndarray
        One vector per review, obtained by summing the layer over axis 1
        (presumably the token axis of a (1, n_tokens, hidden) tensor -- confirm)
        and flattening the result.
    """
    return [
        model_out[-1][layer].detach().numpy().sum(axis=1).flatten()
        for model_out, _ids, _text in cached
    ]


# Take the number of layers from the cached data itself instead of from
# `output`, a loop temporary left behind by the embedding cell.
n_layers = len(train_vectors[0][0][-1])

# Fit and score one classifier per hidden-state layer.
for layer in range(n_layers):
    train_vectors_ = _pooled_layer_features(train_vectors, layer)
    model2 = LogisticRegressionCV(max_iter=2000)
    model2.fit(X=train_vectors_, y=train_reviews['type'])
    test_vectors_ = _pooled_layer_features(test_vectors, layer)
    # Prints the layer index and the score returned by model2.score on the test set.
    print(layer, model2.score(X=test_vectors_, y=test_reviews['type']))

0 0.7673545966228893
1 0.7725140712945591
2 0.7678236397748592
3 0.7612570356472795
4 0.7696998123827392
5 0.7781425891181989
6 0.7790806754221389
7 0.801125703564728
8 0.8147279549718575
9 0.8198874296435272
10 0.8180112570356473
11 0.8189493433395872
12 0.8039399624765479
