In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from transformers import BertModel, BertTokenizer
import pandas as pd
import numpy as np
import random
import torch

In [2]:
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased')]

In [3]:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

In [4]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [5]:
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [6]:
batch = df[:4000]

### Tokenization

In [7]:
def tokenize_cut_pad(df):
    
    df = df.copy()
    
    max_input_size = tokenizer.max_model_input_sizes['bert-base-uncased']
    
    # shorten sequences longer than BERT max input size
    df[0] = [text[:max_input_size - 2] for text in df[0].values] 
    tokenized = df[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True))) # tokenizes and converts tokens to ids, includes special tokens
    
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            # max_len will be equal to longest sequence in the tokenized values
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    
    return torch.tensor(padded)

### Get BERT model embedding for each CLS token in each example

In [8]:
input_ids = tokenize_cut_pad(batch)

In [9]:
input_ids.shape

torch.Size([4000, 66])

In [10]:
with torch.no_grad():
    last_hidden_states = bert_model(input_ids)[0]

In [11]:
features = last_hidden_states[:,0,:].numpy()

In [12]:
features.shape

(4000, 768)

In [13]:
labels = batch[1]

In [14]:
labels.shape

(4000,)

### Training

In [15]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [16]:
print(train_features.shape)
print(test_features.shape)

(3000, 768)
(1000, 768)


In [17]:
model = LogisticRegression(solver='lbfgs')
model.fit(train_features, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Testing

In [18]:
model.score(test_features, test_labels)

0.83

### Prediction

In [19]:
def prediction(text):
    
    input_text = tokenizer.encode(text)
    test_input_ids = torch.tensor(input_text)
    test_input_ids = test_input_ids.unsqueeze(0)
    with torch.no_grad():
        hidden_states = bert_model(test_input_ids)[0]
    test_features = hidden_states[:, 0, :].numpy()
    pred = model.predict(test_features)[0]
    
    return pred

In [20]:
prediction("amazing film, i loved it so much")

1