In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |▋                               | 10kB 15.9MB/s eta 0:00:01[K     |█▎                              | 20kB 1.7MB/s eta 0:00:01[K     |██                              | 30kB 2.2MB/s eta 0:00:01[K     |██▋                             | 40kB 1.6MB/s eta 0:00:01[K     |███▎                            | 51kB 1.8MB/s eta 0:00:01[K     |████                            | 61kB 2.2MB/s eta 0:00:01[K     |████▋                           | 71kB 2.4MB/s eta 0:00:01[K     |█████▎                          | 81kB 2.6MB/s eta 0:00:01[K     |██████                          | 92kB 2.9MB/s eta 0:00:01[K     |██████▋                         | 102kB 2.7MB/s eta 0:00:01[K     |███████▏                        | 112kB 2.7MB/s eta 0:00:01[K     |███████▉                        | 122kB 2.7M

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb

# Data

In [0]:
# Tab-separated file without a header row: read it raw, then name the two
# columns (review text, sentiment label).
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)
df.columns = ['Review', 'Sentiment']

In [4]:
# Preview the first rows to confirm the Review / Sentiment columns loaded as expected.
df.head()

Unnamed: 0,Review,Sentiment
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


# Sentence Embeddings

In [0]:
# Model class, its matching tokenizer class, and the name of the pretrained
# checkpoint that both are loaded from.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Tokenisation

In [6]:
# Instantiate the tokenizer and the model from the same pretrained checkpoint
# name so that the token ids produced by one match the vocabulary of the other.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=546, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=267967963, style=ProgressStyle(description_…




In [0]:
# Encode every review as a list of token ids, including the tokenizer's
# special tokens.
tokenized = df['Review'].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

## Padding

In [0]:
# Length of the longest token-id list (0 when there are no rows at all).
max_len = max(map(len, tokenized.values), default=0)

# Right-pad each id list with 0 up to max_len so the rows have equal length
# and can be stacked into a single array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

## Masking

In [0]:
# 1 wherever the id is non-zero, 0 wherever it is the 0 used for padding.
attention_mask = (padded != 0).astype(int)

## DistilBERT Outputs

In [0]:
# Number of sentences sent through the model per forward pass. Running the
# whole dataset as one batch makes peak memory grow with the dataset size;
# fixed-size batches keep it bounded.
BATCH_SIZE = 256

input_ids = torch.LongTensor(padded)
# as_tensor accepts both the NumPy mask and an already-converted tensor, so
# re-running this cell does not trigger a copy-construct warning.
attention_mask = torch.as_tensor(attention_mask)

hidden_chunks = []
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    for start in range(0, input_ids.shape[0], BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # Element 0 of the model output is the last hidden state of the batch.
        hidden_chunks.append(batch_output[0])

# Keep the shape of the original result: a tuple whose first element is the
# hidden-state tensor for every row, so `last_hidden_states[0]` still works.
last_hidden_states = (torch.cat(hidden_chunks, dim=0),)

# Classification

## Features
We keep only DistilBERT's output at the first position of each sequence, the \[CLS\] token, and use that vector as the embedding of the whole sentence.

In [0]:
# Take the hidden state at sequence position 0 for every row and convert it
# to a NumPy array for scikit-learn.
features = last_hidden_states[0][:, 0].numpy()

## Labels

In [0]:
# Target column: the sentiment label of each review.
labels = df['Sentiment']

## Split the data

In [0]:
# Hold out part of the data for evaluation (scikit-learn's default split size).
# A fixed random_state makes the shuffle, and therefore the reported accuracy,
# repeatable across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [18]:
# Fit a logistic-regression classifier on the training embeddings.
# max_iter=1000 sets the solver's iteration cap explicitly.
lr_clf = LogisticRegression(max_iter=1000)
# fit() is the cell's last expression, so the fitted estimator is displayed.
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Score the fitted classifier on the held-out test split.
lr_clf.score(test_features, test_labels)

0.8514450867052024