In [1]:
import warnings

import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

## Importing the dataset
We'll use pandas to read the dataset and load it into a dataframe.

In [2]:
# Load the SST2 training file straight from GitHub. The file is tab-separated
# and has no header row, so pandas assigns integer column labels.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)

In [3]:
# Number of rows in the loaded frame.
len(df)

6920

In [4]:
# Keep only the first 2,000 rows so the later model forward pass stays cheap.
batch_1 = df.iloc[:2000]

In [5]:
# Count how often each value occurs in column 1 (the column used as labels later on).
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

## Loading the Pre-trained DistilBERT (or BERT) model


In [6]:
# Default choice: DistilBERT (model class, tokenizer class, checkpoint name).
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer first, then the model, from the same checkpoint name.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [21]:
# Tokenize every sentence into a list of vocabulary ids.
# NOTE(review): assumes column 0 of `batch_1` holds the raw sentence text
# (column 1 is used as the label below) - confirm against the TSV layout.
# `add_special_tokens=True` wraps each sentence in the tokenizer's start/end
# marker ids, matching the 101 ... 102 pattern shown for `padded[0]`.
tokenized = batch_1[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

# Length of the longest tokenized sentence; 0 if there are no sentences.
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every id list with 0 so all rows share the same length and can be
# stacked into one 2-D array. The attention-mask cell treats id 0 as padding.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [22]:
# (number of sentences, padded sequence length)
padded.shape

(2000, 59)

In [23]:
# Inspect the first padded row: token ids followed by trailing zeros.
padded[0]

array([  101,  1037, 18385,  1010,  6057,  1998,  2633, 18276,  2128,
       16603,  1997,  5053,  1998,  1996,  6841,  1998,  5687,  5469,
        3152,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])

In [25]:
# Mask with 1 wherever `padded` holds a non-zero id and 0 on the zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

In [26]:
# Wrap the NumPy arrays as tensors for the model.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Gradients are not needed here, so skip building the autograd graph.
with torch.no_grad():
    last_hidden_states = model(
        input_ids.to(torch.long),  # ids are passed to the model as int64
        attention_mask=attention_mask,
    )

In [27]:
# Take element 0 of the model output, keep only sequence position 0 for every
# sentence (the slot holding id 101 in the printed `padded[0]` row), and
# convert the result to a NumPy array.
features = last_hidden_states[0][:,0,:].numpy()

In [28]:
# Display the extracted feature matrix (NumPy abbreviates large arrays).
features

array([[-0.5566494 , -0.3312926 , -0.22280571, ..., -0.22786115,
         0.63191926,  0.2430665 ],
       [-0.28789234, -0.14285488, -0.068579  , ..., -0.31690574,
         0.18455262,  0.31989858],
       [-0.18645273,  0.3022949 , -0.18511131, ..., -0.33492947,
         0.9848733 ,  0.5297745 ],
       ...,
       [-0.728426  , -0.09083511, -0.12268987, ...,  0.11295961,
         0.38278896,  0.77147824],
       [-0.08991037,  0.41575503, -0.18096499, ..., -0.23456138,
         0.39538264,  0.56566936],
       [ 0.23998234,  0.20087859,  0.11803275, ..., -0.18060392,
         0.3674405 ,  0.14869848]], dtype=float32)

In [29]:
# Number of feature values per sentence.
features.shape[1]

768

In [72]:
# Column 1 of the batch holds the 0/1 values counted earlier; use it as the target.
labels = batch_1[1]

## Train/Test Split


In [73]:
# Split features and labels into train and test sets. A fixed `random_state`
# makes the shuffle, and therefore every score computed below, reproducible
# across Restart & Run All.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [74]:
# Fit a logistic regression with default settings on the training features.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression()

In [75]:
# Score the fitted classifier on the held-out test split.
lr_clf.score(test_features, test_labels)

0.818

In [76]:
# Baseline for comparison: a classifier that ignores the input features.
# `DummyClassifier` is imported in the notebook's top import cell. A fixed
# `random_state` keeps the baseline reproducible when the strategy is random.
clf = DummyClassifier(random_state=42)

# Cross-validate on the training split and report the mean score and 2 * std.
scores = cross_val_score(clf, train_features, train_labels)
print(f"Dummy classifier score: {scores.mean():0.3f} (+/- {scores.std() * 2:0.2f})")

Dummy classifier score: 0.491 (+/- 0.05)
