In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [4]:
batch_1 = df[:2000]
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [6]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading: 100%|██████████| 226k/226k [00:00<00:00, 392kB/s]
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 8.09kB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 591kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 159kB/s]
Downloading: 100%|██████████| 256M/256M [00:05<00:00, 45.9MB/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model

# tokenization

In [7]:
tokenized = batch_1[0].apply(lambda x: tokenizer.encode(x, add_special_tokens= True))

# Padding

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [2220] * (max_len - len(i)) for i in tokenized.values])

In [10]:
padded.shape

(2000, 59)

# Masking

In [11]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 59)

# Model #1

In [12]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask = attention_mask)

In [13]:
features = last_hidden_states[0][:,0,:].numpy()

In [14]:
labels = batch_1[1]

# Model#2 : Train/Test Split

In [21]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

## Grid Search for Parameters

In [16]:
parameters = {'C' : np.linspace(0.0001, 100, 200)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': array([1.00000000e-04, 5.02612060e-01, 1.00512412e+00, 1.50763618e+00,
       2.01014824e+00, 2.51266030e+00, 3.01517236e+00, 3.51768442e+00,
       4.02019648e+00, 4.52270854e+00, 5.02522060e+00, 5.52773266e+00,
       6.03024472e+00, 6.53275678e+00, 7.03526884e+00, 7.53778090e+00,
       8.04029296e+00, 8.54280503e+00, 9.04531709e+0...
       8.84422226e+01, 8.89447347e+01, 8.94472467e+01, 8.99497588e+01,
       9.04522709e+01, 9.09547829e+01, 9.14572950e+01, 9.19598070e+01,
       9.24623191e+01, 9.29648312e+01, 9.34673432e+01, 9.39698553e+01,
       9.44723673e+01, 9.49748794e+01, 9.54773915e+01, 9.59799035e+01,
       9.64824156e+01, 9.69849276e+01, 9.74874397e+01, 9.79899518e+01,
       9.84924638e+01, 9.89949759e+01, 9.94974879e+01, 1.00000000e+02])})

In [17]:
print('best parameters: ', grid_search.best_params_)
print('best scores: ', grid_search.best_score_)

best parameters:  {'C': 0.5026120603015075}
best scores:  0.79


In [19]:
lr_clf = LogisticRegression(C = grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=0.5026120603015075)

# Evaluating Model #2

In [23]:
lr_clf.score(test_features, test_labels)

0.882

In [38]:
from sklearn.dummy import DummyClassifier
clf = DummyClassifier()

scores = cross_val_score(clf, train_features, train_labels)
print('Dummy classifier score: %0.3f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.521 (+/- 0.00)
