In [30]:
# Following https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Suppress every warning category for the rest of the session. No category or
# module filter is given, so warnings raised by any library in later cells
# will not be displayed.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory.
df = pd.read_csv('../Data/wsb_cleaned.csv')

In [4]:
# Working subset: the first 5000 rows (by position), restricted to the text
# columns and the upvote count used as the prediction target further down.
batch_1 = df[['title', 'selftext', 'ups']].iloc[:5000]

In [5]:

# Default configuration: DistilBERT (uncased base checkpoint).
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# To run full BERT instead of DistilBERT, replace the three assignments above with:
#   pretrained_weights = 'bert-base-uncased'
#   model_class = ppb.BertModel
#   tokenizer_class = ppb.BertTokenizer

# Instantiate the tokenizer and the model from the same pretrained checkpoint
# so that the token ids produced by one match the embeddings of the other.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:

# Turn every post body into a list of token ids, with the tokenizer's special
# tokens added. truncation=True caps each sequence at the maximum input length
# the tokenizer reports for the model; without it a single over-long post
# produces more positions than the model accepts and the forward pass fails.
tokenized = batch_1['selftext'].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True, truncation=True)
)

In [9]:

# Length of the longest token-id list (0 when there are no rows at all).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every list with zeros up to max_len so the rows stack into a
# rectangular 2-D array; 0 is the value the attention-mask cell treats as padding.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

In [10]:
# (number of posts, max_len) -- `padded` is already an ndarray, so its shape
# can be read directly.
padded.shape

(2000, 70)

In [11]:
# Display the padded token-id matrix: each row is one post's ids followed by
# the zero fill added up to max_len.
padded

array([[  101,  2204,  2051, ...,     0,     0,     0],
       [  101,  1045, 10587, ...,     0,     0,     0],
       [  101,  4965, 20014, ...,     0,     0,     0],
       ...,
       [  101, 10514, 11561, ...,     0,     0,     0],
       [  101, 18411,  2278, ...,     0,     0,     0],
       [  101,  4301,  1998, ...,     0,     0,     0]])

In [12]:
# Mask that tells the model which positions to attend to:
# 1 wherever `padded` holds a non-zero id, 0 on the zero fill.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 70)

In [16]:

# Convert both arrays to 64-bit integer (long) tensors, the dtype requested
# for the ids and the mask passed to the model.
input_ids = torch.tensor(padded, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)



In [17]:
# Feature extraction only: gradients are not needed, so disable autograd
# tracking for the forward pass. All rows go through the model in one call.
with torch.no_grad():
    last_hidden_states = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )

In [25]:
# Element 0 of the model output is indexed as (row, position, hidden unit).
# Keep only position 0 of every sequence -- the slot of the leading special
# token added by add_special_tokens=True -- as one feature vector per post.
sequence_output = last_hidden_states[0]
features = sequence_output[:, 0, :].numpy()


Classification

In [None]:
# Binary target for classification: True when a post has more than one upvote.
labels = batch_1['ups'].gt(1)

In [26]:
# Hold out a test split for the classifier. random_state pins the shuffle so
# the split -- and every score computed from it -- is reproducible across
# kernel restarts (same seed value the XGBRegressor cell uses).
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=2
)

In [27]:
# Logistic regression with default hyper-parameters, trained on the embedding
# features; fit() returns the estimator itself, so it can be chained.
lr_clf = LogisticRegression().fit(train_features, train_labels)
lr_clf

LogisticRegression()

In [28]:
# Mean accuracy of the fitted classifier on the held-out test split.
lr_clf.score(test_features, test_labels)

0.608

In [29]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the input features, for comparison with the logistic
# regression. Note it is scored by cross-validation on the training split,
# while the logistic regression is scored on the held-out test split.
clf = DummyClassifier()
scores = cross_val_score(clf, train_features, train_labels)

# Report the mean fold score and a two-standard-deviation spread.
mean_score = scores.mean()
score_spread = scores.std() * 2
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (mean_score, score_spread))

Dummy classifier score: 0.637 (+/- 0.00)


Regression:

In [40]:
from sklearn.dummy import DummyRegressor

# Baseline regressor that ignores the input features; its cross-validated
# score is the reference point for the regression models.
clf = DummyRegressor()

# Look the raw upvote counts up through the row labels of the current training
# split. When this cell runs top-to-bottom, `train_labels` still holds the
# boolean classification target, so using it directly would score the
# "regression" baseline against True/False values. Indexing `batch_1.ups` by
# `train_labels.index` yields the numeric target for exactly the same rows in
# either execution order.
regression_train_labels = batch_1.ups.loc[train_labels.index]

scores = cross_val_score(clf, train_features, regression_train_labels)
# Label fixed: this reports the regressor baseline, not a classifier.
print("Dummy regressor score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: -0.006 (+/- 0.01)


In [33]:
# Regression setup: same embedding features, but the target is now the raw
# upvote count instead of the thresholded boolean used for classification.
features = last_hidden_states[0][:,0,:].numpy()
labels = batch_1.ups

# random_state pins the shuffle so the split and the scores derived from it
# are reproducible (same seed value the XGBRegressor cell uses).
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=2
)

In [34]:
# Ordinary least-squares fit of the upvote count on the embedding features;
# fit() returns the estimator itself, so it can be chained.
lr = LinearRegression().fit(train_features, train_labels)
lr

LinearRegression()

In [35]:
# R^2 of the fitted linear model on the held-out test split.
lr.score(test_features, test_labels)

-0.09169165287122327

In [38]:
import xgboost as xgb

# Gradient-boosted regressor with a fixed seed, trained on the same training
# split as the linear model; fit() returns the estimator, so it is chained.
xgr = xgb.XGBRegressor(random_state=2).fit(train_features, train_labels)

# Score the fitted regressor on the held-out test split.
xgr.score(test_features, test_labels)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=2,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)