In [1]:
# Following https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so deprecation or convergence warnings raised by later cells will
# not be displayed.
warnings.filterwarnings('ignore')

In [None]:
# Column whose text is tokenized and embedded further down.
col = 'title'

# Read the CSV export, keep only rows whose `removed_by_category` is null
# (presumably posts that were not removed -- confirm against the dataset),
# then drop rows that have no value in the text column.
df = (
    pd.read_csv('../Data/wsb_full.csv')
    .loc[lambda frame: frame.removed_by_category.isnull()]
    .dropna(subset=[col])
)

In [3]:
# Work on the first 5000 remaining rows and only the columns used below.
batch_columns = ['title', 'selftext', 'ups']
batch_1 = df.head(5000)[batch_columns]

In [4]:

# Architecture used for the embeddings: DistilBERT, uncased base checkpoint.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

# To use full BERT instead, assign ppb.BertModel, ppb.BertTokenizer and
# 'bert-base-uncased' to the three names above.

# Instantiate the tokenizer and the encoder from the same pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:

def _encode_text(text):
    """Return the token ids for one string, including the special tokens."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per row of the selected text column.
tokenized = batch_1[col].apply(_encode_text)

In [None]:

# Length of the longest tokenized row (0 when there are no rows).
token_lists = tokenized.values
max_len = max((len(ids) for ids in token_lists), default=0)

# Right-pad every id list with 0 so all rows share the same length and can be
# stacked into a single 2-D array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in token_lists])

In [None]:
# (number of rows, padded sequence length)
padded.shape

In [None]:
# Inspect the padded token-id matrix built above.
padded

In [None]:
# 1 wherever a real token id is present, 0 on the zero padding added above, so
# the model can ignore the padded positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

In [None]:

# Convert both arrays to 64-bit integer tensors before passing them to the model.
input_ids = torch.tensor(padded, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)



In [None]:
# Single forward pass over the whole batch; gradient tracking is switched off
# because the outputs are only read as features in the cells below.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Element 0 of the model output holds the per-token hidden states; keep only
# position 0 of every sequence (the first special token added during encoding)
# as one fixed-size feature vector per row.
token_states = last_hidden_states[0]
features = token_states[:, 0, :].numpy()


Classification

In [None]:
# Binary target: True when a row has more than one upvote.
labels = batch_1['ups'].gt(1)

In [None]:
# Hold out a test set for the classifier. The split is seeded so that the
# scores reported below are reproducible after a kernel restart (the seed
# matches the one used for the XGBoost model later in the notebook).
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=2
)

In [None]:
# Fit a default logistic regression on the embedding features.
lr_clf = LogisticRegression().fit(train_features, train_labels)
lr_clf

In [None]:
# Score of the fitted classifier on the held-out split.
clf_test_score = lr_clf.score(test_features, test_labels)
clf_test_score

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that ignores the features, cross-validated on the training split so
# the logistic-regression score above has something to be compared against.
clf = DummyClassifier()
scores = cross_val_score(clf, train_features, train_labels)

mean_score, spread = scores.mean(), scores.std() * 2
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (mean_score, spread))

Regression:

In [None]:
from sklearn.dummy import DummyRegressor

# Feature-independent baseline for the regression task.
clf = DummyRegressor()

# In a top-to-bottom run `train_labels` still holds the boolean `ups > 1`
# classes from the classification section, so fetch the raw upvote counts for
# the same training rows instead of scoring the regressor on the class labels.
regression_targets = batch_1.loc[train_labels.index, 'ups']

scores = cross_val_score(clf, train_features, regression_targets)
print("Dummy regressor score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Regression setup: same embedding features as before, but the target is now
# the raw upvote count rather than the `ups > 1` class.
features = last_hidden_states[0][:, 0, :].numpy()
labels = batch_1.ups

# Seeded split so the regression scores below are reproducible between runs
# (the seed matches the one used for the XGBoost model).
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=2
)

In [None]:
# Ordinary least-squares fit of upvotes on the embedding features.
lr = LinearRegression().fit(train_features, train_labels)
lr

In [None]:
# Score of the linear model on the held-out split.
lr_test_score = lr.score(test_features, test_labels)
lr_test_score

In [None]:
import xgboost as xgb

# Gradient-boosted regressor with a fixed seed, trained on the training split.
xgr = xgb.XGBRegressor(random_state=2).fit(train_features, train_labels)

# Score of the boosted model on the held-out split.
xgr.score(test_features, test_labels)