In [10]:
# Following https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier, DummyRegressor
import xgboost as xgb
import tqdm

import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:

import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
# Checkpoint selection: DistilBERT model class, tokenizer class and weights name.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the following three lines:
#model_class = ppb.BertModel
#tokenizer_class = ppb.BertTokenizer
#pretrained_weights = 'bert-base-uncased'

# Instantiate the tokenizer first, then the model, from the chosen weights name.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# One-off preprocessing, left commented out: it reads the full dump, keeps the
# rows whose `removed_by_category` is null, and writes them to wsb_not_null.csv
# (the file loaded by the next cell).
#wsb = pd.read_csv('../Data/wsb_full.csv')
#df = wsb.loc[wsb.removed_by_category.isnull()]
#df.to_csv("../Data/wsb_not_null.csv")

In [30]:
# Load the filtered posts file, restricted to the five columns listed in `usecols`.
df = pd.read_csv('../Data/wsb_not_null.csv', usecols = ['id', 'title', 'selftext', 'ups', 'total_awards_received'])

In [22]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,selftext,title,ups,total_awards_received
0,https://www.schaeffersresearch.com/content/ana...,Buy INTU - DD,5,0
1,"Alright WSB,\n\nSo I recently came into about ...",New Years Challenge: 5K Make-it or Break-it,50,0
2,"I saw an older thread on it, curious who is st...",How many of you tism's are doing the UPRO/TMF ...,27,0
3,"First day of legal weed in Illinois, passed by...",DD on pot stocks,6,0
4,"I was going to tag this as technical, but sinc...",Markets are on the cusp of a correction and th...,173,0


In [6]:
# Name of the text column that is tokenized and embedded further down.
col = 'title'
#col = 'selftext' # this results in strings that are too long for the model.
# TODO: decide how to handle over-long text. Truncation may be acceptable,
# e.g. keeping the first 400 and the last 112.

# Drop rows with a missing value in the chosen text column.
# NOTE: this rebinds `df` to the filtered frame, so the dropped rows are gone
# until the CSV is loaded again.
df = df.dropna(subset = [col])

In [13]:
# Collects one feature array per processed chunk; the arrays are stacked after the loop.
batched_features = []

def split_dataframe(df, chunk_size = 1000):
    """Split ``df`` into consecutive chunks of at most ``chunk_size`` rows.

    Adapted from
    https://stackoverflow.com/questions/17315737/split-a-large-pandas-dataframe

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Any object supporting ``len`` and slice indexing
        (e.g. a list) works as well.
    chunk_size : int, default 1000
        Maximum number of rows per chunk. Must be at least 1.

    Returns
    -------
    list
        The chunks in their original order. Every chunk except possibly the
        last holds exactly ``chunk_size`` rows; an empty input gives ``[]``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1. Previously 0 raised a
        ZeroDivisionError and a negative value silently returned no chunks.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer, got %r" % (chunk_size,))
    # Stepping the start offset by chunk_size covers the trailing partial chunk
    # without a separate remainder computation.
    return [df[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

cut_off = len(df) #200  (use a small number for a quick trial run)

# Take the padding id from the tokenizer rather than hard-coding 0.
pad_id = tokenizer.pad_token_id

for batch in tqdm.tqdm(split_dataframe(df[:cut_off])):
    # The frame is processed chunk by chunk because embedding everything in one
    # go needs more memory than is available.

    # truncation=True clips texts that exceed the model's maximum input length
    # instead of letting the forward pass fail on them.
    tokenized = [
        tokenizer.encode(text, add_special_tokens=True, truncation=True)
        for text in batch[col]
    ]

    # Right-pad every sequence to the longest one in this chunk.
    max_len = max(len(ids) for ids in tokenized)
    padded = np.array([ids + [pad_id] * (max_len - len(ids)) for ids in tokenized])

    # Build the mask from the true sequence lengths: 1 for real tokens, 0 for padding.
    attention_mask = np.array(
        [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in tokenized]
    )

    # The model expects 64-bit integer tensors; numpy's default int width is
    # platform dependent, so the dtype is set explicitly.
    input_ids = torch.tensor(padded, dtype=torch.long)
    attention_mask = torch.tensor(attention_mask, dtype=torch.long)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    # Keep the hidden state at position 0 of every sequence (the [CLS] token
    # added by add_special_tokens=True for the BERT-family tokenizers used here).
    batch_features = last_hidden_states[0][:, 0, :].numpy()
    batched_features.append(batch_features)

# One row per embedded text, in the same order as df[:cut_off].
features = np.vstack(batched_features)
labels = df[:cut_off].ups

100%|██████████| 98/98 [2:07:00<00:00, 77.76s/it]   


In [16]:
# Write the stacked feature matrix to CSV, one row per embedded text.
pd.DataFrame(features).to_csv("bert_features.csv")

In [None]:
# Attach each row's feature vector to its post and save the combined frame.
# The assignment needs len(features) == len(df), i.e. cut_off == len(df).
# NOTE(review): every cell in `bert_features` holds a numpy array, which to_csv
# writes out in its string form; reading the file back yields strings rather
# than arrays. Confirm this is acceptable or keep the features in a separate file.
df['bert_features'] = list(features)
df.to_csv('wsb_with_bert.csv')

Classification

In [17]:
def test_classification(features, bin_labels, model = LogisticRegression()):

    
    clf = DummyClassifier()
    scores = cross_val_score(clf, features, bin_labels)
    print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    
    scores = cross_val_score(model, features, bin_labels)
    print("Model classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    

In [18]:
# Binary target: `ups` greater than 1.
test_classification(features, labels > 1, LogisticRegression())

Dummy classifier score: 0.913 (+/- 0.00)
Model classifier score: 0.923 (+/- 0.04)


In [24]:
# Binary target: at least one award received. The target comes straight from
# `df`, so `df` must still hold exactly the rows that `features` was built from.
test_classification(features, df.total_awards_received >= 1, LogisticRegression())

Dummy classifier score: 0.758 (+/- 0.00)
Model classifier score: 0.749 (+/- 0.03)


Regression:

In [19]:
def test_regression(features, labels, model):
    
    #train_features, test_features, train_labels, test_labels = train_test_split(features, labels)
    clf = DummyRegressor()

    scores = cross_val_score(clf, features, labels)
    print("Dummy regressor score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    
    #model.fit(train_features, train_labels)
    
    scores = cross_val_score(model, features, labels)
    print("Model regressor score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


In [20]:
# The loop variable is deliberately not called `model`: that name holds the
# pretrained transformer used by the embedding cell, and rebinding it here would
# break that cell on a later re-run.
for regressor in [LinearRegression(), xgb.XGBRegressor(random_state=2)]:
    print(regressor)
    test_regression(features, labels, regressor)


LinearRegression()
Dummy regressor score: -0.037 (+/- 0.06)
Model regressor score: -0.120 (+/- 0.26)
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=2, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
Dummy regressor score: -0.037 (+/- 0.06)
Model regressor score: -0.850 (+/- 1.46)
