In [1]:
!nvidia-smi

Wed Apr 28 18:40:03 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 466.11       Driver Version: 466.11       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:02:00.0  On |                  N/A |
| 11%   63C    P0    36W / 215W |    836MiB /  8192MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pickle

import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [4]:
# load sample data
# format { post_id: [post, [comment1, comment2, ... ] }

data_file_path = '../data/grouped_data.pickle'
# data_file_path = '../sample_data/sample_data_10k.pickle'

def load_data(file_path):
    posts_comments = []
    upvotes = []
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
        for post_id, val in data.items():
            post = val[0]
            comms = val[1]
            for com in comms:
                try:
                    if len(post['title'] + " " + post['selftext']) < 500 and len(com['body']) < 500:
                        posts_comments.append((post['title'] + " " + post['selftext'], com['body']))
                        upvotes.append(com['score'])
                except:
                    pass
                
    return posts_comments, upvotes
    
posts_comms, upvotes = load_data(data_file_path)

In [5]:
# split text
train_texts, test_texts, train_labels, test_labels = train_test_split(posts_comms, upvotes, test_size=.1, train_size=.2, shuffle=True)
print(len(train_texts), len(train_labels), len(test_texts), len(test_labels))

posts_comms = None
upvote_max = max(upvotes)
upvotes = None

217471 217471 108736 108736


In [6]:
# load tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', use_cache=True)

In [7]:
# tokenize train data

train_post = []
train_comm = []

for x in tqdm(train_texts):
    train_post.append(x[0])
    train_comm.append(x[1])

train_texts = None
train_encodings = tokenizer(text=train_post, text_pair=train_comm, truncation=True, padding=True)


  0%|          | 0/217471 [00:00<?, ?it/s]

In [8]:
# tokenize test data

test_post = []
test_comm = []

for x in tqdm(test_texts):
    test_post.append(x[0])
    test_comm.append(x[1])

test_texts = None

test_encodings = tokenizer(text=test_post, text_pair=test_comm, truncation=True, padding=True)


  0%|          | 0/108736 [00:00<?, ?it/s]

In [9]:
# load model
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=upvote_max)
model.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
# create dataset class and load encodings and associated labels to it

class upvote_prediction_dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = upvote_prediction_dataset(train_encodings, train_labels)
test_dataset = upvote_prediction_dataset(test_encodings, test_labels)

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

trainer.train()

Step,Training Loss


IndexError: Target -1 is out of bounds.