In [50]:
import pickle

import torch
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split


In [51]:
# load entire dataset, create smaller dataset to test with
# format: { post_id: [post, [comment1, comment2, ...]] }
# data_file_path = '../data/grouped_data.pickle'

# with open(data_file_path, 'rb') as f:
#     all_data = pickle.load(f)
    
# sample_data_file_path = '../data/sample_data_10k.pickle'
# adk = list(all_data.keys())
# adk = adk[:10000]

# new_data = {x:all_data[x] for x in adk}

# with open(sample_data_file_path, 'wb') as file:
#     pickle.dump(new_data, file)

In [31]:
# load sample data
# format: { post_id: [post, [comment1, comment2, ...]] }

# data_file_path = '../data/grouped_data.pickle'
# path of the pickle file that is passed to load_data() below
data_file_path = '../sample_data/sample_data_10k.pickle'

def load_data(file_path, max_chars=500):
    """Load (post, comment) text pairs and their comment scores from a pickle file.

    The pickle must contain a dict shaped like
    ``{post_id: [post, [comment1, comment2, ...]]}`` where ``post`` provides
    ``'title'`` and ``'selftext'`` strings and every comment provides
    ``'body'`` and ``'score'``.

    Args:
        file_path: Path of the pickle file to read.
        max_chars: Exclusive upper bound on the character length of both the
            post text (``title + " " + selftext``) and the comment body.
            Pairs where either side reaches this length are dropped.
            Defaults to 500.

    Returns:
        A tuple ``(posts_comments, upvotes)``: ``posts_comments`` is a list of
        ``(post_text, comment_body)`` tuples and ``upvotes`` is the list of the
        matching comment scores, in the same order.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # point this at files you created or otherwise trust.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    posts_comments = []
    upvotes = []
    for val in data.values():
        post = val[0]
        comms = val[1]
        if len(comms) == 0:
            # No comments to pair with, so the post fields are never needed.
            continue

        # The post text is the same for every comment of the post: build and
        # length-check it once instead of once (twice) per comment.
        post_text = post['title'] + " " + post['selftext']
        if len(post_text) >= max_chars:
            continue

        for com in comms:
            body = com['body']
            if len(body) < max_chars:
                posts_comments.append((post_text, body))
                upvotes.append(com['score'])

    return posts_comments, upvotes
    
# posts_comms: list of (post text, comment body) tuples; upvotes: the matching comment scores
posts_comms, upvotes = load_data(data_file_path)

In [32]:
# split the (post, comment) pairs and their scores into 80% train / 20% test;
# a fixed random_state makes the shuffle, and therefore the split, repeatable across runs
train_texts, test_texts, train_labels, test_labels = train_test_split(
    posts_comms, upvotes, test_size=.2, random_state=42
)

85508
21378


In [26]:
# load the pretrained fast BERT tokenizer for the 'bert-base-cased' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [33]:
# tokenize train and test data
# every example is a (post, comment) pair; the two halves are handed to the
# tokenizer separately as text / text_pair

train_post = [post for post, _ in train_texts]
train_comm = [comment for _, comment in train_texts]
train_encodings = tokenizer(
    text=train_post,
    text_pair=train_comm,
    padding=True,
    truncation=True,
)

test_post = [post for post, _ in test_texts]
test_comm = [comment for _, comment in test_texts]
test_encodings = tokenizer(
    text=test_post,
    text_pair=test_comm,
    padding=True,
    truncation=True,
)

In [47]:
# load model
# The raw comment scores are fed to the model as class ids, and a classification
# head with num_labels outputs only accepts ids 0 .. num_labels - 1. It therefore
# needs max(upvotes) + 1 outputs for the highest score to be a valid class.
# NOTE(review): this assumes every score is a non-negative integer — TODO confirm;
# negative scores would still be out of range as class ids.
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=max(upvotes) + 1)
model.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [48]:
# create dataset class and load encodings and associated labels to it

class upvote_prediction_dataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    ``encodings`` is a mapping from field name to an indexable sequence holding
    one entry per example; ``labels`` is an indexable sequence of the same
    length. The dataset length is taken from ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# wrap each split's encodings together with its labels
train_dataset, test_dataset = (
    upvote_prediction_dataset(train_encodings, train_labels),
    upvote_prediction_dataset(test_encodings, test_labels),
)

In [49]:
# training configuration: output goes to ./results, logs to ./logs
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
)

# tie the model, the configuration and the two datasets together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss


KeyboardInterrupt: 