# Summary
In this notebook we try to use only the manually assigned labels to fine-tune a BERT model.

The manual labeling scheme included `ignore` and `non-informative`. In this notebook I ignore that distinction and run naively, changing them to `neutral`.

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon Nov 20 19:47:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers accelerate dataset




# Setup

In [None]:
# Notebook-wide configuration
# ---------------------------
import os
import sys

# Data locations. `root` is a placeholder for the project folder; the relative
# paths below are resolved against it once it becomes the working directory.
root = 'Your root folder/Reddit/'
ORIGIN = './data/unlabeled/'
DATA_FILE = f'{ORIGIN}worldnews_processed_unlabeled_comments_70k.csv'
TXT_FILE = f'{ORIGIN}worldnews_processed_unlabeled_comments_70k.txt'

# Training parameters.
# NOTE: `batch_size` is declared here but the Trainer call in the "Train"
# section passes its own hard-coded per-device batch size.
seed_val = 1234
epochs = 10
batch_size = 16

# Model: Hugging Face checkpoint to start from, and where the retrained
# checkpoints are written.
HF_BERT_MODEL = 'roberta-base'
MODEL_PATH = f'./models/{HF_BERT_MODEL}_retrained/'

# Work from the project root and make it importable.
os.chdir(root)
sys.path.append(root)

import re
import os
from tqdm import tqdm
import yaml
import json
import sys
import numpy as np
import pandas as pd
import time
import datetime
import random

import torch
from torch.utils.data import TensorDataset, random_split, SubsetRandomSampler
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch import nn

from transformers import BertTokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

# Seed every random number generator used in this notebook
# (Python, NumPy, PyTorch CPU, and all CUDA devices) with the same value.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [None]:
# Read the unlabeled comments; the first CSV column is used as the row index.
data = pd.read_csv(DATA_FILE, index_col=0)

# Preview the first five rows.
data.head(5)

Unnamed: 0,title,comment
0,Khamenei: Iran will never give up its nuclear ...,And why should Iran give up its nuclear progra...
1,Police caught on tape brutally beating two Pal...,Full article for those who dont have a premium...
2,"""If the Palestinian Authority agrees to stop i...","At this point, Palestinians are better off see..."
3,"""If the Palestinian Authority agrees to stop i...","If I were Palestine, id counter the offer by s..."
4,"""If the Palestinian Authority agrees to stop i...",Israel should just say fuck it to the Palestin...


# Process Data

In [None]:
# Choose the tokenizer class from the checkpoint name. 'deberta' is tested
# before 'roberta' because the latter is a substring of the former.
if 'deberta' in HF_BERT_MODEL:
  tokenizer_cls, tokenizer_kwargs = AutoTokenizer, {}
elif 'roberta' in HF_BERT_MODEL:
  tokenizer_cls, tokenizer_kwargs = RobertaTokenizer, {}
else:
  # Plain BERT checkpoints are loaded with lower-casing enabled.
  tokenizer_cls, tokenizer_kwargs = BertTokenizer, {'do_lower_case': True}

tokenizer = tokenizer_cls.from_pretrained(HF_BERT_MODEL, **tokenizer_kwargs)

# Maximum sequence length supported by BERT; some of our texts are longer.
max_len = 512



In [None]:
# Dump the corpus to a plain-text file with exactly one example per line, in
# the form "<title> - <comment>" (semicolons replaced by full stops).
#
# The file is consumed line by line further down, so every row must end with
# a newline and must not contain line breaks of its own. Without this, a
# comment runs straight into the next row's title and any embedded newline
# splits one row into several training examples.
# The file is written as UTF-8 explicitly because it is read back as UTF-8.
with open(TXT_FILE, 'w', encoding='utf-8') as f:
    for row in data.itertuples():
        text = f'{row.title} - {row.comment}'.replace(';', '.')
        # Collapse embedded line breaks and runs of whitespace into single
        # spaces so the row occupies exactly one line.
        single_line = ' '.join(text.split())
        f.write(single_line + '\n')


In [None]:
from transformers import LineByLineTextDataset

# Tokenize the text corpus, one example per line of TXT_FILE, with a
# block size of 512 tokens.
train = LineByLineTextDataset(file_path=TXT_FILE, block_size=512, tokenizer=tokenizer)



In [None]:
# Report the dataset size, then display the first four tokenized examples.
n_examples = len(train.examples)
print('Total of', n_examples, ' examples.')

train.examples[:4]

Total of 81750  examples.


[{'input_ids': tensor([    0,   530,  1908, 28918,    35,  1603,    40,   393,   492,    62,
             63,  1748,  3020,     2])},
 {'input_ids': tensor([    0,   111,   178,   596,   197,  1603,   492,    62,    63,  1748,
           3020,   116,  4337,   247,    34,     5,   235,     7,  2382,     6,
           7053,  1748,  1007,     4,   870,     5,   169,     6,   596,    16,
           1870,    98,  3915,    59,  1603,    18,   295, 23369,    77,  1752,
            220,  1883,    16,  5909, 33265,    19,    82,    54,  4157,  1870,
              8,    16,   416,  1748,   116,  5534,     6,     8,  2145,   141,
           1603,    34,    57,    22,   245,   377,   113,    31,   562,   295,
          23369,   187,  4013,   116,  9497,  2037,    15,  7898, 23134,  4108,
             80,  8345,     2])},
 {'input_ids': tensor([    0,   111,  6583,  1566,    13,   167,    54, 33976,    33,    10,
           4549,  1316,    35,    83,  1830,    12, 17283,   569,  1278,   804,
      

# Load Model

In [None]:
# Load the pretrained checkpoint with a masked-language-modeling (MLM) head.
#
# AutoModelForMaskedLM picks the matching architecture from the checkpoint's
# config (e.g. RobertaForMaskedLM for 'roberta-base'), so no per-family
# branching is needed. It is imported here because the setup cell does not
# import it.
# `num_labels` is intentionally not passed: it configures a classification
# head, which an MLM model does not have.
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(
    HF_BERT_MODEL,
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return all hidden states
)

# Tell pytorch to run this model on the GPU.
model.cuda()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

# Train

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints are written under MODEL_PATH every
# 20,000 steps, keeping at most five of them on disk.
# The per-device batch size is fixed at 8 here (the `batch_size` global is not used).
training_config = {
    'output_dir': MODEL_PATH,
    'overwrite_output_dir': True,
    'num_train_epochs': epochs,
    'per_device_train_batch_size': 8,
    'save_steps': 20_000,
    'save_total_limit': 5,
    'prediction_loss_only': True,
}
training_args = TrainingArguments(**training_config)

# Collator for the masked-language-modeling objective with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss
500,2.0697
1000,2.0485
1500,2.0094
2000,2.0243
2500,2.0218
3000,2.0127
3500,1.9884
4000,1.997
4500,1.9593
5000,1.9527


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-28b552f778fe>", line 29, in <cell line: 29>
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1922, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2257, in _maybe_log_save_evaluate
    self.log(logs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2645, in log
    self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer_callback.p

# Load Model after Crash

In [None]:
# Training was interrupted, so continue from a checkpoint the Trainer saved
# under MODEL_PATH instead of from the in-memory model.
#
# The checkpoint directory is derived from MODEL_PATH (and therefore from
# HF_BERT_MODEL) rather than hard-coded, and AutoModelForMaskedLM selects the
# architecture from the checkpoint's own config. The cell therefore keeps
# working if the base model in the setup cell is changed.
from transformers import AutoModelForMaskedLM

CHECKPOINT_STEP = 80_000  # training step of the checkpoint to restore
checkpoint_dir = os.path.join(MODEL_PATH, f'checkpoint-{CHECKPOINT_STEP}')

model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

# Save Pre-trained model

In [None]:
# Persist the retrained model weights and config.
# The tokenizer is not saved: it is loaded from HF_BERT_MODEL and is never
# modified in this notebook.
pretrained_out = f"./models/{HF_BERT_MODEL}_pretrained_80000.pt"
model.save_pretrained(pretrained_out)


In [None]:
# NOTE(review): presumably a guard that stops "Run all" from continuing past
# this point; inside a notebook it surfaces as a SystemExit message rather
# than terminating the kernel (see the output below). Confirm it is still wanted.
sys.exit(0)

SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
