In [1]:
import os
import torch
import pickle
import glob
import numpy as np
from tqdm.notebook import tqdm, trange
from collections import Counter
from sklearn.metrics import (matthews_corrcoef, confusion_matrix, 
                              accuracy_score, f1_score, precision_score, recall_score)
from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                                  AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer,
                                  XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer,
                                  AdamW, get_linear_schedule_with_warmup)

In [2]:
from utility import SubtaskAData, convert_examples_to_features, SubtaskAProcessor
"""For subtask B,C, import
   SubtaskBData, SubtaskCData
   SubtaskBProcessor, SubtaskCProcessor,
   """
torch.cuda.empty_cache()

In [3]:
"""Load train & test data"""

def load_data(raw_train, raw_test):    
    train = []
    for i, t in enumerate(raw_train.texts):
        if i == -1:
            break
        train.append((t, raw_train.labels[i]))
    
    test = []
    for i, t in enumerate(raw_test.texts):
        test.append((t, raw_test.ids[i]))

    return train, test

raw_train, raw_test = SubtaskAData(path="datasets/OffensEval20").getData()
#For subtask B,C, use SubtaskBProcessor, SubtaskCProcessor
X_train, X_dev = load_data(raw_train, raw_test)

In [4]:
"""!!!Modify this cell to change models!!!"""

Model_type = 'albert'     #All types: bert, roberta, albert, xlmroberta
Model_name = 'albert-base-v2'
    #All models: 'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', 'albert-xxlarge-v1' 
                #'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2'
                #'roberta-base', 'roberta-large'
                #Browse https://huggingface.co/ for more pretrained models
Task_name = 'subtask_A'   #All tasks: subtask_A, subtask_B, subtask_C
Output_path = 'cached-results/' + Model_type + '/'

seq_len = 128
train_batch_size = 4
eval_batch_size = 4
epochs = 6
weight_decay = 0
LR = 5e-6
adam_eps = 1e-9
max_norm = 1.0

Output_path = Output_path+Task_name+"/"+Model_name+"/"
print("Output Dir:",Output_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config_class, model_class, tokenizer_class = AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer
"""For bert, use BertConfig, BertForSequenceClassification, BertTokenizer,
   roberta, use RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
   albert, use AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer,
   xlmroberta, use XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer.
"""  

config = config_class.from_pretrained(Model_name, num_labels=2, finetuning_task=Task_name) #Task A
tokenizer = tokenizer_class.from_pretrained((Model_name))

processor = SubtaskAProcessor(X_train, X_dev)
#For subtask B,C, use SubtaskBProcessor, SubtaskCProcessor
label_list = processor.get_labels()
num_labels = len(label_list)

Output Dir: cached-results/albert/subtask_A/albert-base-v2/


In [5]:
def load_examples(task, tokenizer):
    """Process training data"""
    
    print("Creating features from datasets...")
    label_list = processor.get_labels()
    examples = processor.get_train_examples()
    
    features = convert_examples_to_features(examples, label_list, seq_len, tokenizer,
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id= 0,
        pad_token_segment_id= 0)
        
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = torch.utils.data.TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
                                        
def train(train_dataset, model, tokenizer):
    """Training process"""
    
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=adam_eps)
        
    print("===== Training =====")
    print("  Num examples =", len(train_dataset))
    print("  Num Epochs =", epochs)
    print("  Total train batch size =", train_batch_size)

    step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(epochs, desc="Epoch")
    
    epoch_i = 0
    max_metric = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        epoch_i += 1
        print("Training Epoch", epoch_i)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if Model_type in ['bert'] else None,
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            print("\r%f" % loss, end='')

                
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            tr_loss += loss.item()

            optimizer.step()
            model.zero_grad()

            step += 1

        # Save model checkpoint
        output_dir = os.path.join(Output_path, 'checkpoint-{}'.format(epoch_i))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model.save_pretrained(output_dir)
        print(" ")
        print("Saving model checkpoint to", output_dir)


    return step, tr_loss / step

In [6]:
"""Do train"""
model = model_class.from_pretrained(Model_name, num_labels=num_labels)
model.to(device)
train_dataset = load_examples(Task_name, tokenizer)
step, tr_loss = train(train_dataset, model, tokenizer)
print(" step =", step, ", average loss =", tr_loss)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

Creating features from datasets...


  0%|          | 0/14100 [00:00<?, ?it/s]

===== Training =====
  Num examples = 14100
  Num Epochs = 6
  Total train batch size = 4


Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3525 [00:00<?, ?it/s]

Training Epoch 1
0.796114 
Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/checkpoint-1


Iteration:   0%|          | 0/3525 [00:00<?, ?it/s]

Training Epoch 2
0.422683 
Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/checkpoint-2


Iteration:   0%|          | 0/3525 [00:00<?, ?it/s]

Training Epoch 3
1.313888 
Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/checkpoint-3


Iteration:   0%|          | 0/3525 [00:00<?, ?it/s]

Training Epoch 4
0.942182 
Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/checkpoint-4


Iteration:   0%|          | 0/3525 [00:00<?, ?it/s]

Training Epoch 5
0.002344 
Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/checkpoint-5


Iteration:   0%|          | 0/3525 [00:00<?, ?it/s]

Training Epoch 6
0.000175 
Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/checkpoint-6
 step = 3525 , average loss = 3.1543332351691333


In [7]:
"""Save trained model"""
if not os.path.exists(Output_path):
        os.makedirs(Output_path)
print("Saving model checkpoint to " + Output_path)

model.save_pretrained(Output_path)
tokenizer.save_pretrained(Output_path)

Saving model checkpoint to cached-results/albert/subtask_A/albert-base-v2/


('cached-results/albert/subtask_A/albert-base-v2/tokenizer_config.json',
 'cached-results/albert/subtask_A/albert-base-v2/special_tokens_map.json',
 'cached-results/albert/subtask_A/albert-base-v2/spiece.model',
 'cached-results/albert/subtask_A/albert-base-v2/added_tokens.json')

In [8]:
def prepare_prediction(task, X_predict, tokenizer):
    """Process testing data"""
    
    processor = SubtaskAProcessor(X_predict, None)
    #For subtask B,C, use SubtaskBProcessor, SubtaskCProcessor
    examples = processor.get_train_examples()
    features = convert_examples_to_features(examples, label_list, seq_len, tokenizer,
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id= 0,
        pad_token_segment_id= 0)
    
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = torch.utils.data.TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

def predict_sentences(sentences):
    """Testing process"""
    
    X = [(s, 'OFF') for s in sentences]
    predict_dataset = prepare_prediction(Task_name, X, tokenizer)
    eval_sampler = torch.utils.data.SequentialSampler(predict_dataset)
    eval_dataloader = torch.utils.data.DataLoader(predict_dataset, sampler=eval_sampler, batch_size=eval_batch_size)
    prefix = ""

    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples =", len(predict_dataset))
    print("  Batch size =", eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if Model_type in ['bert'] else None,
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    
    sm = torch.nn.Softmax(dim=1)
    probabilities = sm(torch.from_numpy(preds)).numpy()

    return probabilities

def eval_stats(labels, preds):
    """Generate stats"""
    
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    acc = accuracy_score(labels, preds)
    
    # macro
    f1 = f1_score(labels, preds, average='macro')
    p = precision_score(labels, preds, average='macro')
    r = recall_score(labels, preds, average='macro')
    
    # not
    f1_0 = f1_score(labels, preds, average='binary', pos_label=0)
    p_0 = precision_score(labels, preds, average='binary', pos_label=0)
    r_0 = recall_score(labels, preds, average='binary', pos_label=0)
    
    # off
    f1_1 = f1_score(labels, preds, average='binary', pos_label=1)
    p_1 = precision_score(labels, preds, average='binary', pos_label=1)
    r_1 = recall_score(labels, preds, average='binary', pos_label=1)
    
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "acc" : acc,
        "f1" : f1,
        "precision" : p,
        "recall" : r,
        "p_not" : p_0,
        "r_not" : r_0,
        "f1_not" : f1_0,
        "p_off" : p_1,
        "r_off" : r_1,
        "f1_off" : f1_1
    }, get_mismatched(labels, preds)

"""Do test"""
test_sentences, test_ids = zip(*X_dev)
fold_model_dirs = list(os.path.dirname(c) for c in sorted(glob.glob(Output_path + '/**/' + WEIGHTS_NAME, recursive=True)))
emsemble_preds = np.empty((len(test_sentences), len(fold_model_dirs)))
for i, fold_dir in enumerate(fold_model_dirs):
    print(fold_dir)
    model = model_class.from_pretrained(fold_dir)
    model.to(device)
    prob_scores = predict_sentences(test_sentences)
    predicted_labels = [a.argmax() for a in prob_scores]
    emsemble_preds[:, i] = predicted_labels

cached-results/albert/subtask_A/albert-base-v2\checkpoint-1


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

cached-results/albert/subtask_A/albert-base-v2\checkpoint-2


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

cached-results/albert/subtask_A/albert-base-v2\checkpoint-3


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

cached-results/albert/subtask_A/albert-base-v2\checkpoint-4


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

cached-results/albert/subtask_A/albert-base-v2\checkpoint-5


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

cached-results/albert/subtask_A/albert-base-v2\checkpoint-6


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

cached-results/albert/subtask_A/albert-base-v2


  0%|          | 0/3887 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 3887
  Batch size = 4


Evaluating:   0%|          | 0/972 [00:00<?, ?it/s]

In [9]:
"""Save predictions"""

pickle.dump(emsemble_preds, file=open(os.path.join(Output_path, "testset_predictions.p"), "wb"))
mean_preds = emsemble_preds.mean(axis=1)
mean_preds.tolist()

path = 'cached-results/albert/subtask_A/'
    #For subtask B,C, use subtask_B, subtask_C
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.p' in file:
            files.append(os.path.join(r, file))
print("Total number of predictions:",len(files))
for f in files:
    print(f)

Total number of predictions: 1
cached-results/albert/subtask_A/albert-base-v2\testset_predictions.p


In [11]:
def get_mismatched(labels, preds):
    mismatched = labels != preds
    examples = processor.get_dev_examples()
    wrong = [i for (i, v) in zip(examples, mismatched) if v]
    
    return wrong

def taskA_getid(label):
    if label == 'NOT':
        return 0
    else:
        return 1

def taskB_getid(label):
    if label == 'TIN':
        return 0
    else:
        return 1

test_sentences, test_ids = zip(*X_dev)


"""Emsemble preds from different models"""
preds = []
for f in files:
    #if 'albert-xlarge-v2' in f:
        #preds.append(pickle.load(open( f, "rb" )))
    if 'albert-xlarge-v1' in f:
        preds.append(pickle.load(open( f, "rb" )))
    if 'albert-xxlarge-v2' in f:
        preds.append(pickle.load(open( f, "rb" )))
    if 'albert-xxlarge-v1' in f:
        preds.append(pickle.load(open( f, "rb" )))
    if 'albert-base-v2' in f:
        preds.append(pickle.load(open( f, "rb" )))
    #if 'albert-base-v1' in f:
        #preds.append(pickle.load(open( f, "rb" )))
    #if 'roberta-base' in f:
        #preds.append(pickle.load(open( f, "rb" )))
    #if 'albert-large-v2' in f:
        #preds.append(pickle.load(open( f, "rb" )))
        
merged_preds = np.concatenate(preds, axis = 1)

majority_preds = []
for i in range(merged_preds.shape[0]):
    majority_preds.append(Counter(merged_preds[i].astype(int)).most_common(1)[0][0])
mean_preds = merged_preds.mean(axis=1)
final_preds = majority_preds

lables = []
for i, t in enumerate(raw_test.texts):
      lables.append(taskA_getid(raw_test.labels[i]))
    #For subtask B,C, use taskB_getid, taskC_getid
        
result, wrong = eval_stats(np.array(lables), final_preds)
print("NOT:" + "\t" +  "P: %s" %(str(round(result["p_not"]*100, 3))) + "\t" +  "R: %s" %(str(round(result["r_not"]*100, 2))) + "\t" +  "F1: %s" %(str(round(result["f1_not"]*100, 2))))
print("OFF:" + "\t" +  "P: %s" %(str(round(result["p_off"]*100, 2))) + "\t" +  "R: %s" %(str(round(result["r_off"]*100, 2))) + "\t" +  "F1: %s" %(str(round(result["f1_off"]*100, 2))))
print("F1: %s" %(str(round(result["f1"]*100, 2))) + "\t" + "ACC: %s" %(str(round(result["acc"]*100, 2))))

NOT:	P: 98.501	R: 91.27	F1: 94.75
OFF:	P: 80.95	R: 96.39	F1: 88.0
F1: 91.37	ACC: 92.69
