In [1]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn 
from transformers import AutoTokenizer, EvalPrediction, GlueDataset, GlueDataTrainingArguments, AutoModel, BertPreTrainedModel, AutoConfig, BertModel
from transformers import GlueDataTrainingArguments 
from transformers import (
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_tasks_num_labels,
    set_seed,
)

import warnings
warnings.filterwarnings('ignore')

In [2]:
#!git clone https://github.com/huggingface/transformers
#!python transformers/utils/download_glue_data.py --tasks RTE
#!pip install transformers

### From non-pretrained BERT

In [3]:
class SequenceClassificationBERT(nn.Module):
      
    def __init__(self, config, bert_model, weights = [0.5,0.5]):
        super().__init__()
        self.config = config
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.bert = bert_model
        self.weights = weights

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # make sure that all the arguments in the forward() function is used
        # somewhere in the code

        ##### 

        # TODO

        #####

        outputs = self.bert(input_ids=input_ids, 
                               attention_mask=attention_mask, 
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask,
                               inputs_embeds=inputs_embeds, 
                               output_attentions=output_attentions,
                               output_hidden_states=output_hidden_states,
                               return_dict=return_dict) 

        pooled_out = outputs[1] 
        pooled_out = self.dropout(pooled_out)
        #print(pooled_out.shape)
        #h_cls = pooled_out[:, 0]
        #print(h_cls.shape)
        logits = self.classifier(pooled_out)

        # RTE (2 classes)
        loss_fct = nn.CrossEntropyLoss(torch.tensor(self.weights, device='cuda'))
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 

        # do not change the lines below, so make sure your code works for the
        # lines below

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output


In [4]:
def compute_metrics_pretrained(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions[0], axis=1) #predictions
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [5]:
def non_pretrained(weights):
    set_seed(42)
    num_labels = glue_tasks_num_labels[data_args.task_name]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, output_hidden_states=False, output_attentions=True)
    bert_model = AutoModel.from_config(config)
    # Loading a model from its configuration file does not load the model weights. 
    # It only affects the model’s configuration. Use from_pretrained() to load the model weights.

    model = SequenceClassificationBERT(config=config, bert_model=bert_model,weights=weights)

    train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )



    trainer.train()
    trainer.evaluate()
    
    d=trainer.evaluate()
    return d['eval_acc'], d['eval_loss']

### From pretrained BERT

In [6]:
def pretrained(weights):
    set_seed(42)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, output_hidden_states=False, output_attentions=True)
    bert_model = AutoModel.from_pretrained(model_name, config)

    train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    model = SequenceClassificationBERT(config=config, bert_model=bert_model, weights=weights)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_pretrained,
    )

    trainer.train()
    d=trainer.evaluate()
    return d['eval_acc'], d['eval_loss']

In [7]:
non_pre_acc=[]
pre_acc=[]
non_pre_loss=[]
pre_loss=[]
for i in [0.1, 0.25, 0.5, 0.75, 1]:
    train = pd.read_csv('RTE/train.tsv', sep='\t')
    dev = pd.read_csv('RTE/dev.tsv', sep='\t')
    test = pd.read_csv('RTE/test.tsv', sep='\t')
    
    df1= train[train.label=='not_entailment'].sample(frac=i, random_state=1).reset_index(drop=True)
    print(train.label.value_counts())
    print(df1.shape)
    df2= train[train.label=='entailment'].reset_index(drop=True)
    frames=[df1,df2]
    df=pd.concat(frames).reset_index(drop=True)
    
    df.to_csv('RTE_NEW/train.tsv', sep='\t',index=False)
    dev.to_csv('RTE_NEW/dev.tsv', sep='\t',index=False)
    test.to_csv('RTE_NEW/test.tsv', sep='\t',index=False)
    
    model_name = "bert-base-uncased"

    data_args = GlueDataTrainingArguments(task_name="rte", data_dir="./RTE_NEW")

    training_args = TrainingArguments(
        logging_steps=50, 
        per_device_train_batch_size=32, 
        per_device_eval_batch_size=64, 
        save_steps=1000,
        evaluation_strategy='epoch',
        output_dir="./models/rte",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        learning_rate=0.00001,
        num_train_epochs=15,
    )
    weights = torch.tensor([1,1/i])
    acc1, loss1= non_pretrained(weights)
    acc2, loss2= pretrained(weights)
    non_pre_acc.append(acc1)
    pre_acc.append(acc2)
    non_pre_loss.append(loss1)
    pre_loss.append(loss2)
    


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(124, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.4802,0.384969,0.472924,3.5016,79.107
2,0.4786,0.367571,0.472924,3.5358,78.342
3,0.4695,0.339333,0.472924,3.5401,78.246
4,0.4695,0.343571,0.472924,3.5174,78.752
5,0.4722,0.345816,0.472924,3.5101,78.916
6,0.4717,0.332142,0.472924,3.5452,78.133
7,0.4518,0.334457,0.472924,3.5117,78.878
8,0.4611,0.435886,0.480144,3.5597,77.815
9,0.3922,0.457539,0.494585,3.5329,78.406
10,0.364,0.805197,0.498195,3.5094,78.93


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.5527,0.367333,0.472924,0.734,377.378
2,0.4503,0.33495,0.472924,0.7361,376.292
3,0.4051,0.398969,0.501805,0.7349,376.916
4,0.3425,0.506724,0.638989,0.7354,376.644
5,0.2617,0.558075,0.66426,0.7354,376.689
6,0.1625,0.750224,0.642599,0.7341,377.321
7,0.1236,0.828695,0.635379,0.7343,377.222
8,0.1004,0.962463,0.649819,0.7347,377.002
9,0.0699,1.120447,0.646209,0.7329,377.947
10,0.061,1.284714,0.638989,0.7349,376.9


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(310, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.6607,0.648006,0.472924,3.5426,78.191
2,0.6599,0.589468,0.472924,3.5394,78.262
3,0.65,0.545528,0.472924,3.5446,78.148
4,0.6468,0.549448,0.472924,3.5418,78.208
5,0.6541,0.560168,0.472924,3.547,78.093
6,0.6418,0.527236,0.472924,3.5267,78.544
7,0.6337,0.535244,0.472924,3.5201,78.691
8,0.5958,0.714726,0.505415,3.5056,79.016
9,0.5256,0.764575,0.537906,3.5446,78.148
10,0.4809,0.783615,0.490975,3.5433,78.175


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.6703,0.564893,0.472924,0.734,377.395
2,0.6074,0.54394,0.519856,0.7361,376.293
3,0.5449,0.59755,0.606498,0.7334,377.679
4,0.4654,0.657721,0.646209,0.7347,377.02
5,0.3512,0.73579,0.649819,0.7339,377.41
6,0.2054,0.860927,0.635379,0.7337,377.515
7,0.159,0.818332,0.66065,0.7337,377.544
8,0.1193,1.167462,0.642599,0.7344,377.166
9,0.0765,1.252027,0.642599,0.7351,376.844
10,0.058,1.390512,0.646209,0.7345,377.109


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(620, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.7118,0.792534,0.527076,3.5419,78.206
2,0.7067,0.731673,0.523466,3.5258,78.564
3,0.7039,0.683183,0.534296,3.5426,78.19
4,0.6949,0.674363,0.476534,3.5405,78.237
5,0.7046,0.693613,0.559567,3.5604,77.801
6,0.6782,0.654412,0.480144,3.5069,78.988
7,0.671,0.661799,0.498195,3.5084,78.954
8,0.6605,0.864798,0.519856,3.5084,78.954
9,0.5476,0.915106,0.541516,3.5081,78.961
10,0.4801,0.960996,0.519856,3.5243,78.597


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.7025,0.690202,0.555957,0.7304,379.27
2,0.6476,0.676017,0.610108,0.7326,378.117
3,0.5871,0.694391,0.620939,0.7376,375.518
4,0.5055,0.732776,0.631769,0.736,376.366
5,0.3884,0.823667,0.631769,0.7369,375.905
6,0.2362,0.937005,0.624549,0.7387,374.991
7,0.1906,0.958401,0.638989,0.7366,376.063
8,0.1501,1.184606,0.628159,0.7353,376.697
9,0.1039,1.178029,0.65704,0.7367,376.015
10,0.0883,1.311075,0.66065,0.7348,376.986


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(930, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.6872,0.872348,0.527076,3.5216,78.656
2,0.6832,0.767056,0.527076,3.5296,78.478
3,0.6844,0.726461,0.527076,3.5292,78.487
4,0.6732,0.719567,0.527076,3.548,78.072
5,0.6857,0.736455,0.523466,3.5619,77.767
6,0.6559,0.69058,0.519856,3.5284,78.506
7,0.6418,0.755327,0.534296,3.5187,78.721
8,0.5587,1.091507,0.527076,3.5605,77.797
9,0.494,0.895535,0.534296,3.5607,77.794
10,0.4546,1.003952,0.512635,3.5277,78.521


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.6818,0.726977,0.563177,0.7343,377.22
2,0.6248,0.712586,0.592058,0.7345,377.145
3,0.5726,0.706739,0.620939,0.7356,376.579
4,0.491,0.761384,0.642599,0.7351,376.83
5,0.3841,0.839095,0.642599,0.7349,376.9
6,0.2292,0.938746,0.638989,0.7359,376.384
7,0.191,0.987098,0.631769,0.733,377.875
8,0.1517,1.207218,0.617329,0.7334,377.672
9,0.1112,1.310975,0.65343,0.7345,377.14
10,0.087,1.300988,0.646209,0.7334,377.694


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(1240, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.6491,0.862271,0.527076,3.528,78.514
2,0.6431,0.768264,0.527076,3.5413,78.219
3,0.6513,0.741244,0.527076,3.5239,78.607
4,0.6367,0.722118,0.527076,3.5059,79.01
5,0.6488,0.735004,0.519856,3.5477,78.078
6,0.6189,0.693059,0.537906,3.5525,77.973
7,0.6156,0.849838,0.519856,3.5082,78.957
8,0.5438,0.780382,0.559567,3.5265,78.548
9,0.4968,0.931023,0.530686,3.5404,78.24
10,0.4972,0.928974,0.527076,3.5291,78.49


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.6496,0.725813,0.552347,0.7321,378.341
2,0.589,0.715586,0.577617,0.7335,377.643
3,0.5438,0.704,0.638989,0.7319,378.46
4,0.4638,0.756826,0.635379,0.733,377.925
5,0.3675,0.848482,0.638989,0.734,377.39
6,0.2206,0.902078,0.631769,0.7326,378.105
7,0.1804,1.02541,0.635379,0.7348,376.968
8,0.1446,1.170527,0.624549,0.735,376.851
9,0.1065,1.25046,0.638989,0.7331,377.842
10,0.0936,1.253942,0.642599,0.7331,377.834


In [8]:
print(non_pre_acc)
print(pre_acc)
print(non_pre_loss)
print(pre_loss)

[0.5270758122743683, 0.5379061371841155, 0.5270758122743683, 0.5487364620938628, 0.5595667870036101]
[0.6462093862815884, 0.6425992779783394, 0.6534296028880866, 0.6534296028880866, 0.6534296028880866]
[0.8413060307502747, 1.0962923765182495, 1.2345272302627563, 1.285244345664978, 1.1910253763198853]
[1.5908265113830566, 1.8645741939544678, 1.9019352197647095, 1.6948457956314087, 1.5574308633804321]


In [None]:
non_pre_acc1=[]
pre_acc1=[]
non_pre_loss1=[]
pre_loss1=[]
for i in [0.1, 0.25, 0.5, 0.75, 1]:
    train = pd.read_csv('RTE/train.tsv', sep='\t')
    dev = pd.read_csv('RTE/dev.tsv', sep='\t')
    test = pd.read_csv('RTE/test.tsv', sep='\t')
    
    df1= train[train.label=='not_entailment'].reset_index(drop=True)
    
    df2= train[train.label=='entailment'].sample(frac=i, random_state=1).reset_index(drop=True)
    print(train.label.value_counts())
    print(df2.shape)
    frames=[df1,df2]
    df=pd.concat(frames).reset_index(drop=True)
    
    df.to_csv('RTE_NEW/train.tsv', sep='\t',index=False)
    dev.to_csv('RTE_NEW/dev.tsv', sep='\t',index=False)
    test.to_csv('RTE_NEW/test.tsv', sep='\t',index=False)
    
    model_name = "bert-base-uncased"

    data_args = GlueDataTrainingArguments(task_name="rte", data_dir="./RTE_NEW")

    training_args = TrainingArguments(
        logging_steps=50, 
        per_device_train_batch_size=32, 
        per_device_eval_batch_size=64, 
        save_steps=1000,
        evaluation_strategy='epoch',
        output_dir="./models/rte",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        learning_rate=0.00001,
        num_train_epochs=15,
    )
    weights = torch.tensor([1/i,1])
    acc1, loss1= non_pretrained(weights)
    acc2, loss2= pretrained(weights)
    non_pre_acc1.append(acc1)
    pre_acc1.append(acc2)
    non_pre_loss1.append(loss1)
    pre_loss1.append(loss2)
    


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(125, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.2085,0.308046,0.527076,3.5261,78.557
2,0.1914,0.307737,0.527076,3.5016,79.106
3,0.2029,0.307088,0.527076,3.5384,78.283
4,0.1949,0.301927,0.527076,3.5379,78.295
5,0.2006,0.301159,0.527076,3.5021,79.095
6,0.1948,0.306284,0.527076,3.5053,79.022
7,0.2025,0.298461,0.527076,3.5418,78.208
8,0.187,0.311382,0.527076,3.5025,79.087
9,0.1912,0.294798,0.527076,3.5401,78.247
10,0.1778,0.326065,0.530686,3.5029,79.076


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.2864,0.304162,0.527076,0.733,377.892
2,0.1847,0.296987,0.527076,1.5785,175.482
3,0.1847,0.276779,0.552347,0.7358,376.466
4,0.1541,0.291632,0.570397,0.7349,376.931
5,0.1355,0.318998,0.570397,0.7358,376.445
6,0.0957,0.338488,0.599278,0.7358,376.437
7,0.0851,0.363328,0.602888,0.7353,376.695
8,0.0656,0.367842,0.606498,0.7352,376.75
9,0.0558,0.406682,0.599278,0.734,377.39
10,0.0495,0.416673,0.631769,0.7355,376.638


entailment        1249
not_entailment    1240
Name: label, dtype: int64
(312, 4)


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.3707,0.536404,0.527076,3.665,75.579
2,0.3491,0.523328,0.527076,3.4887,79.4
3,0.3672,0.516419,0.527076,3.5014,79.111
4,0.3523,0.510556,0.527076,3.5189,78.717
5,0.3633,0.509069,0.527076,3.5007,79.127
6,0.3497,0.519474,0.527076,3.5218,78.653
7,0.3635,0.489655,0.527076,3.5459,78.118
8,0.3327,0.512821,0.527076,3.5048,79.034
9,0.3215,0.578206,0.541516,3.5261,78.556
10,0.2849,0.635432,0.523466,3.5024,79.089


In [None]:
print(non_pre_acc1)
print(pre_acc1)
print(non_pre_loss1)
print(pre_loss1)