# Problem 4

In this problem, we simply finetune a BERT model (not pretrained) on RTE dataset, and then finetune a BERT model (pretrained) on RTE dataset.

**IMPORTANT NOTES**:
- Please make sure that you have already read the part of hw5 pdf that corresponds to this problem. This is very important.
- At the end of the hw5, you will need to submit a zip folder containing three things. The instruction is also included in the first paragraph of the hw5 pdf.
  - (1) The writeup pdf containing your solutions to Problems 1, 2, 3, 4, 5. Yes, there're things you need to respond in your writeup (see hw5 pdf).
  - (2) The downloaded colab corresponding to Problem 4.
  - (3) The downloaded colab corresponding to Problem 5.

Some imports and data downloading

In [1]:
import pandas as pd

In [2]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn 
from transformers import AutoTokenizer, BertConfig, BertForSequenceClassification, BertTokenizer,EvalPrediction, GlueDataset, GlueDataTrainingArguments, AutoModel, BertPreTrainedModel, AutoConfig, BertModel
from transformers import GlueDataTrainingArguments 
from transformers import (
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_tasks_num_labels,
    set_seed,
)

import warnings
warnings.filterwarnings('ignore')

In [3]:
#!git clone https://github.com/huggingface/transformers
#!python transformers/utils/download_glue_data.py --tasks RTE
#!pip install transformers

In [4]:
df=pd.read_csv('SST/train.tsv', sep='\t')

In [5]:
df.head()

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


### From non-pretrained BERT

In [6]:
class SequenceClassificationBERT(nn.Module):
      
    def __init__(self, config, bert_model, weights = [0.5,0.5]):
        super().__init__()
        self.config = config
        self.num_labels = config.num_labels
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.bert = bert_model
        self.weights = weights

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # make sure that all the arguments in the forward() function is used
        # somewhere in the code

        ##### 

        # TODO

        #####

        outputs = self.bert(input_ids=input_ids, 
                               attention_mask=attention_mask, 
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask,
                               inputs_embeds=inputs_embeds, 
                               output_attentions=output_attentions,
                               output_hidden_states=output_hidden_states,
                               return_dict=return_dict) 

        pooled_out = outputs[1] 
        pooled_out = self.dropout(pooled_out)
        #print(pooled_out.shape)
        #h_cls = pooled_out[:, 0]
        #print(h_cls.shape)
        logits = self.classifier(pooled_out)

        # RTE (2 classes)
        loss_fct = nn.CrossEntropyLoss(torch.tensor(self.weights, device='cuda'))
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 

        # do not change the lines below, so make sure your code works for the
        # lines below

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output


In [7]:
def compute_metrics_pretrained(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions[0], axis=1) #predictions
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [8]:
def non_pretrained(weights):
    set_seed(42)
    num_labels = glue_tasks_num_labels[data_args.task_name]

    tokenizer = BertTokenizer.from_pretrained(model_name)
    config = BertConfig.from_pretrained(model_name, output_hidden_states=False, output_attentions=True)
    bert_model = AutoModel.from_config(config)
    # Loading a model from its configuration file does not load the model weights. 
    # It only affects the model’s configuration. Use from_pretrained() to load the model weights.

    model = SequenceClassificationBERT(config=config, bert_model=bert_model,weights=weights)

    train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )



    trainer.train()
    trainer.evaluate()
    
    d=trainer.evaluate()
    return d['eval_acc'], d['eval_loss']

### From pretrained BERT

In [9]:
def pretrained(weights):
    set_seed(42)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    config = BertConfig.from_pretrained(model_name, output_hidden_states=False, output_attentions=True)
    bert_model = AutoModel.from_pretrained(model_name, config)

    train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    model = SequenceClassificationBERT(config=config, bert_model=bert_model,weights=weights)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_pretrained,
    )

    trainer.train()
    d=trainer.evaluate()
    return d['eval_acc'], d['eval_loss']

In [10]:
non_pre_acc=[]
pre_acc=[]
non_pre_loss=[]
pre_loss=[]
for i in [0.1, 0.25, 0.5, 0.75, 1]:
    train = pd.read_csv('SST/train.tsv', sep='\t')
    dev = pd.read_csv('SST/dev.tsv', sep='\t')
    test = pd.read_csv('SST/test.tsv', sep='\t')
    
    df1= train[train.label==1].sample(frac=i, random_state=1).reset_index(drop=True)
    print(train.label.value_counts())
    print(df1.shape)
    
    df2= train[train.label==0].reset_index(drop=True)
    frames=[df1,df2]
    df=pd.concat(frames).reset_index(drop=True)
    #df[df['label'] == 0]['label'] = 'neg'
    #df[df['label'] == 1]['label'] = 'pos'
    #print(df.head())
    #df
    #dev[dev['label'] == 0]['label'] = 'neg'
    #dev[dev['label'] == 1]['label'] = 'pos'
    
    df.to_csv('SST_NEW/train.tsv', sep='\t',index=False)
    dev.to_csv('SST_NEW/dev.tsv', sep='\t',index=False)
    test.to_csv('SST_NEW/test.tsv', sep='\t',index=False)
    
    model_name = "bert-base-cased"

    data_args = GlueDataTrainingArguments(task_name="sst-2", data_dir="./SST_NEW")

    training_args = TrainingArguments(
        logging_steps=50, 
        per_device_train_batch_size=32, 
        per_device_eval_batch_size=64, 
        save_steps=1000,
        evaluation_strategy='epoch',
        output_dir="./models/sst-2",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        learning_rate=0.00001,
        num_train_epochs=15,
    )
    weights = torch.tensor([1,1/i])
    print('Fraction ', i)
    print('Non Pretrained ')
    acc1, loss1= non_pretrained(weights)
    print('Pretrained')
    acc2, loss2= pretrained(weights)
    non_pre_acc.append(acc1)
    pre_acc.append(acc2)
    non_pre_loss.append(loss1)
    pre_loss.append(loss2)
    


1    37569
0    29780
Name: label, dtype: int64
(3757, 2)
Fraction  0.1
Non Pretrained 


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.5647,0.509555,0.727064,11.4245,76.327
2,0.4493,0.871545,0.730505,11.1722,78.051
3,0.2963,0.683795,0.763761,11.172,78.052
4,0.343,1.246229,0.692661,11.0631,78.82
5,0.2907,0.998677,0.728211,11.1593,78.141
6,0.2787,1.231931,0.732798,11.1494,78.211
7,0.2617,0.829916,0.766055,11.1026,78.54
8,0.2416,1.457525,0.735092,11.1545,78.175
9,0.2186,1.351178,0.774083,11.2313,77.64
10,0.1901,1.873821,0.737385,11.1927,77.908


Pretrained


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.2904,0.475874,0.876147,2.239,389.456
2,0.3006,0.652426,0.87844,2.2297,391.081
3,0.1602,0.558211,0.893349,2.2461,388.226
4,0.1755,0.803028,0.891055,2.2665,384.733
5,0.0531,0.553898,0.91055,2.2549,386.717
6,0.063,1.102687,0.870413,2.2428,388.8
7,0.056,0.921723,0.900229,2.2372,389.776
8,0.008,1.141868,0.896789,2.2343,390.281
9,0.0291,1.21232,0.899083,2.2475,387.981
10,0.0576,1.545506,0.883028,2.2607,385.72


1    37569
0    29780
Name: label, dtype: int64
(9392, 2)
Fraction  0.25
Non Pretrained 


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.5311,0.63359,0.709862,11.5313,75.62
2,0.3814,0.832522,0.741972,11.1643,78.106
3,0.2795,0.57683,0.762615,11.1931,77.905
4,0.3023,1.099218,0.731651,11.2844,77.275
5,0.2534,1.181514,0.696101,11.1618,78.124
6,0.2231,1.408228,0.713303,11.1504,78.203
7,0.1994,0.810309,0.759174,11.1328,78.327
8,0.1681,1.959038,0.706422,11.1359,78.305
9,0.1882,1.39108,0.761468,11.1016,78.547
10,0.1458,1.426921,0.752294,11.0459,78.943


Pretrained


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.2576,0.506093,0.868119,2.2614,385.597
2,0.213,0.755722,0.856651,2.2689,384.326
3,0.1217,0.6832,0.868119,2.2635,385.251
4,0.0989,0.772969,0.883028,2.2623,385.452
5,0.0636,0.642849,0.892202,2.2625,385.421
6,0.0744,1.013363,0.87156,2.2649,385.009
7,0.04,0.670802,0.902523,2.2607,385.727
8,0.0199,1.194721,0.886468,2.2635,385.242
9,0.0178,1.503488,0.876147,2.2639,385.179
10,0.0008,1.752797,0.869266,2.2651,384.97


1    37569
0    29780
Name: label, dtype: int64
(18784, 2)
Fraction  0.5
Non Pretrained 


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.4175,0.917208,0.641055,11.4117,76.413
2,0.2984,0.872383,0.739679,11.0094,79.205
3,0.2228,0.629225,0.760321,11.0771,78.721
4,0.2252,1.447408,0.68922,11.0796,78.703
5,0.1962,0.970847,0.701835,11.0903,78.627
6,0.1804,1.305283,0.701835,11.2006,77.853
7,0.1611,0.949712,0.740826,11.1343,78.316
8,0.1371,2.02859,0.675459,11.0034,79.248
9,0.1402,1.290353,0.761468,11.1884,77.938
10,0.1132,1.288357,0.764908,11.0033,79.249


Pretrained


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.2036,0.478825,0.873853,2.2627,385.372
2,0.151,0.76984,0.852064,2.2606,385.737
3,0.0814,0.605175,0.876147,2.2637,385.21
4,0.0658,0.702679,0.881881,2.2634,385.267
5,0.0578,0.512441,0.902523,2.2617,385.552
6,0.0335,0.918392,0.872706,2.265,384.982
7,0.0298,0.770221,0.895642,2.2636,385.221
8,0.0088,0.931873,0.896789,2.2612,385.628
9,0.0148,1.229621,0.877294,2.2628,385.359
10,0.007,1.303206,0.875,2.2601,385.815


1    37569
0    29780
Name: label, dtype: int64
(28177, 2)
Fraction  0.75
Non Pretrained 


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.3433,0.7202,0.68578,11.3995,76.495
2,0.2507,0.993053,0.680046,11.0549,78.879
3,0.1932,0.581881,0.761468,11.05,78.914
4,0.197,1.244653,0.68578,11.1224,78.401
5,0.1709,1.097437,0.663991,11.1698,78.067
6,0.1549,1.245999,0.683486,11.0547,78.88
7,0.14,0.887505,0.740826,11.1874,77.945
8,0.1221,1.705644,0.677752,11.131,78.34
9,0.1169,1.14585,0.75,11.1326,78.328
10,0.0995,1.139463,0.760321,11.0809,78.694


Pretrained


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.1693,0.475354,0.866972,2.2561,386.507
2,0.1222,0.690083,0.848624,2.2655,384.912
3,0.0724,0.727412,0.848624,2.2681,384.456
4,0.0524,0.551563,0.893349,2.271,383.971
5,0.0349,0.530743,0.900229,2.2709,383.983
6,0.0339,0.729244,0.881881,2.2735,383.556
7,0.0178,0.593829,0.901376,2.2634,385.266
8,0.0082,0.96916,0.879587,2.2682,384.447
9,0.0066,1.0226,0.881881,2.2638,385.186
10,0.0051,1.031071,0.886468,2.2606,385.744


1    37569
0    29780
Name: label, dtype: int64
(37569, 2)
Fraction  1
Non Pretrained 


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.2923,0.949649,0.588303,11.4515,76.147
2,0.2188,1.05226,0.635321,11.0097,79.203
3,0.1765,0.766539,0.701835,11.1276,78.363
4,0.1764,1.123114,0.701835,11.1183,78.429
5,0.1453,0.897699,0.683486,11.1487,78.215
6,0.1298,1.241023,0.672018,11.1051,78.522
7,0.1233,0.92503,0.724771,11.2049,77.823
8,0.1058,1.602572,0.678899,11.2042,77.828
9,0.1043,1.207148,0.725917,11.1378,78.292
10,0.09,1.109685,0.747706,11.1036,78.533


Pretrained


Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,0.1448,0.412951,0.870413,2.2577,386.234
2,0.1035,0.597958,0.853211,2.2693,384.266
3,0.062,0.523389,0.860092,2.2588,386.04
4,0.0475,0.559671,0.883028,2.2596,385.915
5,0.0409,0.44275,0.895642,2.2573,386.305
6,0.0315,0.687456,0.87844,2.2662,384.782
7,0.0128,0.655201,0.895642,2.2611,385.657
8,0.0131,0.773781,0.888761,2.2584,386.121
9,0.0084,0.916159,0.879587,2.2586,386.072
10,0.0046,0.878264,0.897936,2.2593,385.968


In [11]:
print(non_pre_acc)
print(pre_acc)
print(non_pre_loss)
print(pre_loss)

[0.7408256880733946, 0.7408256880733946, 0.7327981651376146, 0.7282110091743119, 0.7178899082568807]
[0.8967889908256881, 0.8807339449541285, 0.8830275229357798, 0.8807339449541285, 0.8818807339449541]
[2.3670077323913574, 2.324096202850342, 2.0706653594970703, 1.841522455215454, 1.651656150817871]
[1.5855176448822021, 1.624110460281372, 1.3880287408828735, 1.2626886367797852, 1.1161582469940186]


non_pre_acc=[]
pre_acc=[]
non_pre_loss=[]
pre_loss=[]
for i in [0.1, 0.25, 0.5, 0.75, 1]:
    train = pd.read_csv('SST/train.tsv', sep='\t')
    dev = pd.read_csv('SST/dev.tsv', sep='\t')
    test = pd.read_csv('SST/test.tsv', sep='\t')
    
    df1= train[train.label==0].sample(frac=i, random_state=1).reset_index(drop=True)
    print(train.label.value_counts())
    print(df1.shape)
    
    df2= train[train.label==1].reset_index(drop=True)
    frames=[df1,df2]
    df=pd.concat(frames).reset_index(drop=True)
    #df[df['label'] == 0]['label'] = 'neg'
    #df[df['label'] == 1]['label'] = 'pos'
    #print(df.head())
    #df
    #dev[dev['label'] == 0]['label'] = 'neg'
    #dev[dev['label'] == 1]['label'] = 'pos'
    
    df.to_csv('SST_NEW/train.tsv', sep='\t',index=False)
    dev.to_csv('SST_NEW/dev.tsv', sep='\t',index=False)
    test.to_csv('SST_NEW/test.tsv', sep='\t',index=False)
    
    model_name = "bert-base-cased"

    data_args = GlueDataTrainingArguments(task_name="sst-2", data_dir="./SST_NEW")

    training_args = TrainingArguments(
        logging_steps=50, 
        per_device_train_batch_size=32, 
        per_device_eval_batch_size=64, 
        save_steps=1000,
        evaluation_strategy='epoch',
        output_dir="./models/sst-2",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        do_predict=True,
        learning_rate=0.00001,
        num_train_epochs=15,
    )
    weights = torch.tensor([1/i,1])
    print('Fraction ', i)
    print('Non Pretrained ')
    acc1, loss1= non_pretrained(weights)
    print('Pretrained')
    acc2, loss2= pretrained(weights)
    non_pre_acc.append(acc1)
    pre_acc.append(acc2)
    non_pre_loss.append(loss1)
    pre_loss.append(loss2)
    


print(non_pre_acc1)
print(pre_acc1)
print(non_pre_loss1)
print(pre_loss1)