In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import BertTokenizer
from transformers import RobertaTokenizer
# from transformers import GPT2TokenizerFast
from transformers import AlbertTokenizer
from transformers import DistilBertTokenizer
from transformers import TrainingArguments, Trainer
from transformers import BertForSequenceClassification
from transformers import RobertaForSequenceClassification
# from transformers import GPT2ForSequenceClassification
from transformers import AlbertForSequenceClassification
from transformers import DistilBertForSequenceClassification
import warnings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the two ht_* corpora and the three MBart mt_* outputs (presumably human
# vs. machine translations, judging by the file names), dropping every row that
# contains a missing value straight after reading.
def _read_complete_rows(csv_path):
    """Read a CSV file and return only the rows without missing values."""
    return pd.read_csv(csv_path).dropna()


ht_small = _read_complete_rows('/Volumes/T7/mt-hemingway/data/hypothesis/small.csv')
ht_big = _read_complete_rows('/Volumes/T7/mt-hemingway/data/hypothesis/big.csv')
mt_first = _read_complete_rows('/Volumes/T7/mt-hemingway/data/hypothesis/exp1/MBart/MBart_KO1.csv')
mt_second = _read_complete_rows('/Volumes/T7/mt-hemingway/data/hypothesis/exp1/MBart/MBart_KO2.csv')
mt_third = _read_complete_rows('/Volumes/T7/mt-hemingway/data/hypothesis/exp1/MBart/MBart_KO3.csv')

In [3]:
# Build the six ht-vs-mt corpora: every ht corpus (small / big) is paired with
# every mt output (first / second / third). Each side is shuffled and the two
# shuffled frames are stacked, ht rows first.
#
# The shuffles are seeded. Without a seed the row order differs on every run,
# so the fixed random_state passed to train_test_split later in the notebook
# would still select different rows each time.
SHUFFLE_SEED = 42


def _shuffled(frame):
    """Return all rows of ``frame`` in a reproducible random order."""
    return frame.sample(n=frame.shape[0], random_state=SHUFFLE_SEED)


def _stack_shuffled(ht_frame, mt_frame):
    """Shuffle both frames and concatenate them (ht rows, then mt rows)."""
    return pd.concat([_shuffled(ht_frame), _shuffled(mt_frame)])


small_first = _stack_shuffled(ht_small, mt_first)
small_second = _stack_shuffled(ht_small, mt_second)
small_third = _stack_shuffled(ht_small, mt_third)
big_first = _stack_shuffled(ht_big, mt_first)
big_second = _stack_shuffled(ht_big, mt_second)
big_third = _stack_shuffled(ht_big, mt_third)

In [4]:
def convert_label(data:pd.DataFrame):
    """Encode the two classes found in ``data['label']`` as 0 and 1.

    The distinct label values are taken in the sorted order produced by
    ``groupby``: the first one becomes 0 and the second one becomes 1. Any
    further label values are left unchanged.

    Args:
        data: frame with a ``label`` column holding at least two distinct values.

    Returns:
        A copy of ``data`` whose ``label`` column is recoded. The input frame
        itself is not modified, so re-running a cell never sees half-converted
        data.

    Raises:
        IndexError: if ``label`` has fewer than two distinct values.
    """
    # Compute the sorted distinct labels once instead of once per dict key.
    classes = data['label'].groupby(data['label']).count().keys()
    mapping = {classes[0]: 0, classes[1]: 1}

    # Work on a copy so the caller's frame is not mutated behind its back.
    encoded = data.copy()
    encoded['label'] = encoded['label'].replace(mapping)
    return encoded

In [5]:
# Recode the label column of all six corpora to 0/1 in a single pass.
(
    small_first,
    small_second,
    small_third,
    big_first,
    big_second,
    big_third,
) = map(
    convert_label,
    (small_first, small_second, small_third, big_first, big_second, big_third),
)

In [6]:
%%time
# This run uses the big_third corpus: texts are the inputs, 0/1 labels the targets.
X = big_third['text'].tolist()
y = big_third['label'].tolist()

# 80/20 split, stratified on the labels so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

CPU times: user 1.56 ms, sys: 453 µs, total: 2.01 ms
Wall time: 1.63 ms


In [7]:
# Redundant re-import: warnings is already imported in the notebook's first cell.
import warnings

# Checkpoint identifiers shared by each tokenizer/model pair.
BERT_CHECKPOINT = 'bert-base-uncased'
ALBERT_CHECKPOINT = 'albert-base-v2'
ROBERTA_CHECKPOINT = 'roberta-base'

# Tokenizers first ...
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
albert_tokenizer = AlbertTokenizer.from_pretrained(ALBERT_CHECKPOINT)
roberta_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)

# ... then the matching sequence-classification models, each with two output labels.
bert_model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)
albert_model = AlbertForSequenceClassification.from_pretrained(ALBERT_CHECKPOINT, num_labels=2)
roberta_model = RobertaForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
# Place BERT and ALBERT on the 'mps' device; RoBERTa is moved to the CPU instead.
device = torch.device('mps')
cpu_device = torch.device('cpu')

bert_model.to(device)
roberta_model.to(cpu_device)
albert_model.to(device)

# Last expression of the cell: display the selected device.
device

device(type='mps')

In [9]:
# Tokenize the train and test texts once per model family. Every call pads and
# truncates; no explicit max_length is passed.
_ENCODE_OPTIONS = {'padding': True, 'truncation': True}

X_train_tokenized_bert = bert_tokenizer(X_train, **_ENCODE_OPTIONS)
X_test_tokenized_bert = bert_tokenizer(X_test, **_ENCODE_OPTIONS)

X_train_tokenized_roberta = roberta_tokenizer(X_train, **_ENCODE_OPTIONS)
X_test_tokenized_roberta = roberta_tokenizer(X_test, **_ENCODE_OPTIONS)

X_train_tokenized_albert = albert_tokenizer(X_train, **_ENCODE_OPTIONS)
X_test_tokenized_albert = albert_tokenizer(X_test, **_ENCODE_OPTIONS)

In [10]:
#create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from field name (it must include ``"input_ids"``)
            to a per-sample indexable collection, e.g. a tokenizer's output.
        labels: optional per-sample labels, indexable by sample position.
            ``None`` or an empty collection means "no labels".
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, plus ``"labels"`` if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly: relying on truthiness raises for
        # multi-element numpy arrays / tensors ("truth value ... is ambiguous").
        # Empty label collections are still treated as absent.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

In [11]:
# Wrap each tokenized split together with its labels; the same y_train / y_test
# lists are shared by all three model families.
train_dataset_bert = Dataset(X_train_tokenized_bert, labels=y_train)
test_dataset_bert = Dataset(X_test_tokenized_bert, labels=y_test)

train_dataset_roberta = Dataset(X_train_tokenized_roberta, labels=y_train)
test_dataset_roberta = Dataset(X_test_tokenized_roberta, labels=y_test)

train_dataset_albert = Dataset(X_train_tokenized_albert, labels=y_train)
test_dataset_albert = Dataset(X_test_tokenized_albert, labels=y_test)

In [12]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 from evaluation output.

    Args:
        p: object that unpacks into ``(predictions, labels)``. ``predictions``
            holds one row of per-class scores per sample; the predicted class
            is the arg-max along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)
    return {
        'accuracy': accuracy_score(y_true=labels, y_pred=predicted),
        'precision': precision_score(y_true=labels, y_pred=predicted),
        'recall': recall_score(y_true=labels, y_pred=predicted),
        'f1': f1_score(y_true=labels, y_pred=predicted),
    }

In [13]:
# BERT trainer: one epoch, batch size 8, use_mps_device=True.
bert_training_options = dict(
    output_dir='output',
    num_train_epochs=1,  # more epochs may give higher accuracy
    per_device_train_batch_size=8,
    use_mps_device=True,
)
args_bert = TrainingArguments(**bert_training_options)

trainer_bert = Trainer(
    model=bert_model,
    args=args_bert,
    train_dataset=train_dataset_bert,
    eval_dataset=test_dataset_bert,
    compute_metrics=compute_metrics,
)

In [14]:
# RoBERTa trainer: one epoch, batch size 8. Unlike the BERT and ALBERT
# configurations in this notebook, use_mps_device is not passed here.
roberta_training_options = dict(
    output_dir='output',
    num_train_epochs=1,  # more epochs may give higher accuracy
    per_device_train_batch_size=8,
)
args_roberta = TrainingArguments(**roberta_training_options)

trainer_roberta = Trainer(
    model=roberta_model,
    args=args_roberta,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [17]:
# ALBERT trainer: one epoch, batch size 8, use_mps_device=True.
albert_training_options = dict(
    output_dir='output',
    num_train_epochs=1,  # more epochs may give higher accuracy
    per_device_train_batch_size=8,
    use_mps_device=True,
)
args_albert = TrainingArguments(**albert_training_options)

trainer_albert = Trainer(
    model=albert_model,
    args=args_albert,
    train_dataset=train_dataset_albert,
    eval_dataset=test_dataset_albert,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
# Fine-tune BERT on train_dataset_bert using the settings in args_bert; the
# returned TrainOutput is displayed as the cell result.
trainer_bert.train()

***** Running training *****
  Num examples = 1228
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 154
  Number of trainable parameters = 109483778
100%|██████████| 154/154 [01:31<00:00,  1.82it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 154/154 [01:31<00:00,  1.68it/s]

{'train_runtime': 91.6352, 'train_samples_per_second': 13.401, 'train_steps_per_second': 1.681, 'train_loss': 0.6099062337503804, 'epoch': 1.0}





TrainOutput(global_step=154, training_loss=0.6099062337503804, metrics={'train_runtime': 91.6352, 'train_samples_per_second': 13.401, 'train_steps_per_second': 1.681, 'train_loss': 0.6099062337503804, 'epoch': 1.0})

In [15]:
# Fine-tune RoBERTa on train_dataset_roberta using the settings in args_roberta.
# NOTE(review): in the recorded run this took about 13 minutes versus about 1.5
# minutes for BERT. roberta_model was moved to the CPU earlier and args_roberta
# does not set use_mps_device -- presumably the cause; confirm.
trainer_roberta.train()

***** Running training *****
  Num examples = 1228
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 154
  Number of trainable parameters = 124647170
100%|██████████| 154/154 [13:00<00:00,  4.39s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 154/154 [13:00<00:00,  5.07s/it]

{'train_runtime': 780.0745, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.197, 'train_loss': 0.6896689774154069, 'epoch': 1.0}





TrainOutput(global_step=154, training_loss=0.6896689774154069, metrics={'train_runtime': 780.0745, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.197, 'train_loss': 0.6896689774154069, 'epoch': 1.0})

In [18]:
# Fine-tune ALBERT on train_dataset_albert using the settings in args_albert; the
# returned TrainOutput is displayed as the cell result.
trainer_albert.train()

***** Running training *****
  Num examples = 1228
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 154
  Number of trainable parameters = 11685122
100%|██████████| 154/154 [01:15<00:00,  2.15it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 154/154 [01:15<00:00,  2.05it/s]

{'train_runtime': 75.1786, 'train_samples_per_second': 16.334, 'train_steps_per_second': 2.048, 'train_loss': 0.7131630042930702, 'epoch': 1.0}





TrainOutput(global_step=154, training_loss=0.7131630042930702, metrics={'train_runtime': 75.1786, 'train_samples_per_second': 16.334, 'train_steps_per_second': 2.048, 'train_loss': 0.7131630042930702, 'epoch': 1.0})

In [16]:
# Evaluate the fine-tuned BERT model on test_dataset_bert; the reported
# accuracy / precision / recall / f1 values come from compute_metrics.
trainer_bert.evaluate()

***** Running Evaluation *****
  Num examples = 308
  Batch size = 8
100%|██████████| 39/39 [00:05<00:00,  6.54it/s]

<class 'transformers.trainer_utils.EvalPrediction'>





{'eval_loss': 0.4934057891368866,
 'eval_accuracy': 0.7564935064935064,
 'eval_precision': 0.7762237762237763,
 'eval_recall': 0.7207792207792207,
 'eval_f1': 0.7474747474747475,
 'eval_runtime': 6.3702,
 'eval_samples_per_second': 48.35,
 'eval_steps_per_second': 6.122,
 'epoch': 1.0}

In [16]:
# Evaluate the fine-tuned RoBERTa model on test_dataset_roberta; the reported
# accuracy / precision / recall / f1 values come from compute_metrics.
trainer_roberta.evaluate()

***** Running Evaluation *****
  Num examples = 308
  Batch size = 8
100%|██████████| 39/39 [00:37<00:00,  1.03it/s]

<class 'transformers.trainer_utils.EvalPrediction'>





{'eval_loss': 0.6648804545402527,
 'eval_accuracy': 0.711038961038961,
 'eval_precision': 0.8350515463917526,
 'eval_recall': 0.525974025974026,
 'eval_f1': 0.6454183266932272,
 'eval_runtime': 38.7947,
 'eval_samples_per_second': 7.939,
 'eval_steps_per_second': 1.005,
 'epoch': 1.0}

In [19]:
# Evaluate the fine-tuned ALBERT model on test_dataset_albert; the reported
# accuracy / precision / recall / f1 values come from compute_metrics.
# NOTE(review): in the recorded run accuracy and precision are 0.5 with recall
# 1.0, i.e. every test sample was predicted as class 1 -- the model does not
# appear to have learned anything after one epoch; worth investigating.
trainer_albert.evaluate()

***** Running Evaluation *****
  Num examples = 308
  Batch size = 8
100%|██████████| 39/39 [00:04<00:00,  8.24it/s]

<class 'transformers.trainer_utils.EvalPrediction'>





{'eval_loss': 0.6928556561470032,
 'eval_accuracy': 0.5,
 'eval_precision': 0.5,
 'eval_recall': 1.0,
 'eval_f1': 0.6666666666666666,
 'eval_runtime': 4.9964,
 'eval_samples_per_second': 61.644,
 'eval_steps_per_second': 7.806,
 'epoch': 1.0}