In [1]:
import numpy as np
import pandas as pd
import argparse
import os
import re
import time
import glob
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn import preprocessing
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import plotly.graph_objects as go
from IPython.core.display import HTML
from utils import *

from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.train.automl.run import AutoMLRun
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference

from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer
import torch
import matplotlib.pyplot as plt

# Check core SDK version number
# print("SDK version:", azureml.core.VERSION)


In [2]:
import sys

# Make the helper module under "project" importable.
# The directory is resolved relative to the current working directory; when the
# working directory is already the project directory (a later cell changes into
# it), the current directory itself is used instead of a nested
# "project/project" path. The membership check keeps sys.path free of duplicate
# entries when this cell is re-run.
_cwd = os.getcwd()
_project_dir = _cwd if os.path.basename(_cwd) == "project" else os.path.join(_cwd, "project")
if _project_dir not in sys.path:
    sys.path.append(_project_dir)

from train_transformer import get_model, adjust_tokenizer, compute_metrics, get_encode_labels, tokenize_function, generate_tokenized_dataset, get_datasets

In [5]:
# --- Model and data configuration -------------------------------------------
base_checkpoint = "bert-base-uncased"   # pretrained checkpoint name handed to get_model
text_field_name = "TEXT_FINAL"          # column holding the input text
target_name = "target"                  # column holding the class label
batch_size = 16                         # used for both train and eval batch sizes

# --- Run-mode switches (1 = on, 0 = off); both are passed to get_datasets ----
is_test = 1
is_local = 1

# Test runs train for 2 epochs, regular runs for 3.
no_epochs = 2 if is_test else 3

In [6]:
# Change into the project directory, but only when not already inside it.
# Without the guard, re-running this cell would try to enter a nested
# "project/project" directory and fail.
_cwd = os.getcwd()
if os.path.basename(_cwd) != "project":
    os.chdir(os.path.join(_cwd, "project"))
os.getcwd()

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/hossein-nc6-ci/code/Users/hosarsha/og_classification/project'

In [7]:
# Load the four data splits; the run-mode flags are forwarded to get_datasets.
pdf_train, pdf_validation, pdf_test, pdf_temporal_test = get_datasets(is_local, is_test)

# Number of classes = number of distinct target values in the training split.
distinct_targets = pdf_train[target_name].unique()
num_labels = len(distinct_targets)
print(f'num_labels: {num_labels}')

# Build the classifier and tokenizer, then apply the project's tokenizer adjustments.
model, tokenizer = get_model(base_checkpoint, num_labels)
model, tokenizer = adjust_tokenizer(model, tokenizer)

# Label encoder derived from the training split's target column.
le = get_encode_labels(pdf_train, target_name)

fields = [text_field_name, target_name, 'labels']


def _tokenize_split(pdf):
    """Call generate_tokenized_dataset for one split using the shared settings.

    Returns whatever generate_tokenized_dataset returns; the callers below
    unpack it as a (dataset, tokenized dataset) pair.
    """
    return generate_tokenized_dataset(pdf, fields, le, target_name, text_field_name, tokenizer)


train_ds, tokenized_train_ds = _tokenize_split(pdf_train)
validation_ds, tokenized_validation_ds = _tokenize_split(pdf_validation)
test_ds, tokenized_test_ds = _tokenize_split(pdf_test)
temporal_test_ds, tokenized_temporal_test_ds = _tokenize_split(pdf_temporal_test)

the job is running locally
the job is a test job
pdf_train is imported wit "(2500, 2)" rows
pdf_validation is imported wit "(2500, 2)" rows
pdf_test is imported wit "(2500, 2)" rows
pdf_temporal_test is imported wit "(2500, 2)" rows
num_labels: 51


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [8]:
# Status message for the tokenization step performed in the previous cell.
print("Tokenized data is generated")

# Collator that pads each batch using the (adjusted) tokenizer.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

Tokenized data is generated


In [9]:
# Training hyper-parameters, gathered in one mapping before being handed to
# TrainingArguments.
# NOTE(review): eval_steps is presumably ignored while evaluation_strategy is
# "epoch" — confirm against the TrainingArguments documentation.
training_kwargs = dict(
    output_dir="outputs",
    evaluation_strategy="epoch",
    eval_steps=500,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=no_epochs,
    seed=0,
    load_best_model_at_end=True,
)
args = TrainingArguments(**training_kwargs)

# NOTE(review): a patience of 3 with only 2-3 training epochs looks like it can
# never trigger — confirm the intended value.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Trains on the tokenized training split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_validation_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [10]:
# Fine-tune the pretrained model; progress messages bracket the training call.
print('Training started')
trainer.train()
print('Training is finished')

Training started
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Trainer is attempting to log a value of "{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6', 7: 'LABEL_7', 8: 'LABEL_8', 9: 'LABEL_9', 10: 'LABEL_10', 11: 'LABEL_11', 12: 'LABEL_12', 13: 'LABEL_13', 14: 'LABEL_14', 15: 'LABEL_15', 16: 'LABEL_16', 17: 'LABEL_17', 18: 'LABEL_18', 19: 'LABEL_19', 20: 'LABEL_20', 21: 'LABEL_21', 22: 'LABEL_22', 23: 'LABEL_23', 24: 'LABEL_24', 25: 'LABEL_25', 26: 'LABEL_26', 27: 'LABEL_27', 28: 'LABEL_28', 29: 'LABEL_29', 30: 'LABEL_30', 31: 'LABEL_31', 32: 'LABEL_32', 33: 'LABEL_33', 34: 'LABEL_34', 35: 'LABEL_35', 36: 'LABEL_36', 37: 'LABEL_37', 38: 'LABEL_38', 39: 'LABEL_39', 40: 'LABEL_40', 41: 'LABEL_41', 42: 'LABEL_42', 43: 'LABEL_43', 44: 'LABEL_44', 45: 'LABEL_45', 46: 'LABEL_46', 47: 'LABEL_47', 48: 'LABEL_48', 49: 'LABEL_49', 50: 'LABEL_50'}" for key "id2label" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute.
Trainer is attempting to 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Recall Weighted,Precision Weighted,F1 Weighted,Runtime,Samples Per Second
1,No log,2.199767,0.534455,0.033343,0.053389,0.038959,0.534455,0.318269,0.39653,21.2573,117.418
2,No log,2.047934,0.56891,0.110769,0.09951,0.089693,0.56891,0.393976,0.449391,21.321,117.068


  _warn_prf(average, modifier, msg_start, len(result))


Attempted to log scalar metric eval_loss:
2.1997668743133545
Attempted to log scalar metric eval_accuracy:
0.5344551282051282
Attempted to log scalar metric eval_precision:
0.033342896199472206
Attempted to log scalar metric eval_recall:
0.05338930421811698
Attempted to log scalar metric eval_f1:
0.038959419892849796
Attempted to log scalar metric eval_recall_weighted:
0.5344551282051282
Attempted to log scalar metric eval_precision_weighted:
0.3182694635194428
Attempted to log scalar metric eval_f1_weighted:
0.3965299591993869
Attempted to log scalar metric eval_runtime:
21.2573
Attempted to log scalar metric eval_samples_per_second:
117.418
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric eval_loss:
2.04793381690979
Attempted to log scalar metric eval_accuracy:
0.5689102564102564
Attempted to log scalar metric eval_precision:
0.11076915248920266
Attempted to log scalar metric eval_recall:
0.09950983119111605
Attempted to log scalar metric eval_f1:
0.0896932753

In [55]:
def test_model(trainer, ds, prefix):
    test_result = trainer.predict(tokenized_test_ds)

    metrics = test_result.metrics.keys()
    # print(f'len(metrics): {metrics}')
    
    for m in metrics:
        print(f'{prefix}_{m.replace("test_", "")}', f'{test_result.metrics[m]}')


In [56]:
# Report metrics for the regular test split.
test_model(trainer=trainer, ds=tokenized_test_ds, prefix='test')

test_loss 1.9834322929382324
test_accuracy 0.5886354541816726
test_precision 0.10803025391935897
test_recall 0.09252521854691055
test_f1 0.08474615273562676
test_recall_weighted 0.5886354541816726
test_precision_weighted 0.4158531557489751
test_f1_weighted 0.4722916259768107
test_runtime 21.2984
test_samples_per_second 117.333
test_mem_cpu_alloc_delta 0
test_mem_gpu_alloc_delta 0
test_mem_cpu_peaked_delta 0
test_mem_gpu_peaked_delta 579536896


In [57]:
# Report metrics for the temporal test split.
test_model(trainer=trainer, ds=tokenized_temporal_test_ds, prefix='temporal_test')

temporal_test_loss 1.9834322929382324
temporal_test_accuracy 0.5886354541816726
temporal_test_precision 0.10803025391935897
temporal_test_recall 0.09252521854691055
temporal_test_f1 0.08474615273562676
temporal_test_recall_weighted 0.5886354541816726
temporal_test_precision_weighted 0.4158531557489751
temporal_test_f1_weighted 0.4722916259768107
temporal_test_runtime 21.4312
temporal_test_samples_per_second 116.606
temporal_test_mem_cpu_alloc_delta 0
temporal_test_mem_gpu_alloc_delta 0
temporal_test_mem_cpu_peaked_delta 0
temporal_test_mem_gpu_peaked_delta 579536896


In [1]:
# Evaluate the trainer; `trainer` must already exist in this kernel, i.e. the
# trainer-construction cell has to run first (on a fresh kernel this raises NameError).
trainer.evaluate()

NameError: name 'trainer' is not defined

In [42]:
# Keep the prediction output for the test split so the following cells can inspect it.
a = trainer.predict(tokenized_test_ds)

# Alternative input for the call above: tokenized_temporal_test_ds

In [12]:
# Show the type of the object returned by trainer.predict.
type(a)

transformers.trainer_utils.PredictionOutput

In [43]:
# Show the metrics mapping attached to the prediction output.
a.metrics

{'test_loss': 1.9834322929382324,
 'test_accuracy': 0.5886354541816726,
 'test_precision': 0.10803025391935897,
 'test_recall': 0.09252521854691055,
 'test_f1': 0.08474615273562676,
 'test_recall_weighted': 0.5886354541816726,
 'test_precision_weighted': 0.4158531557489751,
 'test_f1_weighted': 0.4722916259768107,
 'test_runtime': 21.3746,
 'test_samples_per_second': 116.914,
 'test_mem_cpu_alloc_delta': 0,
 'test_mem_gpu_alloc_delta': 0,
 'test_mem_cpu_peaked_delta': 0,
 'test_mem_gpu_peaked_delta': 579536896}

In [None]:
# Persist the trained model under outputs/model.
trainer.save_model(output_dir="outputs/model")

# Store the label encoder next to the model, using joblib's highest compression level.
joblib.dump(
    value=le,
    filename='outputs/model/labelEncoder.joblib',
    compress=9,
)
print('Model and encoder are saved')

# Final evaluation pass; the returned metrics are not captured here.
print('Evaluation is started')
trainer.evaluate()
print('Evaluation is completed')

In [2]:
# Obtain the Azure ML workspace handle via Workspace.from_config().
# NOTE(review): presumably this reads a saved workspace config file — confirm
# where that file is expected to live for this project.
from azureml.core import Workspace
ws = Workspace.from_config()