# GPU
Show information about the GPU attached to this runtime.

In [None]:
%%bash
# Show the driver/CUDA versions and the current state of the GPU attached to this runtime.
nvidia-smi

Sat Mar  5 02:58:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Install Packages
If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers, Datasets, wandb and prettytable. Run the following cell to install them.

In [None]:
!pip install transformers
!pip install datasets
!pip install wandb -qqq
!pip install prettytable

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 53.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

# Load Data
The data is hosted on my GitHub: https://github.com/Christmas-Wong

The dataset is built from Toutiao news. Each `text` is the title of a news article, e.g.:
```
[{'text': '前女友喷赫塞：儿子有需要，你却在买车和名牌衣服', 'label': 'news_sports'}, {'text': '以色列竟想策反俄罗斯支持打击伊朗，而且还要先下手为强！', 'label': 'news_military'}, {'text': '苹果爆料“差异定价”猫腻，原来App商家也挖坑！', 'label': 'news_tech'}]

```

In [None]:
!git clone https://github.com/Christmas-Wong/my_dataset.git

Cloning into 'my_dataset'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 9 (delta 0), reused 6 (delta 0), pack-reused 0[K
Unpacking objects: 100% (9/9), done.


In [None]:
# Recursively list the working directory to check which files the clone produced.
!ls -R

.:
my_dataset  sample_data

./my_dataset:
README.md  toutiao_news

./my_dataset/toutiao_news:
eval.json  test.json  train.json

./sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


# Fine Tuning
Fine-tune a BERT model to classify the news titles.

## Import Packages

In [None]:
import json
import torch
import wandb
import codecs
import pandas as pd
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from prettytable import PrettyTable
from transformers import(
    BertTokenizer,
    BertConfig,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    default_data_collator,
    DataCollatorWithPadding,
)
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Authenticate this session with Weights & Biases.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Init Wandb

In [None]:
# Open the W&B run that the training and evaluation below will log to.
wandb.init(
    name="mix_precision",  # change this for every new training run
    group="group_01",
    project="Colab_Mix_Precision",
)

[34m[1mwandb[0m: Currently logged in as: [33m8christmas8[0m (use `wandb login --relogin` to force relogin)


## Read Dataset
We will use the 🤗 Datasets library to load the data from the cloned JSON files. This is easily done with the `load_dataset` function.

In [None]:
# Directory that holds the three dataset splits inside the cloned repository.
DATA_DIR = "./my_dataset/toutiao_news"

# Map every split name to its JSON file.
data_files = {
    "train": f"{DATA_DIR}/train.json",
    "test": f"{DATA_DIR}/test.json",
    "valid": f"{DATA_DIR}/eval.json",
}
raw_datasets = load_dataset("json", data_files=data_files)
raw_datasets

Using custom data configuration default-82f7f928150b0439


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-82f7f928150b0439/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-82f7f928150b0439/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 42688
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5336
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 5336
    })
})

## Build Label_2_id

In [None]:
# Distinct labels of the training split, sorted so the mapping is the same on every run.
label_list = sorted(raw_datasets["train"].unique("label"))
num_labels = len(label_list)

# Forward (name -> id) and reverse (id -> name) lookup tables.
label_2_id = {name: idx for idx, name in enumerate(label_list)}
id_2_label = {idx: name for idx, name in enumerate(label_list)}

print(label_2_id)
print(id_2_label)

{'news_agriculture': 0, 'news_car': 1, 'news_culture': 2, 'news_edu': 3, 'news_entertainment': 4, 'news_finance': 5, 'news_game': 6, 'news_house': 7, 'news_military': 8, 'news_sports': 9, 'news_stock': 10, 'news_story': 11, 'news_tech': 12, 'news_travel': 13, 'news_world': 14}
{0: 'news_agriculture', 1: 'news_car', 2: 'news_culture', 3: 'news_edu', 4: 'news_entertainment', 5: 'news_finance', 6: 'news_game', 7: 'news_house', 8: 'news_military', 9: 'news_sports', 10: 'news_stock', 11: 'news_story', 12: 'news_tech', 13: 'news_travel', 14: 'news_world'}


## Load Pre_trained Model From Huggingface
Now that our data is ready, we can download the pretrained model and fine-tune it. Since our task is sentence classification, we use the `BertForSequenceClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (we can get it by counting the keys of `label_2_id`).



In [None]:
# Tokenizer, config and weights all come from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = "hfl/chinese-bert-wwm-ext"

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# The config carries the number of classes of our task.
bert_config = BertConfig.from_pretrained(PRETRAINED_CHECKPOINT, num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT, config=bert_config)

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at hfl/chinese-bert-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

## Preprocessing the data
Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers Tokenizer which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.

Meanwhile we should also convert the label from text to numbers

To do all of this, we instantiate our tokenizer with the BertTokenizer.from_pretrained method, which will ensure:

we get a tokenizer that corresponds to the model architecture we want to use,
we download the vocabulary used when pretraining this specific checkpoint.

We can then write the function that will preprocess our samples. We just feed them to the tokenizer with the arguments `truncation=True`, `max_length=128` and `padding="max_length"`. This ensures that an input longer than the chosen maximum length is truncated to it, and that shorter inputs are padded up to it.


To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the map method of our dataset object we created earlier. This will apply the function on all the elements of all the splits in dataset, so our training, validation and testing data will be preprocessed in one single command.

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of texts and convert its labels to integer ids.

    The label conversion is idempotent: labels that are already integers are
    kept, so the function can be mapped over a dataset that has been
    preprocessed before (e.g. when the notebook cell is run a second time).

    :param examples: Batch with a "text" list and a "label" list
    :return: Tokenizer output with an added "label" list of integer ids
    :raises KeyError: If a label is neither a known label name nor a valid id
    """
    # Pad/truncate every text to exactly 128 tokens so all rows have the same length.
    result = tokenizer(
        examples["text"],
        padding="max_length",
        max_length=128,
        truncation=True
    )

    # Map labels to IDs
    label_ids = []
    for item in examples["label"]:
        if isinstance(item, int):
            # -1 is passed through unchanged; any other integer must be an id
            # that label_2_id could have produced.
            if item != -1 and not 0 <= item < len(label_2_id):
                raise KeyError(item)
            label_ids.append(item)
        else:
            label_ids.append(label_2_id[item])
    result["label"] = label_ids
    return result

# Run the preprocessing over every split at once, one batch of examples per call.
raw_datasets = raw_datasets.map(
    function=preprocess_function,
    desc="Running tokenizer on dataset",
    load_from_cache_file=True,
    batched=True,
)

Running tokenizer on dataset:   0%|          | 0/43 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/6 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/6 [00:00<?, ?ba/s]

## Compute Metrics

The last thing to define for our Trainer is how to compute the metrics from the predictions. We need to define a function for this, which just uses the metrics from sklearn; the only preprocessing we have to do is to take the argmax of our predicted logits:

In [None]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    :param p: Pair of (predicted logits, true label ids)
    :return: Dict with the keys "accuracy", "precision", "recall" and "f1"
    """
    logits, labels = p
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(logits, axis=1)

    macro_scores = precision_recall_fscore_support(
        y_true=labels,
        y_pred=predicted,
        average="macro"
    )
    metrics = {"accuracy": accuracy_score(y_true=labels, y_pred=predicted)}
    # Only the first three values are reported; the fourth one is dropped.
    metrics.update(zip(("precision", "recall", "f1"), macro_scores[:3]))
    return metrics

## Training Arguments

To instantiate a Trainer, we will need to define two more things. The most important is the TrainingArguments, which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [None]:
# Everything that configures the training run.
training_args = TrainingArguments(
    output_dir="./mix_precisioin",
    seed=2022,
    # phases to run
    do_train=True,
    do_eval=True,
    do_predict=False,
    # optimisation
    learning_rate=1e-5,
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,  # if mix_precision_train is on, set this True, else False
    # evaluate and checkpoint on the same 5000-step schedule
    evaluation_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    load_best_model_at_end=True,
)
# The Trainer falls back to DataCollatorWithPadding when no collator is given,
# so pick the plain default collator when the inputs are already padded.
pad_to_max_length = "max_length"
if not pad_to_max_length:
    if training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None
else:
    data_collator = default_data_collator


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Here we evaluate and save a checkpoint every 5,000 steps, set the learning rate to 1e-5, use a batch size of 32, train for up to 20 epochs and enable mixed precision (`fp16=True`). Since the best model might not be the one at the end of training, we ask the Trainer to load the best checkpoint it saved at the end of training; because no `metric_for_best_model` is given, the validation loss decides which checkpoint is best.

## Train
Then we just need to pass all of this along with our datasets to the Trainer:

We can now finetune our model by just calling the train method:

In [None]:
# Only hand a split to the Trainer when the matching phase is switched on.
train_split = raw_datasets["train"] if training_args.do_train else None
eval_split = raw_datasets["valid"] if training_args.do_eval else None

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
trainer.train()

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 42688
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 26680
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
5000,0.4495,1.638402,0.563906,0.547883,0.545389,0.545361
10000,0.1854,2.079749,0.558471,0.546837,0.541075,0.542473
15000,0.1072,2.457842,0.560157,0.544977,0.539592,0.541445
20000,0.0867,2.714648,0.555097,0.534205,0.537818,0.535089
25000,0.0508,2.874787,0.557346,0.539265,0.532592,0.534653


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5336
  Batch size = 32
Saving model checkpoint to ./mix_precisioin/checkpoint-5000
Configuration saved in ./mix_precisioin/checkpoint-5000/config.json
Model weights saved in ./mix_precisioin/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in ./mix_precisioin/checkpoint-5000/tokenizer_config.json
Special tokens file saved in ./mix_precisioin/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num 

TrainOutput(global_step=26680, training_loss=0.2116598318005609, metrics={'train_runtime': 11265.9361, 'train_samples_per_second': 75.782, 'train_steps_per_second': 2.368, 'total_flos': 5.616497856577536e+16, 'train_loss': 0.2116598318005609, 'epoch': 20.0})

# Evaluate

## Define Inference function

In [None]:
def bert_inference(model, tokenizer, json_object: list, id_2_label: dict, max_length: int = 512):
    """Predict a label for every record and store the result on the record itself.

    Each element of ``json_object`` gains three keys: "topk" (all label names
    ordered from the highest to the lowest logit), "pred_code" (id of the best
    label, as a plain ``int``) and "pred_label" (name of the best label).

    :param model: Sequence classification model; moved to ``DEVICE`` and put in eval mode
    :param tokenizer: Tokenizer matching the model
    :param json_object: List of dicts that each contain a "text" entry; modified in place
    :param id_2_label: Mapping from label id to label name
    :param max_length: Texts with more tokens than this are truncated
    :return: None
    """
    model.to(DEVICE)
    model.eval()
    with torch.no_grad():
        for ele in tqdm(json_object):
            # One text per forward pass, so there is nothing to pad against;
            # padding each text up to max_length would only slow inference down.
            encoding = tokenizer(
                ele["text"],
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            output = model(
                encoding["input_ids"].to(DEVICE),
                token_type_ids=encoding["token_type_ids"].to(DEVICE),
                attention_mask=encoding["attention_mask"].to(DEVICE),
                return_dict=True
            )
            # Label ids ordered by descending logit.
            ranked_ids = torch.topk(output.logits.flatten(), len(id_2_label)).indices.tolist()
            ele["topk"] = [id_2_label[item] for item in ranked_ids]
            # Plain Python int (not numpy.int64) so the record stays JSON-serialisable.
            ele["pred_code"] = ranked_ids[0]
            ele["pred_label"] = ele["topk"][0]

## Define PrettyTable Function
Use PrettyTable to show the report

In [None]:
def pretty_report(report: dict):
    """Render a classification report dictionary as a PrettyTable.

    :param report: Report dict; every entry except "accuracy" is a dict with
        "precision", "recall", "f1-score" and "support"
    :return: PrettyTable with one row per entry of the report
    """
    metric_columns = ["precision", "recall", "f1-score", "support"]
    table = PrettyTable()
    table.field_names = ["Class"] + metric_columns
    for name, scores in report.items():
        if name != "accuracy":
            table.add_row([name] + [str(scores[column]) for column in metric_columns])
        else:
            # "accuracy" is a single number: it is shown in the f1-score column,
            # zeros fill precision/recall and the support of "weighted avg" is reused.
            table.add_row([name, 0, 0, scores, report["weighted avg"]["support"]])
    return table

## Define wandb Function
Send the evaluation data to the W&B server

In [None]:
def wandb_confusion(list_true: list, list_pred: list, labels: list):
    """Log a confusion matrix plot to W&B under the key "conf_mat".

    :param list_true: True class ids
    :param list_pred: Predicted class ids
    :param labels: Class names used to label the matrix
    :return: None
    """
    chart = wandb.plot.confusion_matrix(
        probs=None,
        y_true=list_true,
        preds=list_pred,
        class_names=labels
    )
    wandb.log({"conf_mat": chart})

def wandb_pr_recall_f1(evaluation_index: str, report: dict, labels: list):
    """Log a per-class bar chart of one metric to W&B.

    :param evaluation_index: Metric to plot; must be a key of every per-class entry of ``report``
    :param report: Report dict keyed by class name
    :param labels: Class names to include, in plotting order
    :return: None
    """
    rows = []
    for class_name in labels:
        rows.append([class_name, report[class_name][evaluation_index]])

    table = wandb.Table(data=rows, columns=["class_name", evaluation_index])
    chart = wandb.plot.bar(
        table,
        "class_name",
        evaluation_index,
        title=f"Per Class {evaluation_index}"
    )
    # The chart is logged under "<metric>_chart".
    wandb.log({f"{evaluation_index}_chart": chart})

In [None]:
def evaluate(model, tokenizer, json_object: list, id_2_label: dict, label_2_id: dict):
    """Run inference on labelled records and build the evaluation reports.

    The per-class precision/recall/F1 charts and the confusion matrix are also
    logged to W&B.

    :param model: Sequence classification model
    :param tokenizer: Tokenizer matching the model
    :param json_object: List of dicts with "text" and "label" entries; predictions are added in place
    :param id_2_label: Mapping from label id to label name
    :param label_2_id: Mapping from label name to label id
    :return: Tuple of (report as PrettyTable, report as text, confusion matrix)
    """
    bert_inference(model, tokenizer, json_object, id_2_label)
    # Ground truth is the annotated "label"; the prediction is the model's "pred_code".
    list_true = [label_2_id[ele["label"]] for ele in json_object]
    list_pred = [ele["pred_code"] for ele in json_object]

    # Pass the full label set explicitly so the reports and the matrix always
    # cover every class, even one that does not occur in this data.
    label_names = list(label_2_id.keys())
    label_ids = [label_2_id[name] for name in label_names]

    confusion = confusion_matrix(list_true, list_pred, labels=label_ids)

    report_dict = classification_report(
        list_true,
        list_pred,
        labels=label_ids,
        target_names=label_names,
        output_dict=True,
        digits=4)
    report_txt = classification_report(
        list_true,
        list_pred,
        labels=label_ids,
        target_names=label_names,
        output_dict=False,
        digits=4)
    report_table = pretty_report(report_dict)
    wandb_confusion(list_true, list_pred, label_names)
    for ele in ["precision", "recall", "f1-score"]:
        wandb_pr_recall_f1(ele, report_dict, label_names)

    return report_table, report_txt, confusion

In [None]:
def confusion_2_csv(matrix: list, labels: list, file: str = None) -> pd.DataFrame:
    """Turn a confusion matrix into a labelled DataFrame and optionally save it as CSV

    :param matrix: Confusion Matrix (one row and one column per label)
    :param labels: Labels, used both as column names and as the row index
    :param file: Target File (path or writable buffer); nothing is written when None
    :return: DataFrame whose index is named "labels"
    """
    df_confusion = pd.DataFrame(matrix)
    df_confusion.columns = labels
    # Set the index directly so that a class literally called "labels" cannot
    # collide with a temporary helper column.
    df_confusion.index = pd.Index(labels, name="labels")
    if file is not None:
        df_confusion.to_csv(file)
    return df_confusion

## Evaluate

In [None]:
def jsonl_reader(file: str) -> list:
    """Get Json_List from jsonl File

    :param file: Path of a UTF-8 encoded JSONL file (one JSON document per line)
    :return: List with the decoded object of every non-blank line, in file order
    """
    result = list()
    # Explicit UTF-8: the platform default encoding may not be able to decode the Chinese texts.
    with open(file, "r", encoding="utf-8") as f:
        # Iterate lazily instead of loading all lines at once.
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            result.append(json.loads(line))
    print("Read [{}] lines from JSONL File".format(len(result)))
    return result

# Score the fine-tuned model on the validation split.
valid_json = jsonl_reader(data_files["valid"])
report_table, report_txt, confusion = evaluate(model, tokenizer, valid_json, id_2_label, label_2_id)
df_confusion = confusion_2_csv(confusion, list(label_2_id))

# Print each result under its own heading.
for heading, content in (("report", report_table), ("confusion matrix", df_confusion)):
    print("\n" + heading)
    print(content)