In [1]:
from pandas import DataFrame
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
import torch
import os
from tqdm import tqdm
import pandas as pd
from typing import Counter
import pickle
from transformers import TrainingArguments
from transformers import Trainer
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from typing import Optional, List
import wandb
# Start a Weights & Biases run; the Trainer below is configured with report_to="wandb".
wandb.init(project="rdrop_demo", 
           name="bert_base",
           tags=["baseline"],
           group="bert")
# Expose only GPU index 0 to this process, and fall back to the CPU when CUDA is unavailable.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Input CSV (its `label` and `review` columns are used below) and the directory for all artifacts.
data_source_path = "/data/tmp/nlp-data/open_source_data/classification/ChnSentiCorp_htl_all.csv"
output_path = "/data/christmas.wang/project/project_demo/output/classification/"

[34m[1mwandb[0m: Currently logged in as: [33m8christmas8[0m (use `wandb login --relogin` to force relogin)


In [2]:
# Read the raw CSV and discard every row that has a missing value in any column.
data_source_df = pd.read_csv(data_source_path).dropna(how="any", axis=0)
data_source_df.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [3]:
# Count the records per label value to inspect the class balance.
x = data_source_df["label"].value_counts()
x

1    5322
0    2443
Name: label, dtype: int64

In [4]:
# Hold out 20% of the records as the test set; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(data_source_df, test_size=0.2, random_state=42, shuffle=True)
print("Train contains [{}] records & Test contains [{}] records".format(train_df.shape[0], test_df.shape[0]))

Train contains [6212] records & Test contain s [1553] records


In [5]:
def compute_metrics(p):
    """Turn raw model scores into accuracy / precision / recall / F1.

    Args:
        p: A pair that unpacks into ``(scores, labels)``. The predicted class is the
            arg-max of ``scores`` along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        precision, recall and F1 use ``average='binary'``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(y_true=labels, y_pred=predicted)}
    binary_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(y_true=labels, y_pred=predicted, average='binary')
    return metrics

In [6]:
class MultilabelTrainer(Trainer):
    """Trainer whose training loss adds a consistency term between two forward passes.

    Every training batch is sent through the model twice. The loss is the average
    cross entropy of both passes plus ``kl_weight`` times a symmetric KL divergence
    between the two sets of logits.

    NOTE(review): despite the class name nothing here is specific to multi-label
    classification; the name is kept so existing callers keep working.
    """

    # Weight of the KL consistency term relative to the cross-entropy term.
    # Choose it carefully; override on a subclass or instance to tune it.
    kl_weight = 4

    def compute_kl_loss(self, p, q):
        """Return the symmetric KL divergence between two batches of logits.

        Args:
            p: Logits of the first forward pass.
            q: Logits of the second forward pass, same shape as ``p``.

        Returns:
            Scalar tensor ``(KL(q || p) + KL(p || q)) / 2``, summed over all elements.
        """
        p_loss = F.kl_div(F.log_softmax(p, dim=-1), F.softmax(q, dim=-1), reduction='none')
        q_loss = F.kl_div(F.log_softmax(q, dim=-1), F.softmax(p, dim=-1), reduction='none')

        # You can choose whether to use function "sum" and "mean" depending on your task
        p_loss = p_loss.sum()
        q_loss = q_loss.sum()

        return (p_loss + q_loss) / 2

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy + KL consistency loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict passed to the model as keyword arguments; its
                ``labels`` entry is also used for the cross-entropy term.
            return_outputs: When true, also return the outputs of the first pass.
            num_items_in_batch: Accepted (and ignored) because newer transformers
                releases pass this argument to ``compute_loss``; without it the
                override raises ``TypeError`` on those versions.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        loss_fct = CrossEntropyLoss()

        outputs_1 = model(**inputs)
        logits_1 = outputs_1.get('logits')

        if not model.training:
            # Dropout is disabled in eval mode, so a second pass would only repeat
            # the same computation and the KL term would be (numerically) zero.
            loss = loss_fct(logits_1, labels)
            return (loss, outputs_1) if return_outputs else loss

        # Second pass over the same batch: dropout gives it a different mask.
        outputs_2 = model(**inputs)
        logits_2 = outputs_2.get('logits')

        # cross entropy loss for classifier
        ce_loss = 0.5 * (loss_fct(logits_1, labels) + loss_fct(logits_2, labels))
        kl_loss = self.compute_kl_loss(logits_1, logits_2)

        loss = ce_loss + self.kl_weight * kl_loss
        return (loss, outputs_1) if return_outputs else loss

In [7]:
def data_oversampling(x: Optional[List], y: Optional[List]):
    """Balance the classes by randomly repeating samples.

    Args:
        x: Samples; they are reshaped into a single-column 2-D array before resampling.
        y: Labels aligned with ``x``.

    Returns:
        ``(samples, labels)`` after ``RandomOverSampler(random_state=0).fit_resample``,
        with the samples unwrapped back into a flat list.
    """
    sampler = RandomOverSampler(random_state=0)
    column = np.array(x).reshape(-1, 1)
    resampled_x, resampled_y = sampler.fit_resample(column, y)
    # Undo the (-1, 1) reshape: keep the single value of every row.
    return [row[0] for row in resampled_x], resampled_y

In [8]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to an indexable collection of
            per-sample values; it must contain ``"input_ids"``.
        labels: Optional indexable collection of labels. ``None`` or an empty
            collection yields items without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when labels were supplied and are non-empty."""
        # A bare truthiness test raises for numpy arrays / tensors with more than
        # one element, so check for None and emptiness explicitly.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [9]:
def classification_report_csv(report, path):
    """Write the per-class rows of a text classification report to a CSV file.

    The report is expected to start with a header row, followed by a blank line,
    one row per class, and a blank line before any summary rows. Only the class
    rows are exported.

    Args:
        report: Text report whose rows end with four whitespace-separated fields:
            precision, recall, f1-score and support.
        path: Destination CSV path.

    Raises:
        ValueError: If a class row has fewer than four fields or a field is not numeric.
    """
    columns = ['class', 'precision', 'recall', 'f1_score', 'support']
    lines = report.splitlines()

    # Skip leading blank lines, the header row, and the blank separator after it.
    start = 0
    while start < len(lines) and not lines[start].strip():
        start += 1
    start += 1
    while start < len(lines) and not lines[start].strip():
        start += 1

    report_data = []
    for line in lines[start:]:
        if not line.strip():
            # The blank line ends the class section; summary rows follow it.
            break
        fields = line.split()
        if len(fields) < 4:
            raise ValueError("Unexpected classification report row: {!r}".format(line))
        # Parse from the right so class names of any length (even with spaces)
        # and support counts of any width are handled.
        precision, recall, f1, support = fields[-4:]
        report_data.append({
            'class': ' '.join(fields[:-4]),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1),
            'support': float(support),
        })

    # Explicit columns keep the CSV header stable even when no class rows were found.
    dataframe = pd.DataFrame(report_data, columns=columns)
    dataframe.to_csv(path, index=False)

In [28]:
def train(df: DataFrame, rdrop: bool = False):
    """Fine-tune a BERT sequence classifier on ``df`` and report on the global ``test_df``.

    Steps: tokenize the ``review`` column, label-encode the ``label`` column, hold
    out 20% of the rows for validation, train with early stopping, save the model
    under ``<output_path>/model_save``, then predict on the module-level ``test_df``
    and write/print a classification report.

    Args:
        df: Training records with ``review`` and ``label`` columns.
        rdrop: When true, train with ``MultilabelTrainer``; otherwise use the stock
            ``Trainer``.

    Side effects:
        Writes ``output_encoder.pkl``, the Trainer outputs, ``model_save/`` and
        ``report.csv`` under ``output_path``, and adds a ``label_id`` column to the
        global ``test_df``.
    """
    pretrained_path = '/data/tmp/christmas.wang/chinese_wwm_ext_pytorch/'

    # Define pretrained tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(pretrained_path, do_lower_case=False)
    model = BertForSequenceClassification.from_pretrained(pretrained_path, num_labels=2, hidden_dropout_prob=0.3)
    model = model.to(device)

    # ********************** Data Process **********************
    # Strip every review and drop the rows whose text is empty afterwards.
    reviews = df["review"].astype(str).str.strip()
    non_empty = (reviews.str.len() > 0).to_numpy()
    x = reviews[non_empty].tolist()
    y = df["label"][non_empty].tolist()

    # Label Encode
    le = LabelEncoder()
    le.fit(y)
    y = list(le.transform(y))
    # Persist the encoder so predictions can be mapped back to the original labels.
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "output_encoder.pkl"), 'wb') as encoder_file:
        pickle.dump(le, encoder_file)
    print(dict(Counter(y)))

    # Seeded so the train/validation split is the same on every run.
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train_tokenized = tokenizer(x_train, padding=True, truncation=True, max_length=512)
    x_val_tokenized = tokenizer(x_val, padding=True, truncation=True, max_length=512)
    train_dataset = Dataset(x_train_tokenized, y_train)
    val_dataset = Dataset(x_val_tokenized, y_val)

    # ********************** train **********************
    # Define Trainer
    args = TrainingArguments(
        report_to="wandb",
        output_dir=output_path,
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        num_train_epochs=6,
        seed=0,
        load_best_model_at_end=True,
    )

    # Build exactly one trainer; both classes take the same constructor arguments.
    trainer_cls = MultilabelTrainer if rdrop else Trainer
    trainer = trainer_cls(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()

    # The save directory must exist before weights, config and vocab are written.
    model_dir = os.path.join(output_path, "model_save")
    os.makedirs(model_dir, exist_ok=True)
    # Unwrap a parallel wrapper (which exposes the real model as `.module`) if present.
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), os.path.join(model_dir, "pytorch_model.bin"))
    model_to_save.config.to_json_file(os.path.join(model_dir, "config.json"))
    tokenizer.save_vocabulary(os.path.join(model_dir, "vocab.txt"))

    # ********************** Predict **********************
    # NOTE(review): reads and mutates the module-level `test_df`; kept for
    # compatibility with the existing notebook flow.
    x_test = list(test_df["review"])
    test_df['label_id'] = le.transform(test_df["label"].tolist())
    y_true = list(test_df['label_id'])
    x_test_tokenized = tokenizer(x_test, padding=True, truncation=True, max_length=512)

    # Create torch dataset
    test_dataset = Dataset(x_test_tokenized)

    # Reload the model that was just saved and wrap it in a default Trainer for inference.
    model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)
    # Define test trainer
    test_trainer = Trainer(model)

    # Make prediction
    raw_pred, _, _ = test_trainer.predict(test_dataset)

    # Preprocess raw predictions
    y_pred = np.argmax(raw_pred, axis=1)
    report = classification_report(y_true, y_pred)
    classification_report_csv(report, os.path.join(output_path, "report.csv"))
    print(report)

In [29]:
# Run with rdrop=False, i.e. train() uses the stock Trainer rather than MultilabelTrainer.
train(train_df, False)

Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/added_tokens.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/special_tokens_map.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/tokenizer_config.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/tokenizer.json. We won't load it.
loading file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/config.json
loading configuration file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 768,
  "initializer_range": 0.02

{1: 4274, 0: 1938}


PyTorch: setting up devices
***** Running training *****
  Num examples = 4969
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1866
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
