In [1]:
! pip install fast-bert



In [2]:
# GPU selection must be in the environment before CUDA is initialised.
# Setting it only after torch / apex / fastai have been imported risks the
# device list having been enumerated already, in which case it is ignored.
import os
import sys

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="4"  # specify which GPU(s) to be used

import numpy as np
import pandas as pd
import apex
from fast_bert.data import BertDataBunch
from fast_bert.learner import BertLearner
from fast_bert.metrics import accuracy, auc

from pathlib import Path
from typing import *

from sklearn.model_selection import train_test_split

import torch
import torch.optim as optim
from fastai import *
from fastai.vision import *
from fastai.text import *
from fastai.callbacks import *
# Kept after the wildcard imports so this name cannot be shadowed by them.
from sklearn.model_selection import  train_test_split

# Make the locally bundled pytorch-pretrained-BERT package importable.
package_dir_a = "../input/pytorch-pretrained-BERT/"
sys.path.insert(0, package_dir_a)


from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification


In [2]:


# Relative directory prefix shared by the dataset paths used further down.
# NOTE(review): the code visible in this notebook hard-codes '../input/...'
# and never reads this constant.
DATA_ROOT = '../input/'

class Config(dict):
    """Dictionary whose entries are also reachable as instance attributes.

    Every keyword passed to the constructor, and every pair stored through
    `set`, is written both as a dict item and as an attribute of the instance.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each mapping entry onto the instance itself.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store `val` under `key` as a dict item and as an attribute."""
        self[key] = val
        setattr(self, key, val)


# Run-wide settings. Within the code shown in this notebook only `max_lr`,
# `epochs`, `bs` and `max_seq_len` are read back; the remaining entries are
# not referenced anywhere visible.
config = Config(
    testing=False,
    bert_model_name="bert-base-uncased",
    max_lr=2e-5,  # passed as `max_lr` to `fit_one_cycle`
    epochs=3,  # first argument of `fit_one_cycle`
    use_fp16=True,
    bs=32,  # passed as `bs` when the databunch is built
    discriminative=False,
    max_seq_len=240,  # token budget per text, counting the [CLS] and [SEP] markers
)


# BERT tokenizer loaded from the local pretrained-model directory.
bert_tok = BertTokenizer.from_pretrained('../input/bert-pretrained-models/uncased_L-12_H-768_A-12/'
)


def _join_texts(texts:Collection[str], mark_fields:bool=False, sos_token:Optional[str]=BOS):
    """Concatenate the text fields of every row into one string (borrowed from fast.ai).

    Args:
        texts: 1-D collection of strings (a single field) or 2-D collection
            with one column per field.
        mark_fields: when true, every field is preceded by ``"{FLD} {n} "``,
            with `n` counting the fields from 1.
        sos_token: token placed in front of each joined string, followed by a
            space; `None` disables it.

    Returns:
        Array holding one joined string per row of `texts`.
    """
    if not isinstance(texts, np.ndarray): texts = np.array(texts)
    if is1d(texts): texts = texts[:,None]
    df = pd.DataFrame({i:texts[:,i] for i in range(texts.shape[1])})
    first_field = df[0].astype(str)
    # The marker needs its trailing space: without it the field number was
    # glued to the first word of the text ("{FLD} 1hello"), unlike the markers
    # of the later fields, which are surrounded by spaces.
    text_col = f'{FLD} 1 ' + first_field if mark_fields else first_field
    if sos_token is not None: text_col = f"{sos_token} " + text_col
    for i in range(1,len(df.columns)):
        # astype(str) keeps the concatenation working for non-string columns.
        text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df[i].astype(str)
    return text_col.values


class FastAiBertTokenizer(BaseTokenizer):
    """Adapter exposing a `BertTokenizer` through the fast.ai tokenizer interface."""

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        # Any extra keyword arguments are accepted and ignored.
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        """Return this very instance, whatever arguments are supplied."""
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t` and wrap the result in "[CLS]" ... "[SEP]".

        Only the first `max_seq_len - 2` tokens of the text are kept, which
        leaves room for the two marker tokens.
        """
        pieces = self._pretrained_tokenizer.tokenize(t)
        keep = self.max_seq_len - 2
        return ["[CLS]", *pieces[:keep], "[SEP]"]


# Load the training CSV, then hold out 1% of its rows as the validation frame.
# The split shuffles with a fixed random_state; `train` is rebound to the
# remaining 99%.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
train, val = train_test_split(train,test_size=0.01,shuffle=True, random_state=1024)

# test.csv from the same dataset directory.
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')


# fast.ai vocabulary built from the keys of the BERT tokenizer's vocab.
fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))

# fast.ai Tokenizer driven by the BERT wrapper, with empty pre- and post-rule lists.
fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), pre_rules=[], post_rules=[])

# Columns handed to the databunch as labels (six in total).
label_cols = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']



In [None]:
# Imported explicitly so this cell does not depend on `shutil` leaking in
# through one of the wildcard imports.
import shutil

from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam

# Translate model from tensorflow to pytorch
WORK_DIR = "../working/"
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_L-12_H-768_A-12/'
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
    BERT_MODEL_PATH + 'bert_config.json',
    WORK_DIR + 'pytorch_model.bin')

# Place the config next to the converted weights; WORK_DIR is the directory the
# classifier is later loaded from.
shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')

  from ._conv import register_converters as _register_converters


Building PyTorch model from configuration: {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

Converting TensorFlow checkpoint from /home/windywinter/JigSaw/input/bert-pretrained-models/uncased_L-12_H-768_A-12/bert_model.ckpt
Loading TF weight bert/embeddings/LayerNorm/beta with shape [768]
Loading TF weight bert/embeddings/LayerNorm/gamma with shape [768]
Loading TF weight bert/embeddings/position_embeddings with shape [512, 768]
Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 768]
Loading TF weight bert/embeddings/word_embeddings with shape [30522, 768]
Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_0/attention/output/LayerNor

Loading TF weight bert/encoder/layer_4/intermediate/dense/kernel with shape [768, 3072]
Loading TF weight bert/encoder/layer_4/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_4/output/LayerNorm/gamma with shape [768]
Loading TF weight bert/encoder/layer_4/output/dense/bias with shape [768]
Loading TF weight bert/encoder/layer_4/output/dense/kernel with shape [3072, 768]
Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_5/attention/output/LayerNorm/gamma with shape [768]
Loading TF weight bert/encoder/layer_5/attention/output/dense/bias with shape [768]
Loading TF weight bert/encoder/layer_5/attention/output/dense/kernel with shape [768, 768]
Loading TF weight bert/encoder/layer_5/attention/self/key/bias with shape [768]
Loading TF weight bert/encoder/layer_5/attention/self/key/kernel with shape [768, 768]
Loading TF weight bert/encoder/layer_5/attention/self/query/bias with shape [768

'../working/bert_config.json'

In [None]:
class BertTokenizeProcessor(TokenizeProcessor):
    """`TokenizeProcessor` configured with both `include_bos` and `include_eos` off."""

    def __init__(self, tokenizer):
        # The BERT tokenizer wrapper adds its own [CLS]/[SEP] markers.
        marker_flags = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_flags)


class BertNumericalizeProcessor(NumericalizeProcessor):
    """`NumericalizeProcessor` bound to a vocabulary built from `bert_tok`."""

    def __init__(self, *args, **kwargs):
        # The vocabulary is rebuilt from the module-level BERT tokenizer on
        # every construction; all other arguments are forwarded untouched.
        bert_vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=bert_vocab, **kwargs)


def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """Build the two preprocessing steps used for BERT inputs.

    The tokenizing step adds no sos/eos tokens, because the tokenizer wrapper
    inserts its own markers. The numericalizing step uses the given `vocab`,
    so ids line up with the original BERT model.

    Returns:
        A list `[tokenize_processor, numericalize_processor]`.
    """
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]


class BertDataBunch(TextDataBunch):
    """`TextDataBunch` variant whose `from_df` uses the BERT processors.

    Note: this definition rebinds the `BertDataBunch` name imported from
    `fast_bert.data` in the notebook's import cell.
    """

    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
                tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
                label_cols:IntsOrStrs=0, label_delim:str=None, **kwargs) -> DataBunch:
        """Create a databunch from train/validation (and optional test) DataFrames.

        `tokenizer` and `vocab` are forwarded to `get_bert_processor`. When
        `classes` is not given and `label_cols` lists more than one column,
        the column names are used as classes. `label_delim` is accepted but
        not used by this implementation. Remaining keyword arguments go to
        `databunch()`.
        """
        # Route any processor-specific keywords away from the databunch keywords.
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)

        if classes is None and is_listy(label_cols) and len(label_cols) > 1:
            classes = label_cols

        train_list = TextList.from_df(train_df, path, cols=text_cols, processor=processor)
        valid_list = TextList.from_df(valid_df, path, cols=text_cols, processor=processor)
        src = ItemLists(path, train_list, valid_list)

        if cls==TextLMDataBunch:
            src = src.label_for_lm()
        else:
            src = src.label_from_df(cols=label_cols, classes=classes)

        if test_df is not None:
            src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)


# Build the databunch from the train / validation / test frames, using the
# BERT-backed tokenizer and vocabulary created earlier in the notebook.
databunch = BertDataBunch.from_df(".", train, val, test,
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  text_cols="comment_text",
                  label_cols=label_cols,
                  bs=config.bs,
                  # pad_first=False / pad_idx=0: presumably pads at the end of each
                  # sequence with token id 0 — TODO confirm id 0 is BERT's [PAD].
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
             )


# Classifier loaded from the working directory that the conversion cell wrote
# to; num_labels=6 matches the six entries of `label_cols`.
bert_model =  BertForSequenceClassification.from_pretrained("../working/",cache_dir=None,num_labels=6)


# def custom_loss(data, targets):
#     ''' Define custom loss function for weighted BCE on 'target' column '''
#     bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:,1:2])(data[:,:1],targets[:,:1])
#     bce_loss_2 = nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:])
#     return (bce_loss_1 * loss_weight) + bce_loss_2

def custom_loss(y_pred, y_true):
    """Weighted BCE on the first column plus plain BCE on the following ones.

    Column 0 of `y_pred` is scored against column 0 of `y_true`, with column 1
    of `y_true` used as the per-element weight. After that, prediction column
    `1 + k` is scored, unweighted, against target column `2 + k` for every
    `k` in `range(y_true.shape[1] - 2)`. All terms are summed.
    """
    target_logits = y_pred[:, :1].reshape(-1)
    target_labels = y_true[:, :1].reshape(-1)
    sample_weights = y_true[:, 1:2].reshape(-1)
    total = F.binary_cross_entropy_with_logits(target_logits, target_labels, weight=sample_weights)

    aux_count = y_true.shape[1] - 2
    for k in range(aux_count):
        total = total + F.binary_cross_entropy_with_logits(y_pred[:, k + 1], y_true[:, k + 2])
    return total


# Train with a plain BCE-with-logits loss.
# NOTE(review): `custom_loss` defined above and `config.use_fp16` are not used here.
learner = Learner(databunch, bert_model, loss_func=torch.nn.BCEWithLogitsLoss())


learner.fit_one_cycle(config.epochs, max_lr=config.max_lr)

# The message printed below reads "start saving the model".
print("开始保存模型")

# Saves the whole model object (not only its state_dict) into the current directory.
torch.save(learner.model,'./fastai_bert_full_3_epoch.model')



epoch,train_loss,valid_loss,time
