In [1]:
from pathlib import Path

In [2]:
# Relative locations of the input data, the label file(s) and the output
# directory; all three are pathlib.Path objects.
DATA_PATH, LABEL_PATH, OUTPUT_DIR = map(Path, ("./data", "./labels", "./output"))

In [None]:
from fast_bert.data_cls import BertDataBunch

# Names of the label columns handed to the data bunch as `label_col`.
labels_list = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Data bunch for multi-label DistilBERT fine-tuning. Keyword arguments are
# grouped by concern; all values are literals, so their order is irrelevant.
databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    # input files
    train_file='train_smaller.csv',
    val_file='valid.csv',
    label_file='labels.csv',
    # columns
    text_col='comment_text',
    label_col=labels_list,
    # tokenizer / model
    model_type='distilbert',
    tokenizer='distilbert-base-cased',
    max_seq_length=512,
    # batching and task type
    batch_size_per_gpu=16,
    multi_gpu=False,
    multi_label=True,
)


In [None]:
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy, accuracy_thresh, roc_auc, fbeta
import logging
import torch

logger = logging.getLogger()
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

# Validation metrics. The learner is created with multi_label=True, so accuracy
# is measured with the thresholded per-label variant. Plain `accuracy` stays
# imported for compatibility but is not used.
# NOTE(review): plain `accuracy` appears to be the single-label (argmax) metric
# -- confirm against fast_bert.metrics.
metrics = [
    {'name': 'accuracy_thresh', 'function': accuracy_thresh},
    {'name': 'roc_auc', 'function': roc_auc},
    {'name': 'fbeta', 'function': fbeta},
]

learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path='distilbert-base-cased',
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=OUTPUT_DIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=False,
        # Mixed precision is only requested when running on CUDA, so this cell
        # also works when `device` fell back to the CPU above.
        is_fp16=(device.type == "cuda"),
        multi_label=True,
        logging_steps=0)

In [None]:
# Fine-tune for 5 epochs; validate=True evaluates the model after each epoch.
learner.fit(
    epochs=5,
    lr=6e-5,
    schedule_type="warmup_cosine",
    optimizer_type="lamb",
    validate=True,
)

In [None]:
# Save the fine-tuned model through the learner. No path is passed here;
# presumably it is written under the learner's output_dir -- TODO confirm.
learner.save_model()

In [None]:
# Two sample sentences to run through the in-memory learner.
texts = [
    'you motherfucker, i am going to kill you',
    'this is a nice comment, i love you so much',
]

# Batch prediction; the result is iterated per sentence in the next cell.
predictions = learner.predict_batch(texts)

In [None]:
# Walk sentences and their prediction lists together instead of keeping a
# manual index, printing each prediction entry on its own line.
for sentence, sentence_preds in zip(texts, predictions):
    print(f"Prediction for sentence: {sentence}:")
    for pred in sentence_preds:
        print(pred)
    print()

### Predicting with a saved model: the predictor object (for when the fine-tuned learner isn't in memory)

In [6]:
from fast_bert.prediction import BertClassificationPredictor

# Derive the model location from OUTPUT_DIR (defined in the config cell) so the
# two cannot drift apart; resolves to the same directory as "./output/model_out".
# Kept as a plain string, which is the type the original cell passed.
MODEL_PATH = str(OUTPUT_DIR / "model_out")

# Predictor built from the files on disk rather than from the in-memory learner.
predictor = BertClassificationPredictor(
            model_path=MODEL_PATH,
            label_path=LABEL_PATH, # location for labels.csv file
            multi_label=True,
            model_type='distilbert',
            do_lower_case=False)

# Single prediction
single_text = "i hate you"
single_prediction = predictor.predict(single_text)

# Batch predictions
multiple_texts = ['i am going to kill you',
         'this is a nice comment, i love you so much']

multiple_predictions = predictor.predict_batch(multiple_texts)

./output/model_out
<class 'str'>


In [7]:
# Header line, then one prediction entry per line.
print(f"prediction for sentence: {single_text}:")
for label_score in single_prediction:
    print(label_score)

prediction for sentence: i hate you:
('toxic', 0.7379467487335205)
('insult', 0.29986605048179626)
('obscene', 0.05977706238627434)
('identity_hate', 0.03005264513194561)
('threat', 0.021713832393288612)
('severe_toxic', 0.0054395017214119434)


In [8]:
# Walk sentences and their prediction lists together instead of keeping a
# manual index, printing each prediction entry on its own line.
for sentence, sentence_preds in zip(multiple_texts, multiple_predictions):
    print(f"Prediction for sentence: {sentence}:")
    for pred in sentence_preds:
        print(pred)
    print()

Prediction for sentence: i am going to kill you:
('toxic', 0.8307578563690186)
('insult', 0.3980618715286255)
('obscene', 0.10295048356056213)
('identity_hate', 0.056071922183036804)
('threat', 0.04543887451291084)
('severe_toxic', 0.010167397558689117)

Prediction for sentence: this is a nice comment, i love you so much:
('toxic', 0.0031685384456068277)
('obscene', 0.0009596769232302904)
('insult', 0.000811850477475673)
('identity_hate', 0.00038552848855033517)
('threat', 0.00035436335019767284)
('severe_toxic', 0.00014609545178245753)

