In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";   

In [None]:
!pip install ktrain

# Text Classification with Hugging Face Transformers

## Load Data Into Arrays

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to these four newsgroups. The list must be passed to
# fetch_20newsgroups explicitly; otherwise all 20 groups are downloaded and
# the classifier is trained on 20 classes instead of 4.
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

# random_state fixes the shuffle order so example indices are reproducible.
train_b = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
test_b = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

print('size of training set: %s' % (len(train_b['data'])))
print('size of validation set: %s' % (len(test_b['data'])))
print('classes: %s' % (train_b.target_names))

# Raw documents and their integer labels (indices into train_b.target_names).
x_train = train_b.data
y_train = train_b.target
x_test = test_b.data
y_test = test_b.target

## STEP 1: Preprocess Data and Build a Transformer Model

For `MODEL_NAME`, *ktrain* supports both the "official" built-in models [available here](https://huggingface.co/transformers/pretrained_models.html) and the [community-uploaded models available here](https://huggingface.co/models).

In [None]:
import ktrain
from ktrain import text

# Name of the pretrained Hugging Face checkpoint to fine-tune.
MODEL_NAME = 'distilbert-base-uncased'

# The Transformer preprocessor is built from the model name, a maximum
# sequence length and the label names of the training set.
t = text.Transformer(
    MODEL_NAME,
    maxlen=500,
    class_names=train_b.target_names,
)

# Preprocess the raw documents/labels for training and validation.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)

# Build the classifier and wrap it, together with the data, in a learner.
model = t.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=6,
)

## STEP 2 [Optional]: Estimate a Good Learning Rate

Learning rates between `2e-5` and `5e-5` tend to work well with transformer models based on papers from Google. However, we will run our learning-rate-finder for two epochs to estimate the LR on this particular dataset.

As shown below, our results are consistent with Google's findings.

In [None]:
# Run the learning-rate finder for at most two epochs and plot the result.
# NOTE(review): this is a training-scale run and can take a long time.
learner.lr_find(show_plot=True, max_epochs=2)

## STEP 3: Train Model

Train using a [1cycle learning rate schedule](https://arxiv.org/pdf/1803.09820.pdf).

In [None]:
# Train for 4 epochs with a 1cycle schedule. The learning rate is taken from
# the 2e-5 to 5e-5 range that STEP 2 reports as working well for transformer
# models; a value orders of magnitude smaller (e.g. 8e-7) contradicts that
# guidance and is likely to leave the model under-trained after 4 epochs.
learner.fit_onecycle(5e-5, 4)

## STEP 4: Evaluate/Inspect Model

In [None]:
# Evaluate the trained model, labelling results with the preprocessor's
# class names (presumably on the val_data given to get_learner -- confirm).
learner.validate(class_names=t.get_classes())

In [None]:
# Show the single example (n=1) the model got most wrong; the preprocessor
# `t` is passed via `preproc`.
learner.view_top_losses(n=1, preproc=t)

In [None]:
# Print the raw text of test document 521 to see why the mistake is
# understandable.
# NOTE(review): 521 is hard-coded; presumably it is the id reported by
# view_top_losses in an earlier run -- confirm it still matches after retraining.
print(x_test[521])

## STEP 5: Make Predictions on New Data in Deployment

In [None]:
# Bundle the trained model with its preprocessor `t` into a predictor that
# is called on raw text in the cells below.
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
# Sample input for the prediction cells below.
# NOTE(review): this rebinds `text`, shadowing the `ktrain.text` module
# imported earlier; re-run the import cell before using `text.Transformer` again.
text= 'Hello, I am having development flaw, that needs to changed'

In [None]:
# Predict for the sample text; the result is displayed as the cell output.
predictor.predict(text)

In [None]:
# Ask the predictor to explain its prediction for the sample text.
# NOTE(review): may require an additional explanation package -- confirm against the ktrain docs.
predictor.explain(text)

In [None]:
# Save the predictor to disk so it can be reloaded without retraining.
# NOTE(review): hard-coded path under /tmp -- not portable and may not persist.
predictor.save('/tmp/req_predictor')

In [None]:
# Reload the predictor from the same path it was saved to.
reloaded_predictor = ktrain.load_predictor('/tmp/req_predictor')

In [None]:
# Sanity check: the reloaded predictor should give the same prediction as before.
reloaded_predictor.predict(text)

In [None]:
# Per-class scores for the sample text (presumably ordered as in get_classes() -- confirm).
reloaded_predictor.predict_proba(text)

In [None]:
# List the class names known to the reloaded predictor.
reloaded_predictor.get_classes()