In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import * 
import fastai
import pandas as pd
import torch

In [3]:
# Root folder for this notebook: the CSV files are read from it and it is
# handed to the fastai data bunch constructors below.
path = Path('hushed')

# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [4]:
# Load the training and validation splits from CSV files under `path`.
# `path` is a pathlib.Path, so the `/` operator builds the file locations
# directly instead of formatting them into strings.
train_df = pd.read_csv(path / 'texts_train.csv')
validation_df = pd.read_csv(path / 'texts_valid.csv')

### Language Model

Create the data bunches that will be used to train our models.

In [5]:
# Language-model data bunch: built first because its vocabulary is reused below.
data_lm = TextLMDataBunch.from_df(path, train_df, validation_df)

# Classifier data bunch. The language-model vocabulary is passed explicitly so
# that token ids seen by the classifier are guaranteed to match the ids the
# fine-tuned encoder ('ft_enc') was trained with. Without `vocab=`, a second
# vocabulary is rebuilt from scratch and only lines up with the encoder's
# embedding rows if both constructions happen to produce the same ordering.
data_clas = TextClasDataBunch.from_df(
    path,
    train_df,
    validation_df,
    vocab=data_lm.train_ds.vocab,
)

# data_lm.save()
# data_clas.save()

Show some of the texts

In [6]:
# [data_lm.train_ds[i][0] for i in range(0, 10)]

Create our language model with a pre-trained model.

In [7]:
# Language-model learner over `data_lm`, initialised from the pre-trained
# model referenced by URLs.WT103_1, with dropout scaled by drop_mult.
learn = language_model_learner(
    data_lm,
    pretrained_model=URLs.WT103_1,
    drop_mult=0.5,
)

Find our learning rate

In [8]:
# lr_find(learn)
# learn.recorder.plot()

Train the layers which are sitting on top of the pre-trained model.

In [9]:
# Train for 10 epochs with an explicit list of four learning rates.
# NOTE(review): presumably one rate per layer group of the learner, with only
# the unfrozen groups actually being updated at this stage — confirm against
# the fastai version in use.
learn.fit(10, lr=[1e-1, 1e-1, 1e-2, 1e-2])

epoch,train_loss,valid_loss,accuracy
1,5.494576,4.511455,0.229018
2,4.867121,4.077989,0.254777
3,4.478685,3.893178,0.273304
4,4.204893,3.791157,0.282857
5,3.996842,3.720086,0.293080
6,3.820628,3.667747,0.299152
7,3.684815,3.630629,0.307634
8,3.560889,3.606210,0.312768
9,3.461468,3.581243,0.317411
10,3.376632,3.570024,0.320759


Unfreeze the pre-trained model and train the entire stack

In [10]:
# Make every layer trainable, then fine-tune the whole language model for
# 8 epochs with the one-cycle schedule (peak learning rate 1e-3, momentum
# range given by `moms`).
learn.unfreeze()
learn.fit_one_cycle(8, 1e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,3.132373,3.546133,0.327054
2,3.081509,3.494530,0.332902
3,3.015216,3.466767,0.337366
4,2.920486,3.429966,0.347634
5,2.823235,3.416904,0.352455
6,2.733496,3.408597,0.354821
7,2.661086,3.411832,0.357768
8,2.601935,3.407901,0.358839


In [11]:
# Persist the fine-tuned encoder under the name 'ft_enc'; the classifier
# learner loads it later with learn.load_encoder('ft_enc').
learn.save_encoder('ft_enc')

In [12]:
# Sanity check of the language model: continue the prompt with n_words=2
# additional generated tokens.
learn.predict("verification code", n_words=2)

'verification code ( on'

### Classifier

Create the text classifier model, which will use the language model's encoder to read the texts.

In [13]:
# Rebind `learn` to a classifier learner over `data_clas` (the language-model
# learner is no longer reachable through this name from here on).
learn = text_classifier_learner(data_clas, drop_mult=0.5)
# Load the encoder weights saved earlier as 'ft_enc'.
learn.load_encoder('ft_enc')
# Freeze before the first training stage.
# NOTE(review): presumably leaves only the last layer group trainable — confirm.
learn.freeze()

In [14]:
# data_clas.show_batch(rows=15)

In [15]:
# learn.lr_find()
# learn.recorder.plot()

Progressively train then unfreeze our classifier layers

In [16]:
# Stage 1: one one-cycle epoch at peak learning rate 2e-2 on the frozen learner.
learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,0.228391,0.062690,0.980392


In [17]:
# Stage 2: freeze_to(-2), then one epoch with a learning-rate slice running
# from 1e-2/(2.6**4) up to 1e-2.
# NOTE(review): presumably this unfreezes the last two layer groups and the
# slice spreads the rates across groups (lowest for the earliest) — confirm.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,0.127746,0.055450,0.983333


In [18]:
# Stage 3: freeze_to(-3), then one epoch with the peak learning rate halved
# to 5e-3 (same 2.6**4 ratio between the ends of the slice).
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,0.073480,0.042131,0.984314


In [19]:
# Stage 4: unfreeze everything and train 2 epochs with the peak learning rate
# lowered again to 1e-3 (same 2.6**4 ratio between the ends of the slice).
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy
1,0.053236,0.033660,0.985294
2,0.061736,0.034653,0.988235


In [20]:
# learn.save('val-codes-000')

### Predictions

In [21]:
# Ordinary sentence that merely mentions the word "verification".
learn.predict("Can you send me the verification on those documents")

(Category good, tensor(0), tensor([0.9949, 0.0034, 0.0017]))

In [22]:
# Message in Cyrillic script that contains a numeric code.
learn.predict("VK: 60679 - код для создания нового приложения.")

(Category verification,
 tensor(2),
 tensor([1.0273e-06, 2.8317e-06, 1.0000e+00]))

In [23]:
# Short message of the form "<name> verification code: <digits>".
learn.predict("imo verification code: 3374")

(Category verification,
 tensor(2),
 tensor([1.9317e-05, 8.3473e-05, 9.9990e-01]))

In [24]:
# Variant where the digits come first and the wording follows in parentheses.
learn.predict("8866 (WeChat Verification Code)")

(Category verification,
 tensor(2),
 tensor([1.7202e-08, 3.6625e-07, 1.0000e+00]))

In [25]:
# Promotional prize-claim style text.
# NOTE(review): the U+FFFD replacement character before "1000" looks like a
# mis-decoded currency symbol — confirm against the source data before
# changing it, since the string is the model input.
learn.predict("Urgent UR awarded a complimentary trip to EuroDisinc Trav, Aco&Entry41 Or �1000. To claim txt DIS to 87121")

(Category spam, tensor(1), tensor([2.2232e-03, 9.9777e-01, 5.8124e-06]))

### Export the model

In [26]:
# Export the trained classifier learner for later use outside this notebook.
# NOTE(review): no file name is given, so fastai's default export location is
# used — presumably under the learner's path; confirm.
learn.export()