In [71]:
# (Re)load the autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [3]:
#hide
from fastbook import *
from IPython.display import display,HTML

# NLP Deep Dive: RNNs

## Text Preprocessing

### Tokenization

### Word Tokenization with fastai

In [6]:
from fastai.text.all import *
# Fetch the IMDB dataset via `untar_data`; later cells use `path` as the dataset root.
path = untar_data(URLs.IMDB)

In [7]:
# Collect the text files found in the train, test and unsup folders under `path`.
files = get_text_files(path, folders = ['train', 'test', 'unsup'])

In [8]:
# Read the first review with `read_text`, which opens and closes the file for us
# (the previous `open().read()` never closed the handle), then preview 75 characters.
txt = files[0].read_text()
txt[:75]

'Soulless milking of cash cow franchise. Generic superhero flick. CGI showca'

In [9]:
# Create the word tokenizer; it is called on a collection of texts, so the
# sample review is wrapped in a list and `first` pulls out its token list.
spacy = WordTokenizer()
toks = first(spacy([txt]))
# Show only the first 30 tokens.
print(coll_repr(toks, 30))

(#432) ['Soulless','milking','of','cash','cow','franchise','.','Generic','superhero','flick','.','CGI','showcase','.','Gavin','Hood',"'s",'"','A','Series','of','Improbable','Events','.','"','Combinatoric','iteration','of','mutant','fight'...]


In [10]:
# Check how the word tokenizer splits an abbreviation and currency amounts.
first(spacy(['The U.S. dollar $1 is $1.00.']))

(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']

In [11]:
# Wrap the word tokenizer in `Tokenizer`; the recorded output shows the extra
# special tokens it produces (xxbos, xxmaj, xxup, ...) and lower-cased words.
tkn = Tokenizer(spacy)
print(coll_repr(tkn(txt), 31))

(#477) ['xxbos','xxmaj','soulless','milking','of','cash','cow','franchise','.','xxmaj','generic','superhero','flick','.','xxup','cgi','showcase','.','xxmaj','gavin','xxmaj','hood',"'s",'"','a','xxmaj','series','of','xxmaj','improbable','xxmaj'...]


In [12]:
# List the text-processing rule functions registered in fastai's defaults.
defaults.text_proc_rules

[<function fastai.text.core.fix_html(x)>,
 <function fastai.text.core.replace_rep(t)>,
 <function fastai.text.core.replace_wrep(t)>,
 <function fastai.text.core.spec_add_spaces(t)>,
 <function fastai.text.core.rm_useless_spaces(t)>,
 <function fastai.text.core.replace_all_caps(t)>,
 <function fastai.text.core.replace_maj(t)>,
 <function fastai.text.core.lowercase(t, add_bos=True, add_eos=False)>]

In [13]:
# Tokenize a short string containing an HTML entity, repeated spaces, a repeated
# character ("www") and upper-case text to see the rules in action.
coll_repr(tkn('&copy;   Fast.ai www.fast.ai/INDEX'), 31)

"(#11) ['xxbos','©','xxmaj','fast.ai','xxrep','3','w','.fast.ai','/','xxup','index'...]"

### Subword Tokenization

In [14]:
# Load the first 2,000 reviews as strings. `read_text` opens and closes each
# file, so no file handles are left dangling (unlike `open().read()`).
txts = L(o.read_text() for o in files[:2000])

In [15]:
def subword(sz):
    "Set up a `SubwordTokenizer` with vocab size `sz` on `txts`; return the first 40 subword tokens of `txt`, space-joined."
    tokenizer = SubwordTokenizer(vocab_sz=sz)
    tokenizer.setup(txts)
    # The tokenizer is applied to a one-element list; `first` extracts that single result.
    pieces = first(tokenizer([txt]))
    return ' '.join(pieces[:40])

In [16]:
# Subword tokenization with a vocabulary of 1,000 units.
subword(1000)

'▁So ul less ▁m il k ing ▁of ▁ca sh ▁co w ▁f ra n ch ise . ▁G en er ic ▁su per he ro ▁flick . ▁C G I ▁show ca se . ▁G a vi n ▁Ho'

In [17]:
# A much smaller vocabulary (200): the recorded output shows words broken into shorter pieces.
subword(200)

'▁S o ul le s s ▁ m i l k ing ▁of ▁ca s h ▁ c ow ▁f r an ch i se . ▁ G en er ic ▁ s u p er h er o ▁f'

In [18]:
# A larger vocabulary (10,000): the recorded output shows many whole words kept as single tokens.
subword(10000)

'▁So ul less ▁milk ing ▁of ▁cash ▁cow ▁franchise . ▁Gene r ic ▁superhero ▁flick . ▁CGI ▁showcase . ▁Ga vin ▁Hood \' s ▁" A ▁Se ries ▁of ▁I m p ro b able ▁Even t s ." ▁Co'

### Numericalization with fastai

In [19]:
# Tokenize the sample review once and reuse the result for the preview
# (the review was previously tokenized a second time just to print it).
toks = tkn(txt)
print(coll_repr(toks, 31))

(#477) ['xxbos','xxmaj','soulless','milking','of','cash','cow','franchise','.','xxmaj','generic','superhero','flick','.','xxup','cgi','showcase','.','xxmaj','gavin','xxmaj','hood',"'s",'"','a','xxmaj','series','of','xxmaj','improbable','xxmaj'...]


In [20]:
# Tokenize the first 200 reviews; this small corpus is used below to build the vocabulary.
toks200 = txts[:200].map(tkn)
toks200[0]

(#477) ['xxbos','xxmaj','soulless','milking','of','cash','cow','franchise','.','xxmaj'...]

In [21]:
# Set up the numericalizer on the 200 tokenized reviews, then preview the
# first 20 entries of the vocabulary it built.
num = Numericalize()
num.setup(toks200)
coll_repr(num.vocab,20)

"(#2272) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the','.',',','a','and','of','to','is','i','in','it'...]"

In [22]:
# Convert the sample review's tokens to vocabulary indices and keep the first 20.
nums = num(toks)[:20]; nums

tensor([   2,    8,    0,    0,   14, 1297,    0, 1628,   10,    8, 1629, 1630,  502,   10,    7, 1298,    0,   10,    8,    0])

In [23]:
# Map the indices back to tokens. Index 0 is 'xxunk' in the vocabulary, so words
# missing from the 200-review vocabulary presumably end up as xxunk here.
' '.join(num.vocab[o] for o in nums)

'xxbos xxmaj xxunk xxunk of cash xxunk franchise . xxmaj generic superhero flick . xxup cgi xxunk . xxmaj xxunk'

### Putting Our Texts into Batches for a Language Model

In [25]:
# Toy text used to illustrate how a token stream is laid out into language-model batches.
stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it for a while."
tokens = tkn(stream)
# Cut the stream into `bs` consecutive rows of `seq_len` tokens each.
bs,seq_len = 6,15
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
# Render the grid as an HTML table with the index hidden.
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
xxbos,xxmaj,in,this,chapter,",",we,will,go,back,over,the,example,of,classifying
movie,reviews,we,studied,in,chapter,1,and,dig,deeper,under,the,surface,.,xxmaj
first,we,will,look,at,the,processing,steps,necessary,to,convert,text,into,numbers,and
how,to,customize,it,.,xxmaj,by,doing,this,",",we,'ll,have,another,example
of,the,preprocessor,used,in,the,data,block,xxup,api,.,\n,xxmaj,then,we
will,study,how,we,build,a,language,model,and,train,it,for,a,while,.


In [26]:
# First chunk: the first `seq_len` (5) tokens of every row.
# The literal 15 is the row length of the 6x15 grid built in the previous cell.
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
xxbos,xxmaj,in,this,chapter
movie,reviews,we,studied,in
first,we,will,look,at
how,to,customize,it,.
of,the,preprocessor,used,in
will,study,how,we,build


In [27]:
# Second chunk: tokens seq_len..2*seq_len-1 (positions 5-9) of every 15-token row.
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
",",we,will,go,back
chapter,1,and,dig,deeper
the,processing,steps,necessary,to
xxmaj,by,doing,this,","
the,data,block,xxup,api
a,language,model,and,train


In [28]:
# Third (last) chunk: positions 10-14 of every 15-token row.
bs,seq_len = 6,5
# The offsets are written in terms of `seq_len` (2*5=10, 3*5=15), matching the
# cell for the second chunk, instead of the unexplained literals 10 and 15.
d_tokens = np.array([tokens[i*15+2*seq_len:i*15+3*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

0,1,2,3,4
over,the,example,of,classifying
under,the,surface,.,xxmaj
convert,text,into,numbers,and
we,'ll,have,another,example
.,\n,xxmaj,then,we
it,for,a,while,.


In [29]:
# Numericalize each of the 200 tokenized reviews.
nums200 = toks200.map(num)

In [30]:
# Build a language-model data loader over the numericalized reviews.
dl = LMDataLoader(nums200)

In [31]:
# Grab the first batch; per the recorded output, inputs and targets have the same shape.
x,y = first(dl)
x.shape,y.shape

(torch.Size([64, 72]), torch.Size([64, 72]))

In [32]:
# Decode the first 20 input tokens of the first row of the batch.
' '.join(num.vocab[o] for o in x[0][:20])

'xxbos xxmaj xxunk xxunk of cash xxunk franchise . xxmaj generic superhero flick . xxup cgi xxunk . xxmaj xxunk'

In [33]:
# Decode the matching targets; the recorded output is the input text shifted forward by one token.
' '.join(num.vocab[o] for o in y[0][:20])

'xxmaj xxunk xxunk of cash xxunk franchise . xxmaj generic superhero flick . xxup cgi xxunk . xxmaj xxunk xxmaj'

## Training a Text Classifier

### Language Model Using DataBlock

In [34]:
# File getter pre-configured with the three IMDB folders.
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

# Language-model DataLoaders built from the folder contents.
# RandomSplitter(0.1): presumably holds out a random 10% of the files for validation — confirm.
dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=64, seq_len=80)
# Alternative kept for reference (batch size 128 instead of 64):
# ).dataloaders(path, path=path, bs=128, seq_len=80)

In [35]:
# Show two rows of the language-model data (input text and its target text).
dls_lm.show_batch(max_n=2)

Unnamed: 0,text,text_
0,"xxbos * the whereabouts of xxmaj al xxmaj capone \n\n * who shot xxup jfk ? \n\n * cynthia xxmaj gibb lands the part of "" gypsy "" in the xxup tv remake \n\n xxmaj these are some of the great unsolved mysteries of the 20th century . xxmaj how else can i say it , except , i thought she was xxunk awful . xxmaj mannequin mannerisms , poor reactionary acting ( ie : that blank , stoic stare","* the whereabouts of xxmaj al xxmaj capone \n\n * who shot xxup jfk ? \n\n * cynthia xxmaj gibb lands the part of "" gypsy "" in the xxup tv remake \n\n xxmaj these are some of the great unsolved mysteries of the 20th century . xxmaj how else can i say it , except , i thought she was xxunk awful . xxmaj mannequin mannerisms , poor reactionary acting ( ie : that blank , stoic stare while"
1,"who have some really awkward faces and texture animation but hey when the game play and story is so good it does n't make it much of an issue . \n\n xxunk xxmaj xxunk keeps on improving and improving as the games go along and whiles for me personally i think "" silent xxmaj hill 2 "" contains the best music this is one is very close behind . xxmaj it still has all the weird sound effects mixed with","have some really awkward faces and texture animation but hey when the game play and story is so good it does n't make it much of an issue . \n\n xxunk xxmaj xxunk keeps on improving and improving as the games go along and whiles for me personally i think "" silent xxmaj hill 2 "" contains the best music this is one is very close behind . xxmaj it still has all the weird sound effects mixed with orchestral"


### Fine-Tuning the Language Model

In [36]:
# Language-model learner using the AWD_LSTM architecture, tracking accuracy and
# perplexity, and switched to fp16 via `to_fp16`.
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3, 
    metrics=[accuracy, Perplexity()]).to_fp16()

In [33]:
# torch.cuda.empty_cache()

In [None]:
# ! pip install psutil

In [None]:
# import psutil


In [None]:
# psutil.virtual_memory()

In [None]:
# Train for one epoch (learning-rate argument 2e-2); the recorded run took about two hours.
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.147646,3.928762,0.298387,50.84399,2:06:57


### Saving and Loading Models

In [39]:
# Save the learner under the name '1epoch' (the returned path shows where it was written).
learn.save('1epoch')

Path('/home/ec2-user/.fastai/data/imdb/models/1epoch.pth')

In [37]:
# Reload the '1epoch' checkpoint into the learner.
learn = learn.load('1epoch')

In [None]:
# Unfreeze the model and request 10 more epochs at a lower learning rate.
# NOTE(review): the recorded output lists only epochs 0-2, so this run appears to
# have been interrupted — confirm before relying on the saved 'finetuned' encoder.
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.955422,3.777591,0.317378,43.710625,2:14:06
1,3.816653,3.725461,0.323216,41.490364,2:14:00
2,3.759091,3.678164,0.328752,39.573662,2:13:46


In [41]:
# Save the encoder as 'finetuned'; the classifier section loads it back with `load_encoder`.
learn.save_encoder('finetuned')

### Text Generation

In [56]:
# Generate N_SENTENCES continuations of TEXT, asking for N_WORDS words each,
# with the `temperature` argument set to 0.75.
TEXT = "I like action comedy movies because"
N_WORDS = 40
N_SENTENCES = 2
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75) 
         for _ in range(N_SENTENCES)]

In [57]:
# Print the generated texts, separated by newlines.
print("\n".join(preds))

i like action comedy movies because it has it 's own genre that can be very funny . So i thought this movie is n't on the bad scale . 

 For those who like cheesy comedies , should n't watch this movie .
i like action comedy movies because they are fun , and the director is also very good . This movie for me does n't quite have the humor , but there is very little . The central character , Hank , is a


### Creating the Classifier DataLoaders

In [58]:
# Classifier DataLoaders: reuse the language model's vocabulary, label each
# review via `parent_label`, read only the train/test folders, and split with
# GrandparentSplitter using the 'test' folder as the validation set.
dls_clas = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab),CategoryBlock),
    get_y = parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test')
).dataloaders(path, path=path, bs=128, seq_len=72)

In [59]:
# Show three labelled reviews; the recorded output shows shorter reviews filled with xxpad tokens.
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,pos
2,xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad,neg


In [60]:
# Numericalize the first 10 tokenized reviews as a small sample.
nums_samp = toks200[:10].map(num)

In [61]:
# Lengths of the sample reviews; they vary widely (see the recorded output).
nums_samp.map(len)

(#10) [477,60,395,1696,306,222,151,131,1422,306]

In [62]:
# Text-classifier learner on the AWD_LSTM architecture, tracking accuracy, in fp16.
# Note: this rebinds `learn`, replacing the language-model learner.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, 
                                metrics=accuracy).to_fp16()

In [63]:
# Load the encoder saved earlier as 'finetuned' into the classifier.
learn = learn.load_encoder('finetuned')

### Fine-Tuning the Classifier

In [84]:
# First stage of classifier training: one epoch with learning-rate argument 2e-2.
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.301453,0.269929,0.89372,16:37


In [65]:
# Second stage: `freeze_to(-2)`, then one epoch with a learning-rate range
# (slice) from 1e-2/(2.6**4) up to 1e-2.
# NOTE(review): presumably this leaves only the last two parameter groups trainable — confirm.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.273572,0.167956,0.93716,09:16


In [None]:
# Third stage: `freeze_to(-3)`, then one epoch with a halved learning-rate range.
# NOTE(review): no metrics were recorded for this cell (the output has only a header row).
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time


In [67]:
# Final stage: unfreeze everything and train two epochs with a learning-rate
# range from 1e-3/(2.6**4) up to 1e-3.
learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.175708,0.14768,0.94552,16:46
1,0.165748,0.147538,0.94572,16:48


In [68]:
# Save the trained classifier under the name 'superfinetuned'.
learn.save('superfinetuned')

Path('/home/ec2-user/.fastai/data/imdb/models/superfinetuned.pth')

# HBA GPU Tools

In [73]:
# Install a pip package in the current Jupyter kernel.
# `sys.executable` is the interpreter running this kernel, so invoking pip through
# it installs into the kernel's own environment.
import sys
!{sys.executable} -m pip install nvidia-ml-py3

Processing /home/ec2-user/.cache/pip/wheels/7f/26/a3/33f2079871e2bebb3f53a2b21c3ec64129b8efdd18a6263a52/nvidia_ml_py3-7.352.0-py3-none-any.whl
Installing collected packages: nvidia-ml-py3
Successfully installed nvidia-ml-py3-7.352.0


## Jupyter Environments

In [89]:
# Show which `python` the shell resolves first.
!type python

python is /home/ec2-user/anaconda3/envs/JupyterSystemEnv/bin/python


In [90]:
# List every `python` found on the shell PATH (the recorded output shows three).
!type -a python

python is /home/ec2-user/anaconda3/envs/JupyterSystemEnv/bin/python
python is /home/ec2-user/anaconda3/bin/python
python is /usr/bin/python


In [None]:
# List every `pip` found on the shell PATH.
!type -a pip

In [92]:
# List the conda environments; the recorded output marks the active one with `*`.
!conda env list

# conda environments:
#
base                     /home/ec2-user/anaconda3
JupyterSystemEnv      *  /home/ec2-user/anaconda3/envs/JupyterSystemEnv
R                        /home/ec2-user/anaconda3/envs/R
amazonei_mxnet_p27       /home/ec2-user/anaconda3/envs/amazonei_mxnet_p27
amazonei_mxnet_p36       /home/ec2-user/anaconda3/envs/amazonei_mxnet_p36
amazonei_tensorflow2_p27     /home/ec2-user/anaconda3/envs/amazonei_tensorflow2_p27
amazonei_tensorflow2_p36     /home/ec2-user/anaconda3/envs/amazonei_tensorflow2_p36
amazonei_tensorflow_p27     /home/ec2-user/anaconda3/envs/amazonei_tensorflow_p27
amazonei_tensorflow_p36     /home/ec2-user/anaconda3/envs/amazonei_tensorflow_p36
chainer_p27              /home/ec2-user/anaconda3/envs/chainer_p27
chainer_p36              /home/ec2-user/anaconda3/envs/chainer_p36
mxnet_latest_p37         /home/ec2-user/anaconda3/envs/mxnet_latest_p37
mxnet_p27                /home/ec2-user/anaconda3/envs/mxnet_p27
mxnet_p36                /home/

In [96]:
# Show the fastai package as installed in the active conda environment.
!conda list fastai

# packages in environment at /home/ec2-user/anaconda3/envs/JupyterSystemEnv:
#
# Name                    Version                   Build  Channel
fastai                    2.0.16                   pypi_0    pypi


In [98]:
# Report the installed PyTorch version.
import torch 
print(torch.__version__)

1.6.0


## GPU Utilization Utils (Continued)

In [85]:
from pynvml import *
# Initialise NVML and report the memory counters of GPU 0 only.
nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(handle)
for label, amount in (("Total memory:", info.total),
                      ("Free memory:", info.free),
                      ("Used memory:", info.used)):
    print(label, amount)

Total memory: 11996954624
Free memory: 803667968
Used memory: 11193286656


In [None]:
# Relies on the `from pynvml import *` executed in an earlier cell.
nvmlInit()

# Print the memory counters of every GPU that NVML reports.
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    # Fix: look up device `i`. The handle used to be fetched for index 0 on every
    # iteration, so the same GPU was reported `deviceCount` times.
    handle = nvmlDeviceGetHandleByIndex(i)
    info = nvmlDeviceGetMemoryInfo(handle)
    # Label each group of lines so the per-device numbers can be told apart.
    print("Device", i)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)

In [75]:
# from pynvml import *
# (import left disabled: the NVML names are expected to come from an earlier cell)
# Note that nvmlInit() sits outside the try block, so an error raised by it is not caught below.
nvmlInit()
try:
    # Print the name of every GPU that NVML reports.
    deviceCount = nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        print("Device", i, ":", nvmlDeviceGetName(handle))
except NVMLError as error:
    # Print NVML errors instead of letting them propagate.
    print(error)

Device 0 : b'Tesla K80'
Device 1 : b'Tesla K80'
Device 2 : b'Tesla K80'
Device 3 : b'Tesla K80'
Device 4 : b'Tesla K80'
Device 5 : b'Tesla K80'
Device 6 : b'Tesla K80'
Device 7 : b'Tesla K80'


In [87]:
import nvidia_smi

nvidia_smi.nvmlInit()
# Card id 0 is hard-coded here; there is also a call to get all available card
# ids, so we could iterate over every card instead.
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)

# Utilisation rates for that card, printed as GPU and GPU-memory percentages.
res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
print(f'gpu: {res.gpu}%, gpu-mem: {res.memory}%')

gpu: 0%, gpu-mem: 0%


## Multi-GPU

In [97]:
# Report the installed PyTorch version.
# NOTE(review): this repeats the version check in the "Jupyter Environments" section.
import torch 
print(torch.__version__)

1.6.0


## GPU Mem Cleanup

In [88]:
import torch
# Empty PyTorch's CUDA memory cache.
torch.cuda.empty_cache()

In [78]:
# GPU memory currently held by PyTorch's caching allocator.
# `memory_cached` is deprecated in favour of the equivalent `memory_reserved`.
torch.cuda.memory_reserved()




3483369472

In [79]:
# GPU memory currently allocated by PyTorch (displayed as a raw integer; presumably bytes — confirm).
torch.cuda.memory_allocated()

1484178432

In [80]:
# Run Python's garbage collector; the displayed number is the value returned by `gc.collect()`.
import gc;
gc.collect()

668

# Multi GPU

In [100]:
# import torch 

# Quick survey of the CUDA setup as PyTorch sees it (relies on `torch` imported in an earlier cell).
print(torch.cuda.current_device())
# NOTE(review): this prints the repr of a `torch.cuda.device` object (see the
# recorded output), not a device index or name.
print(torch.cuda.device(0))
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())
# Repeats the first print of this cell.
print(torch.cuda.current_device())


1
<torch.cuda.device object at 0x7f9cdd126320>
8
Tesla K80
True
1


In [101]:
# List GPUs 0-7 in CUDA_VISIBLE_DEVICES.
# NOTE(review): the original author was not sure this is needed. The variable is
# normally read when CUDA initialises, so setting it this late (earlier cells have
# already used the GPU) may have no effect — confirm.
import os;
os.environ['CUDA_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7'

### Work on a Certain GPU

In [102]:
# Select GPU index 4 as the current CUDA device for subsequent PyTorch work.
torch.cuda.set_device(4)

## Disinformation and Language Models

## Conclusion

## Questionnaire

1. What is "self-supervised learning"?
1. What is a "language model"?
1. Why is a language model considered self-supervised?
1. What are self-supervised models usually used for?
1. Why do we fine-tune language models?
1. What are the three steps to create a state-of-the-art text classifier?
1. How do the 50,000 unlabeled movie reviews help us create a better text classifier for the IMDb dataset?
1. What are the three steps to prepare your data for a language model?
1. What is "tokenization"? Why do we need it?
1. Name three different approaches to tokenization.
1. What is `xxbos`?
1. List four rules that fastai applies to text during tokenization.
1. Why are repeated characters replaced with a token showing the number of repetitions and the character that's repeated?
1. What is "numericalization"?
1. Why might there be words that are replaced with the "unknown word" token?
1. With a batch size of 64, the first row of the tensor representing the first batch contains the first 64 tokens for the dataset. What does the second row of that tensor contain? What does the first row of the second batch contain? (Careful—students often get this one wrong! Be sure to check your answer on the book's website.)
1. Why do we need padding for text classification? Why don't we need it for language modeling?
1. What does an embedding matrix for NLP contain? What is its shape?
1. What is "perplexity"?
1. Why do we have to pass the vocabulary of the language model to the classifier data block?
1. What is "gradual unfreezing"?
1. Why is text generation always likely to be ahead of automatic identification of machine-generated texts?

### Further Research

1. See what you can learn about language models and disinformation. What are the best language models today? Take a look at some of their outputs. Do you find them convincing? How could a bad actor best use such a model to create conflict and uncertainty?
1. Given the limitation that models are unlikely to be able to consistently recognize machine-generated texts, what other approaches may be needed to handle large-scale disinformation campaigns that leverage deep learning?