# Package Installation

In [1]:
!git clone https://github.com/devkosal/fastai_roberta.git

Cloning into 'fastai_roberta'...
remote: Enumerating objects: 171, done.[K
remote: Counting objects: 100% (171/171), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 171 (delta 91), reused 111 (delta 44), pack-reused 0[K
Receiving objects: 100% (171/171), 25.46 MiB | 18.58 MiB/s, done.
Resolving deltas: 100% (91/91), done.


In [2]:
!pip install fastai==1.0.60 transformers==2.3.0

Collecting fastai==1.0.60
[?25l  Downloading https://files.pythonhosted.org/packages/f5/e4/a7025bf28f303dbda0f862c09a7f957476fa92c9271643b4061a81bb595f/fastai-1.0.60-py3-none-any.whl (237kB)
[K     |████████████████████████████████| 245kB 4.7MB/s 
[?25hCollecting transformers==2.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 7.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 7.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |█████████████████████

# Load And Set Configuration

In [3]:
from fastai.text import *
from fastai.metrics import *
from transformers import RobertaTokenizer

In [4]:
# Creating a config object to store task specific information
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=True,
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchnaged with roberta-large 
    max_lr=1e-5,
    epochs=1,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 2,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
)

In [5]:
df = pd.read_csv("fastai_roberta/fastai_roberta_imdb/imdb_dataset.csv")

In [6]:
if config.testing: df = df[:5000]
print(df.shape)

(5000, 2)


In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
feat_cols = "review"
label_cols = "sentiment"

# Setting Up the Tokenizer

In [9]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        return [config.start_tok] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + [config.end_tok]

In [10]:
# create fastai tokenizer for roberta
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])

In [11]:
# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

with open('vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))

In [12]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), RobertaNumericalizeProcessor(vocab=vocab)]

# Setting up the DataBunch

In [13]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

In [14]:
class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

In [15]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

In [16]:
# creating our databunch 
data = RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(seed=config.seed) \
    .label_from_df(cols=label_cols,label_cls=CategoryList) \
    .databunch(bs=config.bs, pad_first=False, pad_idx=0)

In [17]:
data

RobertaDataBunch;

Train: LabelList (4000 items)
x: RobertaTextList
<s> One Ġof Ġthe Ġother Ġreviewers Ġhas Ġmentioned Ġthat Ġafter Ġwatching Ġjust Ġ1 ĠOz Ġepisode Ġyou 'll Ġbe Ġhooked . ĠThey Ġare Ġright , Ġas Ġthis Ġis Ġexactly Ġwhat Ġhappened Ġwith Ġme .< br Ġ/ >< br Ġ/> The Ġfirst Ġthing Ġthat Ġstruck Ġme Ġabout ĠOz Ġwas Ġits Ġbrutality Ġand Ġunfl inch ing Ġscenes Ġof Ġviolence , Ġwhich Ġset Ġin Ġright Ġfrom Ġthe Ġword ĠGO . ĠTrust Ġme , Ġthis Ġis Ġnot Ġa Ġshow Ġfor Ġthe Ġfaint Ġheart ed Ġor Ġtimid . ĠThis Ġshow Ġpulls Ġno Ġpunches Ġwith Ġregards Ġto Ġdrugs , Ġsex Ġor Ġviolence . ĠIts Ġis Ġhardcore , Ġin Ġthe Ġclassic Ġuse Ġof Ġthe Ġword .< br Ġ/ >< br Ġ/> It Ġis Ġcalled ĠO Z Ġas Ġthat Ġis Ġthe Ġnickname Ġgiven Ġto Ġthe ĠOswald ĠMaximum ĠSecurity ĠState ĠPen itent ary . ĠIt Ġfocuses Ġmainly Ġon ĠEmerald ĠCity , Ġan Ġexperimental Ġsection Ġof Ġthe Ġprison Ġwhere Ġall Ġthe Ġcells Ġhave Ġglass Ġfronts Ġand Ġface Ġin wards , Ġso Ġprivacy Ġis Ġnot Ġhigh Ġon Ġthe Ġagenda . ĠEm ĠCity Ġis 

# Building the Model

In [18]:
import torch
import torch.nn as nn
from transformers import RobertaModel

# defining our model architecture 
class CustomRobertaModel(nn.Module):
    def __init__(self,num_labels=2):
        super(CustomRobertaModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(config.roberta_model_name)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels) # defining final output layer
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _ , pooled_output = self.roberta(input_ids, token_type_ids, attention_mask) # 
        logits = self.classifier(pooled_output)        
        return logits

In [35]:
roberta_model = CustomRobertaModel(num_labels=config.num_labels)

learn = Learner(data, roberta_model, metrics=[accuracy])

In [20]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.184614,0.180552,0.929,02:17


# Getting Predictions

In [21]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [39]:
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)

In [40]:
# accuracy on valid
(pred_values == data.valid_ds.y.items).mean()

0.929

# Saving/Loading the model weights

In [24]:
def save_model(learner, file_name):
    st = learner.model.state_dict()
    torch.save(st, file_name) # will save model in current dir # backend is pickle 

def load_model(learner, file_name):
    st = torch.load(file_name)
    learner.model.load_state_dict(st)

In [25]:
# monkey patching Learner methods to save and load model file
Learner.save_model = save_model
Learner.load_model = load_model

In [38]:
learn.save_model("my_model.bin")
learn.load_model("my_model.bin")