In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util

In [3]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=True,
    seed=1,
    batch_size=64,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)

In [4]:
from allennlp.common.checks import ConfigurationError

In [5]:
USE_GPU = torch.cuda.is_available()
USE_GPU

False

In [6]:
DATA_ROOT = Path("data") / "jigsaw"
DATA_ROOT

PosixPath('data/jigsaw')

Set random seed manually to replicate results

In [7]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x7ff118947930>

# Load Data

In [8]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader

### Prepare dataset

In [9]:
label_cols = ["toxic", "severe_toxic", "obscene",
              "threat", "insult", "identity_hate"]

In [10]:
from allennlp.data.fields import TextField, MetadataField, ArrayField

class JigsawDatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], id: str,
                         labels: np.ndarray) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}
        
        id_field = MetadataField(id)
        fields["id"] = id_field
        
        label_field = ArrayField(array=labels)
        fields["label"] = label_field

        return Instance(fields)
    
    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing: df = df.head(1000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["comment_text"])],
                row["id"], row[label_cols].values,
            )

### Prepare token handlers

We will use the spacy tokenizer here

In [11]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers import SingleIdTokenIndexer

# the token indexer is responsible for mapping tokens to integers
token_indexer = SingleIdTokenIndexer()

def tokenizer(x: str):
    return [w.text for w in
            SpacyWordSplitter(language='en_core_web_sm', 
                              pos_tags=False).split_words(x)[:config.max_seq_len]]

In [12]:
reader = JigsawDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [14]:
train_ds,  = (reader.read(DATA_ROOT / fname) for fname in ["train.csv", ])
val_ds = None


0it [00:00, ?it/s][A
1it [00:00,  1.31it/s][A
39it [00:00,  1.87it/s][A
70it [00:00,  2.66it/s][A
114it [00:01,  3.79it/s][A
147it [00:01,  5.39it/s][A
184it [00:01,  7.65it/s][A
226it [00:01, 10.85it/s][A
266it [00:01, 15.32it/s][A
315it [00:01, 21.60it/s][A
355it [00:01, 30.12it/s][A
394it [00:01, 41.37it/s][A
431it [00:01, 56.12it/s][A
475it [00:02, 75.92it/s][A
513it [00:02, 98.40it/s][A
550it [00:02, 124.26it/s][A
585it [00:02, 147.18it/s][A
618it [00:02, 175.27it/s][A
651it [00:02, 190.77it/s][A
681it [00:02, 211.49it/s][A
714it [00:02, 236.32it/s][A
748it [00:02, 259.78it/s][A
781it [00:03, 275.27it/s][A
822it [00:03, 305.33it/s][A
861it [00:03, 325.84it/s][A
897it [00:03, 273.44it/s][A
932it [00:03, 291.75it/s][A
967it [00:03, 305.15it/s][A
1000it [00:03, 300.53it/s][A

In [15]:
len(train_ds)

1000

In [16]:
train_ds[:10]

[<allennlp.data.instance.Instance at 0x7ff0bbb6b588>,
 <allennlp.data.instance.Instance at 0x7ff0bbb615c0>,
 <allennlp.data.instance.Instance at 0x7ff0bbb4e2b0>,
 <allennlp.data.instance.Instance at 0x7ff0bbbe5550>,
 <allennlp.data.instance.Instance at 0x7ff0bbbd42b0>,
 <allennlp.data.instance.Instance at 0x7ff0bbbc3a58>,
 <allennlp.data.instance.Instance at 0x7ff0bbbc30b8>,
 <allennlp.data.instance.Instance at 0x7ff0bbbc6c18>,
 <allennlp.data.instance.Instance at 0x7ff0bf507cc0>,
 <allennlp.data.instance.Instance at 0x7ff0bf50af60>]

In [17]:
vars(train_ds[0].fields["tokens"])

{'tokens': [Explanation,
  Why,
  the,
  edits,
  made,
  under,
  my,
  username,
  Hardcore,
  Metallica,
  Fan,
  were,
  reverted,
  ?,
  They,
  were,
  n't,
  vandalisms,
  ,,
  just,
  closure,
  on,
  some,
  GAs,
  after,
  I,
  voted,
  at,
  New,
  York,
  Dolls,
  FAC,
  .,
  And,
  please,
  do,
  n't,
  remove,
  the,
  template,
  from,
  the,
  talk,
  page,
  since,
  I,
  'm,
  retired,
  now.89.205.38.27],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.single_id_token_indexer.SingleIdTokenIndexer at 0x7ff0bfff5940>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None}

### Prepare vocabulary

In [18]:
vocab = Vocabulary.from_instances(train_ds, max_vocab_size=config.max_vocab_size)

06/06/2019 16:13:22 - INFO - allennlp.data.vocabulary -   Fitting token dictionary from dataset.

  0%|          | 0/1000 [00:00<?, ?it/s][A
100%|██████████| 1000/1000 [00:00<00:00, 15049.58it/s][A

### Prepare iterator

The iterator is responsible for batching the data and preparing it for input into the model. We'll use the BucketIterator that batches text sequences of smilar lengths together.

In [19]:
from allennlp.data.iterators import BucketIterator

In [20]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

We need to tell the iterator how to numericalize the text data. We do this by passing the vocabulary to the iterator. This step is easy to forget so be careful! 

In [21]:
iterator.index_with(vocab)

### Read sample

In [22]:
batch = next(iter(iterator(train_ds)))

In [23]:
batch

{'tokens': {'tokens': tensor([[   3,   14,    9,  ...,    0,    0,    0],
          [ 912,   15,   94,  ...,    0,    0,    0],
          [ 683,    4,  250,  ...,    0,    0,    0],
          ...,
          [   5, 4205, 4206,  ...,    0,    0,    0],
          [ 116,  157,   20,  ...,   54,   21,    0],
          [1946, 2977, 2978,  ..., 6523,    0,    0]])},
 'id': ['0228f8ff42b5cf40',
  '021d68ecd1056359',
  '011813b36dadcbe1',
  '02ae218e901a58f0',
  '028e859eada4bd90',
  '02a36ddb2884cb16',
  '0053978373606ba4',
  '013004b25c95612e',
  '015481306382caba',
  '01c78f7b48cb8057',
  '020c16fd5ab352ef',
  '00ab1b314832a2d9',
  '0296430035f695b9',
  '01f3d14df61d3481',
  '0206ea29807e8b6a',
  '028b4485f9893caf',
  '006fc8cfaa4faf0b',
  '02031a9ccbd4bac8',
  '02b0a716a458a796',
  '02004db8cf8f91f0',
  '00c438b052cc0f26',
  '0278f245edc81773',
  '012db3deb39d94ca',
  '012d059ecf3c421b',
  '00e3d92fb826eceb',
  '01076f023a99d133',
  '02195073bd639320',
  '01728e7abf3ee345',
  '00fd5ba387c67

In [24]:
batch["tokens"]["tokens"]

tensor([[   3,   14,    9,  ...,    0,    0,    0],
        [ 912,   15,   94,  ...,    0,    0,    0],
        [ 683,    4,  250,  ...,    0,    0,    0],
        ...,
        [   5, 4205, 4206,  ...,    0,    0,    0],
        [ 116,  157,   20,  ...,   54,   21,    0],
        [1946, 2977, 2978,  ..., 6523,    0,    0]])

In [25]:
batch["tokens"]["tokens"].shape

torch.Size([64, 40])

# Prepare Model

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim

In [27]:
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder

class BaselineModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = nn.BCEWithLogitsLoss()
        
    def forward(self, tokens: Dict[str, torch.Tensor],
                id: Any, label: torch.Tensor) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        
        output = {"class_logits": class_logits}
        output["loss"] = self.loss(class_logits, label)

        return output



### Prepare embeddings

In [28]:
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

token_embedding = Embedding(num_embeddings=config.max_vocab_size + 2,
                            embedding_dim=300, padding_index=0)
# the embedder maps the input tokens to the appropriate embedding matrix
word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

In [29]:
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(),
                                                        config.hidden_sz, bidirectional=True, batch_first=True))

Notice how simple and modular the code for initializing the model is. All the complexity is delegated to each component.

In [30]:
model = BaselineModel(
    word_embeddings, 
    encoder, 
)

In [31]:
if USE_GPU: model.cuda()
else: model

# Basic sanity checks

In [32]:
batch = nn_util.move_to_device(batch, 0 if USE_GPU else -1)

In [33]:
tokens = batch["tokens"]
labels = batch

In [34]:
tokens

{'tokens': tensor([[   3,   14,    9,  ...,    0,    0,    0],
         [ 912,   15,   94,  ...,    0,    0,    0],
         [ 683,    4,  250,  ...,    0,    0,    0],
         ...,
         [   5, 4205, 4206,  ...,    0,    0,    0],
         [ 116,  157,   20,  ...,   54,   21,    0],
         [1946, 2977, 2978,  ..., 6523,    0,    0]])}

In [35]:
mask = get_text_field_mask(tokens)
mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 0, 0]])

In [36]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)
class_logits

tensor([[-0.0395,  0.0811, -0.0303,  0.0291,  0.0160, -0.0256],
        [-0.0389,  0.0811, -0.0297,  0.0274,  0.0173, -0.0238],
        [-0.0381,  0.0792, -0.0313,  0.0284,  0.0153, -0.0257],
        [-0.0394,  0.0806, -0.0302,  0.0265,  0.0165, -0.0245],
        [-0.0389,  0.0801, -0.0313,  0.0256,  0.0167, -0.0264],
        [-0.0410,  0.0791, -0.0308,  0.0291,  0.0168, -0.0246],
        [-0.0385,  0.0799, -0.0303,  0.0274,  0.0173, -0.0245],
        [-0.0392,  0.0810, -0.0318,  0.0293,  0.0153, -0.0259],
        [-0.0387,  0.0808, -0.0310,  0.0278,  0.0156, -0.0251],
        [-0.0387,  0.0807, -0.0307,  0.0261,  0.0166, -0.0250],
        [-0.0379,  0.0803, -0.0328,  0.0280,  0.0171, -0.0236],
        [-0.0381,  0.0805, -0.0335,  0.0296,  0.0170, -0.0263],
        [-0.0381,  0.0815, -0.0312,  0.0288,  0.0160, -0.0249],
        [-0.0396,  0.0805, -0.0313,  0.0284,  0.0166, -0.0241],
        [-0.0386,  0.0795, -0.0309,  0.0279,  0.0162, -0.0235],
        [-0.0387,  0.0815, -0.0301,  0.0

In [37]:
model(**batch)

{'class_logits': tensor([[-0.0395,  0.0811, -0.0303,  0.0291,  0.0160, -0.0256],
         [-0.0389,  0.0811, -0.0297,  0.0274,  0.0173, -0.0238],
         [-0.0381,  0.0792, -0.0313,  0.0284,  0.0153, -0.0257],
         [-0.0394,  0.0806, -0.0302,  0.0265,  0.0165, -0.0245],
         [-0.0389,  0.0801, -0.0313,  0.0256,  0.0167, -0.0264],
         [-0.0410,  0.0791, -0.0308,  0.0291,  0.0168, -0.0246],
         [-0.0385,  0.0799, -0.0303,  0.0274,  0.0173, -0.0245],
         [-0.0392,  0.0810, -0.0318,  0.0293,  0.0153, -0.0259],
         [-0.0387,  0.0808, -0.0310,  0.0278,  0.0156, -0.0251],
         [-0.0387,  0.0807, -0.0307,  0.0261,  0.0166, -0.0250],
         [-0.0379,  0.0803, -0.0328,  0.0280,  0.0171, -0.0236],
         [-0.0381,  0.0805, -0.0335,  0.0296,  0.0170, -0.0263],
         [-0.0381,  0.0815, -0.0312,  0.0288,  0.0160, -0.0249],
         [-0.0396,  0.0805, -0.0313,  0.0284,  0.0166, -0.0241],
         [-0.0386,  0.0795, -0.0309,  0.0279,  0.0162, -0.0235],
         

In [38]:
loss = model(**batch)["loss"]

In [41]:
loss

tensor(0.6963, grad_fn=<MeanBackward1>)

In [42]:
loss.backward()

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

# Train

In [43]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [44]:
from allennlp.training.trainer import Trainer

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,
    num_epochs=config.epochs,
)

In [45]:
metrics = trainer.train()

06/06/2019 16:16:01 - INFO - allennlp.training.trainer -   Beginning training.
06/06/2019 16:16:01 - INFO - allennlp.training.trainer -   Epoch 0/1
06/06/2019 16:16:01 - INFO - allennlp.training.trainer -   Peak CPU memory usage MB: 897.392
06/06/2019 16:16:01 - INFO - allennlp.training.trainer -   Training

  0%|          | 0/16 [00:00<?, ?it/s][A
loss: 0.6970 ||:   6%|▋         | 1/16 [00:00<00:09,  1.52it/s][A
loss: 0.6953 ||:  12%|█▎        | 2/16 [00:01<00:11,  1.24it/s][A
loss: 0.6941 ||:  19%|█▉        | 3/16 [00:03<00:12,  1.06it/s][A
loss: 0.6930 ||:  25%|██▌       | 4/16 [00:03<00:09,  1.24it/s][A
loss: 0.6924 ||:  31%|███▏      | 5/16 [00:03<00:07,  1.49it/s][A
loss: 0.6913 ||:  38%|███▊      | 6/16 [00:04<00:07,  1.36it/s][A
loss: 0.6905 ||:  44%|████▍     | 7/16 [00:05<00:05,  1.63it/s][A
loss: 0.6896 ||:  50%|█████     | 8/16 [00:05<00:04,  1.82it/s][A
loss: 0.6886 ||:  56%|█████▋    | 9/16 [00:06<00:04,  1.57it/s][A
loss: 0.6876 ||:  62%|██████▎   | 10/16 [00:0