In [1]:
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
class BertClassifier(nn.Module):
    def __init__(self, dropout = .3):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3)
        self.softmax = nn.Softmax(dim = 1)
        
    def forward(self, input_ids, attention_mask):
        pooled_output = self.bert(input_ids, attention_mask)[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

In [4]:
from torchsample.modules import ModuleTrainer
from transformers import AdamW

In [5]:
model = BertClassifier()
trainer = ModuleTrainer(model)

trainer.compile(loss = nn.CrossEntropyLoss(), 
                optimizer = AdamW(params = model.parameters(), 
                                  lr = 1e-5, 
                                  correct_bias = False))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import numpy as np

In [7]:
# Load CSV into DataFrame
df_tweets = pd.read_csv("Data/Twitter_Data.csv")

# Rename columns
df_tweets.columns = ["text", "label"]

# Drop NaN
df_tweets.dropna(inplace=True)

# Create X and y
X, y = df_tweets.text.values, df_tweets.label.values

# -1 (Negative) is now 2
y[y < 0] = 2
y = np.longlong(y)

# Small dataset for TESTING
X_small, y_small = X[:3000], y[:3000]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_small, y_small, test_size=.3, random_state=42)

In [9]:
test = tokenizer(text = X_small[0],
                       padding = 'max_length',
                       max_length = 32,             # 512 is max for BERT
                       truncation = True,
                       return_tensors = "pt"        # output as torch.Tensor
                       )
test

{'input_ids': tensor([[  101,  2043, 16913,  2072,  5763,  1523,  6263,  2231,  4555, 10615,
          1524,  3517,  2032,  4088,  1996,  3697,  3105, 29455,  1996,  2110,
          2339,  2515,  2202,  2086,  2131,  3425,  2110,  2323,  1998,  2025,
          2449,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
trainer.fit(test, y_train,
            num_epoch=20, 
            batch_size=128,
            verbose=1)

KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'