In [9]:
import pandas as pd

import torch
import torch.nn as nn
from torch import cuda
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import AutoModel, AutoTokenizer

### Configuration

In [10]:
# Shared configuration for the cells below: checkpoint name, hyper-parameters,
# compute device and tokenizer.
DISTIL_BERT_CHECKPOINT = 'distilbert-base-uncased'

MAX_LEN = 512          # handed to QuoraDataset as `max_len`
BATCH_SIZE = 4         # handed to the training DataLoader
EPOCHS = 1
LEARNING_RATE = 1e-5   # learning rate given to the optimizer

# Prefer the GPU when torch reports one as available, otherwise use the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer loaded from the same checkpoint name as the encoder.
tokenizer = AutoTokenizer.from_pretrained(DISTIL_BERT_CHECKPOINT)

### Dataset and dataloader

In [5]:
class QuoraDataset(Dataset):
    """Map-style dataset that serves tokenized questions from a CSV file.

    The CSV is read once in the constructor and rows are tokenized lazily in
    ``__getitem__``. Only the ``question_text`` and ``target`` columns are read.

    Args:
        file_path: Path of the CSV file, forwarded to ``pandas.read_csv``.
        tokenizer: Callable invoked as ``tokenizer([text], padding=...,
            truncation=..., max_length=..., return_tensors="pt")``; it must
            return a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors whose first axis is the one-element batch.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, file_path, tokenizer, max_len):
        self._dataset = pd.read_csv(file_path, low_memory=False)
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __getitem__(self, index):
        """Return the tokenized question and its label for row ``index``.

        Returns:
            dict with
            ``"ids"``    -- the tokenizer's ``input_ids`` without the batch axis,
            ``"mask"``   -- the tokenizer's ``attention_mask`` without the batch axis,
            ``"target"`` -- the row's ``target`` value as a ``torch.long`` tensor.
        """
        row = self._dataset.iloc[index]

        # `padding=True` pads to the longest sequence *in the call*, which is a
        # no-op for a single text. Pad/truncate to the fixed `max_len` instead
        # so that all examples have equal length and can be stacked into a batch.
        inputs = self._tokenizer(
            [row["question_text"]],
            padding="max_length",
            truncation=True,
            max_length=self._max_len,
            return_tensors="pt",
        )

        return {
            # Drop the leading one-element batch axis; the DataLoader adds the
            # real batch axis when it collates examples.
            "ids": inputs["input_ids"].squeeze(0),
            "mask": inputs["attention_mask"].squeeze(0),
            "target": torch.tensor(row["target"], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return len(self._dataset)

In [6]:
# Training split only -- the test split and preprocessing are still to be added.
train_dataset = QuoraDataset(
    file_path="../data/train.csv",
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Batches of BATCH_SIZE examples, drawn in shuffled order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

### DistilBert Model

In [13]:
# Sequence classifier: pretrained encoder + dropout + linear head.
class DistilBertModel(nn.Module):
    """Encoder loaded from ``DISTIL_BERT_CHECKPOINT`` with a linear classification head.

    Args:
        num_labels: Number of logits produced per example. Defaults to 2 so the
            output can be fed to ``CrossEntropyLoss`` with class-index targets
            (a single logit would make that loss degenerate).
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.distil_bert = transformers.AutoModel.from_pretrained(DISTIL_BERT_CHECKPOINT)
        self.drop1 = nn.Dropout(0.2)
        # 768 is the hidden width of the loaded encoder (see the printed
        # module tree: Embedding(30522, 768), Linear(768 -> 768)).
        self.linear1 = nn.Linear(768, num_labels)

    def forward(self, ids, mask):
        """Return one row of ``num_labels`` logits per input sequence.

        Args:
            ids: Token ids, passed to the encoder as ``input_ids``.
                Assumed to be shaped (batch, seq_len) -- TODO confirm against the loader.
            mask: Attention mask matching ``ids``, passed as ``attention_mask``.

        Returns:
            Tensor of shape (batch, num_labels).
        """
        # Element 0 of the encoder output is the per-token hidden states,
        # shaped (batch, seq_len, hidden).
        hidden_states = self.distil_bert(input_ids=ids, attention_mask=mask)[0]

        # Classify the sequence from a single vector -- the first position
        # (where BERT-style tokenizers place [CLS]). Applying the head to every
        # token would yield per-token outputs, which a downstream loss would
        # misread as class scores.
        pooled = hidden_states[:, 0]

        return self.linear1(self.drop1(pooled))

# Build the classifier and move its parameters to the selected device
# (`Module.to` returns the module itself, so the call can be chained).
model = DistilBertModel().to(device)
model  # last expression of the cell: display the module tree

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (distil_bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear

### Training

In [17]:
# Objective and optimizer used by the training loop: cross-entropy over the
# model's logits, and Adam over every model parameter at LEARNING_RATE.
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch=1, train_loader, optimizer, loss):
    model.train()

    for idx, inputs in enumerate(train_loader):
        if idx%1_000 == 0:
            print(f"{idx}/{len(train_loader)} -- with accuracy #TODO")

        ids = inputs['ids'].to(device)
        mask = inputs['mask'].to(device)
        target = inputs['target'].to(device)

        outputs = model(idx, mask).squeeze()

        optimizer.zero_grad()
