<a href="https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/Generic_Transformer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 19.1MB/s eta 0:00:01[K     |▉                               | 20kB 2.8MB/s eta 0:00:01[K     |█▎                              | 30kB 4.0MB/s eta 0:00:01[K     |█▊                              | 40kB 4.4MB/s eta 0:00:01[K     |██▏                             | 51kB 3.4MB/s eta 0:00:01[K     |██▋                             | 61kB 3.9MB/s eta 0:00:01[K     |███                             | 71kB 4.0MB/s eta 0:00:01[K     |███▍                            | 81kB 4.5MB/s eta 0:00:01[K     |███▉                            | 92kB 4.7MB/s eta 0:00:01[K     |████▎                           | 102kB 4.5MB/s eta 0:00:01[K     |████▊                           | 112kB 4.5MB/s eta 0:00:01[K     |█████▏                          | 122kB 4.5M

In [6]:
import logging
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import *  # wildcard kept as-is; the explicit imports below it take precedence

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# 1. Set Configuration

In [11]:
class Config:
    """Notebook-wide settings, kept as plain class attributes."""

    # CSV inputs
    train_file = './data.csv'
    eval_file = './eval.csv'

    # Pretrained checkpoint and number of target classes
    model_name = 'bert-base-uncased'
    n_classes = 3

    # Sequence length and batch size
    max_seq_len = 128
    batch_size = 32

    # Training hyperparameters
    epochs = 5
    learning_rate = 2e-5

    # Target device name
    device = 'cpu'


# The class object itself (not an instance) serves as the settings handle.
flags = Config

# 2. Build Dataset Pipeline

In [7]:
class TextLabelDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of texts; each item is converted with ``str``.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode the text at position ``item``.

        Returns:
            dict with keys ``texts`` (the text as ``str``), ``input_ids`` and
            ``attention_mask`` (the tokenizer's tensors, flattened to 1-D) and
            ``targets`` (the label as a ``torch.long`` tensor).
        """
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `pad_to_max_length=True` is deprecated; request fixed-length padding
            # and truncation explicitly so every item has exactly `max_len` tokens.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'texts': text,
            # return_tensors='pt' yields a batch of one; flatten drops that axis.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long),
        }

def create_data_loader(df, tokenizer, max_len, batch_size, is_prediction=False, num_workers=4):
    """Build a ``DataLoader`` over a ``TextLabelDataset``.

    Args:
        df: A DataFrame with a ``text`` column (and a ``labels`` column unless
            ``is_prediction`` is true), or a path to a CSV file that is read
            with ``pd.read_csv``.
        tokenizer: Tokenizer forwarded to ``TextLabelDataset``.
        max_len: Sequence length forwarded to ``TextLabelDataset``.
        batch_size: Batch size of the returned loader.
        is_prediction: When true, the ``labels`` column is not read and every
            item gets the placeholder label ``-1``.
        num_workers: Worker process count for the loader (default 4, the value
            that used to be hard-coded).

    Returns:
        torch.utils.data.DataLoader over the dataset, in both modes.
    """
    if isinstance(df, str):
        df = pd.read_csv(df)

    texts = df.text.to_numpy()
    if is_prediction:
        # No ground truth at prediction time: fill with a -1 placeholder per row.
        labels = np.array([-1] * len(texts))
    else:
        labels = df.labels.to_numpy()

    ds = TextLabelDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len,
    )

    # Returned for both modes; previously the return sat inside the `else`
    # branch, so prediction mode fell through and returned None.
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=num_workers,
    )

# 3. Build Model 

In [8]:
class Classifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
        n_classes: Number of logits produced by the head.
    """

    def __init__(self, model_name, n_classes):
        super(Classifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's second output element.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Index rather than tuple-unpack: newer Transformers releases return a
        # dict-like ModelOutput, and unpacking that yields its keys (strings),
        # not tensors. Integer indexing works for both tuples and ModelOutput.
        # NOTE(review): assumes the encoder exposes a pooled output at position 1
        # (true for BERT) - confirm before using other architectures.
        pooled_output = outputs[1]
        return self.out(self.drop(pooled_output))

In [None]:
class ClassificationModel:
    """Hold the tokenizer and classifier built from a settings object.

    Args:
        flags: Settings object providing ``model_name``, ``n_classes`` and
            ``device``.

    Attributes are set in ``__init__``: ``tokenizer``, ``model`` and ``device``.
    """

    def __init__(self, flags):
        # The device comes from the settings object; the bare name `device`
        # used here before was never defined and raised NameError.
        self.device = flags.device
        # Keep both objects on the instance; as locals they were dropped as
        # soon as __init__ returned.
        self.tokenizer = BertTokenizer.from_pretrained(flags.model_name)
        self.model = Classifier(flags.model_name, flags.n_classes).to(self.device)
    


    