This notebook contains the base models (BERT, RoBERTa and DistilBERT) that we tested for our final project, before altering their top layers.

In [None]:
# Install the Kaggle CLI and Hugging Face transformers (stdout of the first call is discarded).
!pip install kaggle transformers >> /dev/null
!pip install transformers

# transformers is deliberately installed twice: the local instance was having issues with one or the other

In [None]:
# UNCOMMENT IF: setting up connection to kaggle instance to import data
# !rm -r ~/.kaggle | true && mkdir -p ~/.kaggle && cp kaggle.json ~/.kaggle
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c feedback-prize-2021

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the competition archive into ./raw_data, which the later read_csv calls expect.
# NOTE(review): re-running this cell on an already-extracted folder may prompt for overwrites — consider `-n` or `-o`.
!unzip feedback-prize-2021.zip -d raw_data

In [None]:
# from transformers import RobertaTokenizer, RobertaModel
# import transformers
# from transformers import AutoTokenizer, AutoModelForMaskedLM
# from transformers import BertModel
# import torchtext
# from torchtext.data import Dataset
# from torchtext.legacy import data
import transformers
from transformers import *
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.utils.data as data
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader

# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


def send_to_device(net):
    """Move ``net`` onto the global ``device`` and return the resulting model.

    On CUDA the model is additionally wrapped in ``torch.nn.DataParallel`` and
    cuDNN benchmark mode is switched on; on CPU the moved model is returned
    unchanged. A short status line is printed in both cases.
    """
    placed = net.to(device)

    # CPU path: nothing more to do than report and hand the model back.
    if device != 'cuda':
        print('\t ==> Model sent to cpu...')
        return placed

    # GPU path: wrap for multi-GPU execution and enable cuDNN benchmarking.
    placed = torch.nn.DataParallel(placed)
    cudnn.benchmark = True
    print('\t ==> Model sent to gpu...')
    return placed

In [None]:
# Load Kaggle's sample submission file to inspect the expected output format.
sample = pd.read_csv('raw_data/sample_submission.csv')
sample.loc[sample['class'].notnull()].head() # rows with a non-null 'class': verifying that this sheet contains only the submission format required by Kaggle

In [None]:
# Load the annotated training data and preview the first rows
# (the columns used later are 'discourse_text' and 'discourse_type').
exp_dataset = pd.read_csv('raw_data/train.csv')
exp_dataset.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [None]:
exp_dataset['discourse_text'][0] # snippet: text of the first discourse segment

'Modern humans today are always on their phone. They are always on their phone more than 5 hours a day no stop .All they do is text back and forward and just have group Chats on social media. They even do it while driving.'

In [None]:
# Hold out 15% of the rows for evaluation. The fixed seed keeps the split
# (and every metric computed on it) reproducible across kernel restarts.
train_df = exp_dataset.sample(frac=0.85, random_state=42)
# drop() already returns a new frame, so no explicit copy is needed.
test_df = exp_dataset.drop(train_df.index).reset_index(drop=True)

In [None]:
class WritingDataset(data.Dataset):
  """Torch dataset that tokenizes discourse snippets and encodes their type as an integer.

  Args:
    data: frame with a ``discourse_text`` column (text to tokenize) and a
      ``discourse_type`` column (class name).
    tokenizer: model id or path handed to ``AutoTokenizer.from_pretrained``.
    label_map: optional ``{class name: index}`` mapping. Pass the training
      set's map when building the test set so both splits share one encoding.
      When omitted, the map is built from the sorted class names in ``data``.

  Each item is ``[input_ids, label, attention_mask]`` where both tensors are
  int32 and ``label`` is the integer class index.
  """

  def __init__(self, data: pd.DataFrame, tokenizer: str, label_map: dict = None):
    self.data = data
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    if label_map is not None:
      self.label_map = dict(label_map)
    else:
      self.label_map = self._build_label_map(data)

  @staticmethod
  def _build_label_map(frame: pd.DataFrame) -> dict:
    """Map every distinct ``discourse_type`` in ``frame`` to an index, in sorted order."""
    # ``unique()`` follows first-appearance order, so two random splits would
    # number the same class differently; sorting makes the indices depend only
    # on the set of class names.
    labels = sorted(frame['discourse_type'].unique())
    return {label: i for i, label in enumerate(labels)}

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data.iloc[idx]
    # Pad/truncate every snippet to the tokenizer's maximum length so that
    # items can be stacked into fixed-size batches.
    encoded = self.tokenizer(sample['discourse_text'], padding='max_length', truncation=True)
    label = self.label_map[sample['discourse_type']]
    return [
        torch.tensor(encoded['input_ids'], dtype=torch.int), # data
        label, # label
        torch.tensor(encoded['attention_mask'], dtype=torch.int) # attention mask
      ]


In [None]:
# One train/test dataset pair per tokenizer; the tokenizer id must match the
# checkpoint of the model that will consume the batches.

# BERT UNCASED
# ('bert-base-uncased' is the published checkpoint id; 'bert-based-uncased' does not exist)
bert_train_dataset = WritingDataset(train_df, tokenizer='bert-base-uncased')
bert_test_dataset = WritingDataset(test_df, tokenizer='bert-base-uncased')


# ROBERTA 
roberta_train_dataset = WritingDataset(train_df, tokenizer='roberta-large')
roberta_test_dataset = WritingDataset(test_df, tokenizer='roberta-large')


# DISTILBERT UNCASED
distilbert_train_dataset = WritingDataset(train_df, tokenizer='distilbert-base-uncased')
distilbert_test_dataset = WritingDataset(test_df, tokenizer='distilbert-base-uncased')

In [None]:
# Batch size shared by every loader below.
BATCH_SIZE = 16

# Training loaders are shuffled each epoch. Test loaders are not: the metrics
# are averaged per batch, so a fixed order keeps evaluation deterministic.

# BERT UNCASED
bert_train_dataloader = DataLoader(bert_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
bert_test_dataloader = DataLoader(bert_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ROBERTA 
roberta_train_dataloader = DataLoader(roberta_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
roberta_test_dataloader = DataLoader(roberta_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# DISTILBERT UNCASED
distilbert_train_dataloader = DataLoader(distilbert_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
distilbert_test_dataloader = DataLoader(distilbert_test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
#BERT
class bertModel(torch.nn.Module):
    """Pretrained BERT encoder followed by a GRU and a linear classification head.

    Args:
        output_dim: number of output logits (one per class).
        hidden_dim: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the final GRU state, and
            between GRU layers when ``n_layers >= 2``.
        train_bert: when False (default) the BERT weights are frozen; when
            True they receive gradients and are fine-tuned.
    """

    def __init__(
        self,
        output_dim: int,
        hidden_dim: int,
        n_layers: int,
        bidirectional: bool,
        dropout: float,
        train_bert: bool = False):

        super().__init__()
        # Remembered so forward() knows whether to build a graph through BERT.
        self.train_bert = train_bert
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        if not train_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        embedding_dim = self.bert.config.hidden_size

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          # inter-layer dropout is only defined for stacked GRUs
                          dropout = 0 if n_layers < 2 else dropout)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, x, attention_mask=None):
        """Return class logits of shape ``(batch, output_dim)`` for token ids ``x``.

        ``attention_mask`` is optional and forwarded to BERT unchanged.
        """
        # Track gradients through BERT only when it is being fine-tuned, and
        # never re-enable them inside a caller's ``torch.no_grad()`` block.
        with torch.set_grad_enabled(self.train_bert and torch.is_grad_enabled()):
            embedded = self.bert(x, attention_mask=attention_mask)[0]

        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # last layer's forward (-2) and backward (-1) final states
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        output = self.out(hidden)

        return output

In [None]:
#ROBERTA 
class robertaModel(torch.nn.Module):
    """Pretrained RoBERTa-large encoder followed by a GRU and a linear classification head.

    Args:
        output_dim: number of output logits (one per class).
        hidden_dim: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the final GRU state, and
            between GRU layers when ``n_layers >= 2``.
        train_roberta: when False (default) the RoBERTa weights are frozen;
            when True they receive gradients and are fine-tuned.
    """

    def __init__(
        self,
        output_dim: int,
        hidden_dim: int,
        n_layers: int,
        bidirectional: bool,
        dropout: float,
        train_roberta: bool = False):

        super().__init__()
        # Remembered so forward() knows whether to build a graph through RoBERTa.
        self.train_roberta = train_roberta
        self.roberta = RobertaModel.from_pretrained('roberta-large')
        if not train_roberta:
            for param in self.roberta.parameters():
                param.requires_grad = False

        embedding_dim = self.roberta.config.hidden_size

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          # inter-layer dropout is only defined for stacked GRUs
                          dropout = 0 if n_layers < 2 else dropout)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, x, attention_mask=None):
        """Return class logits of shape ``(batch, output_dim)`` for token ids ``x``.

        ``attention_mask`` is optional and forwarded to RoBERTa unchanged.
        """
        # Track gradients through RoBERTa only when it is being fine-tuned, and
        # never re-enable them inside a caller's ``torch.no_grad()`` block.
        with torch.set_grad_enabled(self.train_roberta and torch.is_grad_enabled()):
            embedded = self.roberta(x, attention_mask=attention_mask)[0]

        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # last layer's forward (-2) and backward (-1) final states
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        output = self.out(hidden)

        return output

In [None]:
#DISTILBERT
class distilbertModel(torch.nn.Module):
    """Pretrained DistilBERT encoder followed by a GRU and a linear classification head.

    Args:
        output_dim: number of output logits (one per class).
        hidden_dim: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the final GRU state, and
            between GRU layers when ``n_layers >= 2``.
        train_distilbert: when False (default) the DistilBERT weights are
            frozen; when True they receive gradients and are fine-tuned.
    """

    def __init__(
        self,
        output_dim: int,
        hidden_dim: int,
        n_layers: int,
        bidirectional: bool,
        dropout: float,
        train_distilbert: bool = False):

        super().__init__()
        # Remembered so forward() knows whether to build a graph through DistilBERT.
        self.train_distilbert = train_distilbert
        # The checkpoint must be loaded with its own architecture class:
        # BertModel expects a different layer layout than DistilBERT provides.
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        if not train_distilbert:
            for param in self.distilbert.parameters():
                param.requires_grad = False

        # Attribute access works for DistilBERT's config, whose serialized dict
        # stores the width under a different key than BERT's.
        embedding_dim = self.distilbert.config.hidden_size

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          # inter-layer dropout is only defined for stacked GRUs
                          dropout = 0 if n_layers < 2 else dropout)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, x, attention_mask=None):
        """Return class logits of shape ``(batch, output_dim)`` for token ids ``x``.

        ``attention_mask`` is optional and forwarded to DistilBERT unchanged.
        """
        # Track gradients through DistilBERT only when it is being fine-tuned,
        # and never re-enable them inside a caller's ``torch.no_grad()`` block.
        with torch.set_grad_enabled(self.train_distilbert and torch.is_grad_enabled()):
            embedded = self.distilbert(x, attention_mask=attention_mask)[0]

        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # last layer's forward (-2) and backward (-1) final states
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        output = self.out(hidden)

        return output

In [None]:
import torch.optim as optim
from tqdm import tqdm

#architecture and hyperparams
# The models are instantiated first: an optimizer is bound to the parameters
# of a model *instance*, so it can only be created once the model exists.
BERT = send_to_device(
    bertModel(
      output_dim=len(bert_train_dataset.label_map),
      hidden_dim=256,
      n_layers=2,
      bidirectional=True,
      dropout=0,
      train_bert=False
  )
)

ROBERTA = send_to_device(
    robertaModel(
      output_dim=len(roberta_train_dataset.label_map),
      hidden_dim=256,
      n_layers=2,
      bidirectional=True,
      dropout=0,
      train_roberta=False
  )
)

DISTILBERT = send_to_device(
    distilbertModel(
      output_dim=len(distilbert_train_dataset.label_map),
      hidden_dim=256,
      n_layers=2,
      bidirectional=True,
      dropout=0,
      train_distilbert=False
  )
)


#BERT
#base
bert_optimizer = optim.Adam(BERT.parameters())

#altered
bert_tuned_optimizer = optim.Adam(BERT.parameters(), lr=3e-5, eps=1e-6, weight_decay=5e-4)



#ROBERTA
#base
roberta_optimizer = optim.Adam(ROBERTA.parameters())

#altered
roberta_tuned_optimizer = optim.Adam(ROBERTA.parameters(), lr=3e-5, eps=1e-6, weight_decay=5e-4)



#DISTILBERT
#base
distilbert_optimizer = optim.Adam(DISTILBERT.parameters())

#altered
distilbert_tuned_optimizer = optim.Adam(DISTILBERT.parameters(), lr=3e-5, eps=1e-6, weight_decay=5e-4)


#same for all 
# Each snippet has exactly one discourse type, so this is single-label
# multi-class classification: cross-entropy over the logits, not per-logit BCE.
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
def accuracy(preds, y):
    """Return the fraction of entries in ``preds`` that equal the targets ``y``.

    ``y`` is viewed with the shape of ``preds`` before the element-wise
    comparison; the count of matches is divided by ``y.shape[0]`` and returned
    as a float tensor.
    """
    targets = y.view_as(preds)
    hits = (preds == targets).sum()
    return hits.float() / y.shape[0]

In [None]:
def train_step(model, optimizer, criterion, train_dataloader):
  """Run one training epoch.

  Args:
    model: network mapping a batch of token ids to logits of shape (batch, classes).
    optimizer: optimizer bound to ``model``'s parameters.
    criterion: loss taking ``(logits, target)``. Class-index targets are used,
      except for ``nn.BCEWithLogitsLoss`` which is given one-hot float targets.
    train_dataloader: yields ``(input_ids, label, attention_mask)`` batches.

  Returns:
    ``(mean batch loss, mean batch accuracy)`` over the epoch.
  """
  epoch_loss = 0
  epoch_acc = 0

  model.train()
  # The attention mask (third item) is not consumed by this loop.
  for batch, label, _ in tqdm(train_dataloader):
    batch = batch.int().to(device)
    label = label.long().to(device)
    optimizer.zero_grad()

    logits = model(batch)

    # The loss has to be computed on the raw logits: argmax is not
    # differentiable, so a loss built on predicted class ids carries no
    # gradient back to the model and nothing is learned.
    if isinstance(criterion, nn.BCEWithLogitsLoss):
      target = nn.functional.one_hot(label, num_classes=logits.shape[1]).float()
    else:
      target = label
    loss = criterion(logits, target)
    loss.backward()

    optimizer.step()

    # Hard predictions are only needed for the accuracy metric.
    preds = torch.argmax(logits, dim=1)
    acc = accuracy(preds, label)

    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(train_dataloader), epoch_acc / len(train_dataloader)

In [None]:
def evaluate(model, criterion, test_dataloader):
  """Evaluate ``model`` on ``test_dataloader`` without updating any weights.

  Args:
    model: network mapping a batch of token ids to logits of shape (batch, classes).
    criterion: loss taking ``(logits, target)``. Class-index targets are used,
      except for ``nn.BCEWithLogitsLoss`` which is given one-hot float targets.
    test_dataloader: yields ``(input_ids, label, attention_mask)`` batches.

  Returns:
    ``(mean batch loss, mean batch accuracy)`` over ``test_dataloader``.
  """
  eval_loss = 0
  eval_acc = 0

  model.eval()
  with torch.no_grad():
    # The attention mask (third item) is not consumed by this loop.
    for batch, label, _ in test_dataloader:
      batch = batch.int().to(device)
      label = label.long().to(device)

      logits = model(batch)

      # Loss on the raw logits, so it is comparable to the training loss.
      if isinstance(criterion, nn.BCEWithLogitsLoss):
        target = nn.functional.one_hot(label, num_classes=logits.shape[1]).float()
      else:
        target = label
      loss = criterion(logits, target)

      preds = torch.argmax(logits, dim=1)
      acc = accuracy(preds, label)

      eval_loss += loss.item()
      eval_acc += acc.item()

  # Average over the loader that was actually iterated.
  return eval_loss / len(test_dataloader), eval_acc / len(test_dataloader)


In [None]:
def train(model, optimizer, criterion, train_dataloader, test_dataloader, epochs):
  """Train ``model`` for ``epochs`` epochs, evaluating and checkpointing after each one.

  After every epoch the whole model is saved to ``model_<epoch>.pt`` and a
  summary line is printed.

  Returns:
    The history dict ``{'train': {'loss': [...], 'acc': [...]},
    'eval': {'loss': [...], 'acc': [...]}}`` with one entry per epoch.
  """
  history = {
      'train': {
          'loss': [],
          'acc': []
        },
      'eval': {
          'loss': [],
          'acc': []
        }
      }

  for epoch in range(epochs):
    train_loss, train_acc = train_step(model, optimizer, criterion, train_dataloader)
    eval_loss, eval_acc = evaluate(model, criterion, test_dataloader)
    history['train']['loss'].append(train_loss)
    history['train']['acc'].append(train_acc)
    history['eval']['loss'].append(eval_loss)
    history['eval']['acc'].append(eval_acc)
    # f-string so each epoch gets its own checkpoint instead of overwriting
    # a single file literally named 'model_{epoch}.pt'.
    torch.save(model, f'model_{epoch}.pt')
    print(f'Epoch: {epoch}, Train Loss: {train_loss}, Train Acc: {train_acc*100}, Eval Loss: {eval_loss}, Eval Acc: {eval_acc*100}')

  return history


In [None]:
# Usage: train(model, optimizer, criterion, train_dataloader, test_dataloader, 5)
# The first argument is the model *instance* (BERT / ROBERTA / DISTILBERT), not the class.
# Uncomment exactly one call per run.


#BERT
# train(BERT, bert_optimizer, criterion, bert_train_dataloader, bert_test_dataloader, 5)

# train(BERT, bert_tuned_optimizer, criterion, bert_train_dataloader, bert_test_dataloader, 5)



#ROBERTA
train(ROBERTA, roberta_optimizer, criterion, roberta_train_dataloader, roberta_test_dataloader, 5)

# train(ROBERTA, roberta_tuned_optimizer, criterion, roberta_train_dataloader, roberta_test_dataloader, 5)



#DISTILBERT
# train(DISTILBERT, distilbert_optimizer, criterion, distilbert_train_dataloader, distilbert_test_dataloader, 5)

# train(DISTILBERT, distilbert_tuned_optimizer, criterion, distilbert_train_dataloader, distilbert_test_dataloader, 5)

100%|██████████| 313/313 [17:07<00:00,  3.28s/it]


Epoch: 0, Train Loss: -1.5718411876085085, Train Acc: 6.25, Eval Loss: -0.28306302513939124, Eval Acc: 2.146565495207668


100%|██████████| 313/313 [17:07<00:00,  3.28s/it]


Epoch: 1, Train Loss: -1.558165132904205, Train Acc: 6.3099041533546325, Eval Loss: -0.25892134329762323, Eval Acc: 2.086661341853035


100%|██████████| 313/313 [17:06<00:00,  3.28s/it]


Epoch: 2, Train Loss: -1.538927958343928, Train Acc: 6.399760383386581, Eval Loss: -0.2698230284471481, Eval Acc: 2.2064696485623


100%|██████████| 313/313 [17:08<00:00,  3.28s/it]


Epoch: 3, Train Loss: -1.5922786501316597, Train Acc: 6.419728434504793, Eval Loss: -0.2739928555183898, Eval Acc: 2.1765175718849843


100%|██████████| 313/313 [17:08<00:00,  3.29s/it]


Epoch: 4, Train Loss: -1.5701346629486679, Train Acc: 6.409744408945688, Eval Loss: -0.2634501600036987, Eval Acc: 2.1765175718849843
