In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset, load_dataset
from glob import glob
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, log_loss
import collections
import lightgbm
import multiprocessing as mp
import os

import warnings
warnings.filterwarnings('ignore')

### Loading and Preprocessing of Data

In [2]:
# Root of the competition data; the essay .txt files live in train/ and test/.
# NOTE: `dir` shadows the Python built-in, but later cells reference this name.
dir = "../input/feedback-prize-effectiveness"
essay_dir_train = f"{dir}/train"
essay_dir_test = f"{dir}/test"

# One row per discourse element.
train = pd.read_csv(f"{dir}/train.csv")
test = pd.read_csv(f"{dir}/test.csv")

# Distinct essay ids, used below to read each essay file once.
train_essay_ids = train["essay_id"].unique()
test_essay_ids = test["essay_id"].unique()
discourse_ids = train["discourse_id"].values

def read_essay_train(essay_id):
  """Return the full text of the training essay `essay_id`.

  The essay is read from `{essay_dir_train}/{essay_id}.txt`. The file is
  decoded as UTF-8 explicitly so the result does not depend on the locale of
  the machine running the notebook.
  """
  fname_train = f"{essay_dir_train}/{essay_id}.txt"
  with open(fname_train, encoding="utf-8") as f:
    return f.read()

def read_essay_test(essay_id):
  """Return the full text of the test essay `essay_id`.

  The essay is read from `{essay_dir_test}/{essay_id}.txt`. The file is
  decoded as UTF-8 explicitly so the result does not depend on the locale of
  the machine running the notebook.
  """
  fname_test = f"{essay_dir_test}/{essay_id}.txt"
  with open(fname_test, encoding="utf-8") as f:
    return f.read()

# Read every essay file exactly once (one file per distinct essay id).
train_essays = list(map(read_essay_train, train_essay_ids))
test_essays = list(map(read_essay_test, test_essay_ids))

# essay_id -> full essay text
train_essay_map = {essay_id: text for essay_id, text in zip(train_essay_ids, train_essays)}
test_essay_map = {essay_id: text for essay_id, text in zip(test_essay_ids, test_essays)}

# Attach the full essay text to every discourse row of that essay.
train["essay_text"] = train["essay_id"].map(train_essay_map)
test["essay_text"] = test["essay_id"].map(test_essay_map)

train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay_text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."


In [3]:
!ls

__notebook_source__.ipynb


In [4]:
# Checkpoint name used for both the tokenizer and (later) the backbone.
endpoint = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(endpoint)
d_types = sorted(train["discourse_type"].unique())


def _register_marker(marker):
  """Add `marker` to the tokenizer as a special token and return its token id."""
  tokenizer.add_tokens([marker], special_tokens=True)
  # Index 1 is the marker itself: the encoded samples shown further down start
  # with a leading special token at index 0.
  return tokenizer.encode(marker)[1]


# All START markers are registered before any END marker, so each family gets
# its own consecutive id range; the pooling code selects markers by range.
start_token_ids = [_register_marker(f"[START_{d_type}]") for d_type in d_types]
end_token_ids = [_register_marker(f"[END_{d_type}]") for d_type in d_types]

tokenizer.add_tokens(["\n"], special_tokens=True)
# Vocabulary size including the added tokens; used to resize the embeddings.
vocab_size = len(tokenizer)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Group the discourse rows by essay; sort=False leaves the group keys unsorted.
grps_train = train.groupby(by="essay_id", sort=False)
grps_test = test.groupby(by="essay_id", sort=False)

def process_essays(grps, mode):
  """Build one marker-annotated text per essay from grouped discourse rows.

  For every group in `grps`, each discourse element is wrapped as
  ` [START_<type>]  <text> [END_<type>] `. The wrapped elements are
  concatenated in row order and prefixed with the space-joined list of
  discourse types followed by ` [SEP] `.

  Args:
    grps: a pandas GroupBy over essay rows, exposing `.groups` and
      `.get_group`; each group needs `discourse_text` and `discourse_type`
      columns (and `discourse_effectiveness` when `mode == "train"`).
    mode: "train" to also collect labels; any other value returns texts only.

  Returns:
    `(processed_texts, labels_grp)` when `mode == "train"`, where
    `labels_grp[i]` is the array of effectiveness labels of essay `i`;
    otherwise just `processed_texts`.
  """
  is_train = mode == "train"
  processed_texts = []
  labels_grp = []
  for grp in grps.groups:
    g = grps.get_group(grp)
    # Pull the columns out once per essay instead of once per row.
    types = g["discourse_type"].values
    texts = g["discourse_text"].values
    segments = [
      f" [START_{d_type}]  " + d_text + f" [END_{d_type}] "
      for d_type, d_text in zip(types, texts)
    ]
    processed_texts.append(" ".join(types) + " [SEP] " + "".join(segments))
    if is_train:
      labels_grp.append(g["discourse_effectiveness"].values)
  if is_train:
    return processed_texts, labels_grp
  return processed_texts
# Training essays come back with their per-discourse labels; test essays as texts only.
processed_essays_train, processed_essays_train_labels = process_essays(grps_train, mode="train")
processed_essays_test = process_essays(grps_test, mode="test")

In [6]:
# Display the first processed training essay string.
processed_essays_train[0]

'Lead Position Claim Evidence Counterclaim Rebuttal Evidence Counterclaim Concluding Statement [SEP]  [START_Lead]  Hi, i\'m Isaac, i\'m going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn\'t know if the landform was created by life on Mars, or if it is just a natural landform.  [END_Lead]  [START_Position]  On my perspective, I think that the face is a natural landform because I dont think that there is any life on Mars. In these next few paragraphs, I\'ll be talking about how I think that is is a natural landform  [END_Position]  [START_Claim]  I think that the face is a natural landform because there is no life on Mars that we have descovered yet  [END_Claim]  [START_Evidence]  If life was on Mars, we would know by now. The reason why I think it is a natural landform because, nobody live on Mars in order to create the figure. It

In [7]:
# Display the effectiveness labels collected for the first training essay.
processed_essays_train_labels[0]

array(['Adequate', 'Adequate', 'Adequate', 'Adequate', 'Adequate',
       'Ineffective', 'Adequate', 'Adequate', 'Adequate'], dtype=object)

In [8]:
def encode(text):
    """Tokenize one processed essay into fixed-length model inputs.

    The text is truncated and padded to 2048 positions. The tokenizer is asked
    for a batch of PyTorch tensors; the single batch row is returned for each
    of `input_ids`, `token_type_ids` and `attention_mask`.
    """
    encodings = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=2048,
    )
    wanted_keys = ("input_ids", "token_type_ids", "attention_mask")
    # [0] drops the batch dimension the tokenizer adds for a single text.
    return {key: encodings[key][0] for key in wanted_keys}

# Encode every processed essay; each entry is the dict returned by encode().
x_train = list(map(encode, processed_essays_train))
x_test = list(map(encode, processed_essays_test))

print(x_train[0])

{'input_ids': tensor([    1,  8380, 18172,  ...,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0,  ..., 0, 0, 0]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


In [9]:
# Class name -> integer id; column i of the classifier output is class id i.
labels_dict = {"Adequate": 0, "Effective": 1, "Ineffective": 2}

# One list of class ids per essay, in the same order as x_train.
y_train = [
  [labels_dict[label] for label in essay_labels]
  for essay_labels in processed_essays_train_labels
]
x_train[2]

{'input_ids': tensor([    1,  8380, 18172,  ...,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0,  ..., 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}

In [10]:
# Display the integer-encoded labels of the third training essay.
y_train[2]

[0, 0, 0, 2, 0, 0, 0]

In [11]:
# Sanity check: every encoded essay must contain as many START markers as END
# markers. Any essay where the counts differ is printed token by token.
# Each marker family occupies one consecutive id range (see the printed
# start/end id lists), so a range test selects all markers of that family.
start_lo, start_hi = min(start_token_ids), max(start_token_ids)
end_lo, end_hi = min(end_token_ids), max(end_token_ids)

for sample in x_train:
  # Keep only the attended prefix, i.e. drop the trailing padding.
  n_tokens = int(torch.where(sample["attention_mask"] == 1)[0].max()) + 1
  token_ids = sample["input_ids"][:n_tokens]

  start_positions = torch.where((token_ids >= start_lo) & (token_ids <= start_hi))[0]
  end_positions = torch.where((token_ids >= end_lo) & (token_ids <= end_hi))[0]

  if start_positions.shape != end_positions.shape:
    print(tokenizer.batch_decode(token_ids))

In [12]:
class FeedbackDataset(Dataset):
    """Index-based dataset over encoded essays and, optionally, their labels.

    In "train" mode an item is `(grp_texts[idx], labels[idx])`; in any other
    mode it is `grp_texts[idx]` alone.
    """

    def __init__(self, grp_texts, labels=None, mode="train"):
        self.grp_texts, self.labels, self.mode = grp_texts, labels, mode

    def __len__(self):
        return len(self.grp_texts)

    def __getitem__(self, idx):
        sample = self.grp_texts[idx]
        if self.mode != "train":
            return sample
        return sample, self.labels[idx]

### Load Model

In [13]:
# Load the pretrained encoder named by `endpoint`; the Model class defined
# further down reads this module-level object as its backbone.
backbone = AutoModel.from_pretrained(endpoint)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Show the token ids assigned to the START and END markers, one list per line.
print(start_token_ids, end_token_ids, sep="\n")

class Pooling(nn.Module):
  """Pool encoder states into one feature vector per discourse element.

  For every START marker in a sample, the vector is the concatenation of
  the hidden state at the START marker, the mean hidden state of the tokens
  strictly between START and the END marker at the same rank, and the hidden
  state at that END marker (`feat_mult` = 3 pieces).

  Markers are found by id range: any id between the smallest and largest
  START id counts as a START marker, and likewise for END.
  """

  def __init__(self, start_token_ids, end_token_ids):
    super().__init__()
    self.feat_mult = 3
    self.start_token_ids = start_token_ids
    self.end_token_ids = end_token_ids

  def forward(self, x, input, am):
    """Return a list with one `(n_markers, 3 * hidden)` tensor per sample.

    Args:
      x: hidden states, indexed as `x[sample, position]` with the feature
        dimension last.
      input: token ids, indexed as `input[sample]`, aligned with `x`.
      am: attention mask; accepted for interface compatibility, not used.

    Raises:
      IndexError: if a sample has fewer END markers than START markers.
    """
    # Loop-invariant id bounds, computed once per call.
    start_lo, start_hi = min(self.start_token_ids), max(self.start_token_ids)
    end_lo, end_hi = min(self.end_token_ids), max(self.end_token_ids)
    hidden = x.shape[-1]

    out = []
    for j in range(x.shape[0]):
      ids = input[j]
      idx0 = torch.where((ids >= start_lo) & (ids <= start_hi))[0]
      idx1 = torch.where((ids >= end_lo) & (ids <= end_hi))[0]

      rows = []
      for jj in range(len(idx0)):
        start, end = int(idx0[jj]), int(idx1[jj])
        span = x[j, start + 1: end]
        if span.shape[0] > 0:
          span_mean = span.mean(dim=0)
        else:
          # The mean of an empty span is NaN and would poison the loss;
          # use a zero vector when nothing lies between the two markers.
          span_mean = x.new_zeros(hidden)
        rows.append(torch.cat([x[j, start], span_mean, x[j, end]]).unsqueeze(0))

      if rows:
        out.append(torch.cat(rows))
      else:
        # No markers in this sample: contribute zero rows instead of letting
        # torch.cat fail on an empty list.
        out.append(x.new_zeros((0, self.feat_mult * hidden)))
    return out

[128001, 128002, 128003, 128004, 128005, 128006, 128007]
[128008, 128009, 128010, 128011, 128012, 128013, 128014]


In [15]:
class Model(nn.Module):
  """Encoder backbone + marker pooling + linear classifier.

  The module-level `backbone` is used as the encoder (its embedding matrix is
  resized to `vocab_size`), `Pooling` turns the encoder output into one
  feature vector per discourse element, and a single linear layer maps each
  vector to `num_classes` logits.
  """

  def __init__(self, vocab_size, num_classes):
    super().__init__()
    self.backbone = backbone
    self.backbone.resize_token_embeddings(vocab_size)
    self.pooler = Pooling(start_token_ids, end_token_ids)
    # Derive the classifier width from the backbone and the pooler instead of
    # hard-coding 768*3, so a different checkpoint size keeps working.
    in_features = self.backbone.config.hidden_size * self.pooler.feat_mult
    self.classifier = nn.Sequential(
      nn.Linear(in_features, num_classes, bias=True)
    )

  def forward(self, x):
    """Return logits with one row per discourse element found in the batch.

    Args:
      x: dict with `input_ids`, `token_type_ids` and `attention_mask`, each
        indexed as `[sample, position]`.
    """
    # Run on whatever device the module lives on rather than assuming "cuda".
    device = next(self.parameters()).device

    # Trim the padded batch to a little past the last attended position.
    # NOTE(review): the extra +3 keeps a few padding positions; the reason is
    # not visible in this notebook.
    idx = int(torch.where(x["attention_mask"] == 1)[1].max())
    idx += 3
    attention_mask = x["attention_mask"][:, :idx]
    input_ids = x["input_ids"][:, :idx]
    token_type_ids = x["token_type_ids"][:, :idx]

    x = self.backbone(
      attention_mask=attention_mask.to(device),
      input_ids=input_ids.to(device),
      token_type_ids=token_type_ids.to(device),
    )
    x = x.last_hidden_state
    # The pooler returns one tensor per sample; stack them into a single
    # (total discourse elements, features) matrix for the classifier.
    x = self.pooler(x, input_ids, attention_mask)
    x = torch.cat(x)
    return self.classifier(x)

### Training

In [16]:
from collections import defaultdict
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

bs, lr = 1, 3e-6
num_epochs = 5
seed = 42
separator = "------------------------------------------------------"

# Seed before the classifier head is created and before the split/shuffle so
# that a Restart & Run All draws the same subset and the same initial weights.
torch.manual_seed(seed)

model = Model(vocab_size, 3)

# The Python lists go to train_test_split directly: the per-essay label lists
# have different lengths, and np.array() on such ragged input raises on recent
# NumPy versions. The split gets its own names so that re-running this cell
# does not shrink x_train / y_train a second time.
# NOTE(review): only 10% of the essays are used for fitting and 2% for
# evaluation; confirm this subsampling is intended.
x_fit, x_eval, y_fit, y_eval = train_test_split(
  x_train, y_train, train_size=0.1, test_size=0.02, random_state=seed
)

train_ds = FeedbackDataset(x_fit, y_fit)
eval_ds = FeedbackDataset(x_eval, y_eval)
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=bs)
eval_dataloader = DataLoader(eval_ds, batch_size=bs)
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
  name="cosine", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
model.to("cuda")
loss_fn = nn.CrossEntropyLoss()


def _run_epoch(dataloader, training):
  """Run one pass over `dataloader`; return (mean loss, mean accuracy) as floats.

  With `training=True` the model is put in train mode and every batch is
  followed by an optimizer and scheduler step. Otherwise the model is put in
  eval mode and the pass runs under `torch.inference_mode()`.
  """
  total_loss, total_accuracy, n_batches = 0.0, 0.0, 0
  model.train(training)
  for x, y in dataloader:
    n_batches += 1
    with torch.inference_mode(mode=not training):
      y = torch.Tensor(y).type(torch.LongTensor).to("cuda")
      outputs = model(x)
      loss = loss_fn(outputs, y)
    # argmax over the logits equals argmax over their softmax.
    predictions = outputs.detach().argmax(dim=-1)
    total_accuracy += accuracy_score(y.to("cpu").numpy(), predictions.to("cpu").numpy())
    # .item() stores a plain float, so the running sum does not keep any
    # autograd history alive across batches.
    total_loss += loss.item()
    if training:
      loss.backward()
      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)
  return total_loss / n_batches, total_accuracy / n_batches


print("Training:")
print(separator)
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
  final_loss_train, final_accuracy_train = _run_epoch(train_dataloader, training=True)
  print(f"Training Done for epoch {epoch + 1}")
  print(separator)

  print("Evaluating:")
  print(separator)
  final_loss_eval, final_accuracy_eval = _run_epoch(eval_dataloader, training=False)
  print(f"Loss at epoch on Training Data {epoch + 1} is {final_loss_train}")
  print(f"Accuracy at epoch {epoch + 1} on Training Data is {final_accuracy_train}")
  print(f"Loss at epoch on Test Data {epoch + 1} is {final_loss_eval}")
  print(f"Accuracy at epoch {epoch + 1} on Test Data is {final_accuracy_eval}")
  print(f"\n{separator}\n    \n")

Training:
------------------------------------------------------


  0%|          | 0/2095 [00:00<?, ?it/s]

Training Done for epoch 1
------------------------------------------------------
Evaluating:
------------------------------------------------------
Loss at epoch on Training Data 1 is 0.9142217636108398
Accuracy at epoch 1 on Training Data is 0.5735354270251406
Loss at epoch on Test Data 1 is 0.811692476272583
Accuracy at epoch 1 on Test Data is 0.6271074494288778

------------------------------------------------------
    

Training Done for epoch 2
------------------------------------------------------
Evaluating:
------------------------------------------------------
Loss at epoch on Training Data 2 is 0.7784014940261841
Accuracy at epoch 2 on Training Data is 0.6574302256199112
Loss at epoch on Test Data 2 is 0.7840568423271179
Accuracy at epoch 2 on Test Data is 0.6438611322539893

------------------------------------------------------
    

Training Done for epoch 3
------------------------------------------------------
Evaluating:
------------------------------------------------

### Testing and Saving Model

In [17]:
test_ds = FeedbackDataset(x_test, mode="test")
test_dataloader = DataLoader(test_ds, batch_size=1)

# Collect the probabilities of every batch. Keeping only the loop variable
# would retain the last essay alone once the test set has more than one essay.
model.eval()
batch_probabilities = []
with torch.inference_mode():
  for batch in test_dataloader:
    logits = model(batch)
    batch_probabilities.append(F.softmax(logits, dim=-1).to("cpu").detach().numpy())

# One row per discourse element, essays in the order of x_test.
probability = np.concatenate(batch_probabilities)
print(probability)

[[0.37845686 0.5685745  0.05296867]
 [0.5242642  0.46183833 0.01389746]
 [0.48563904 0.47762066 0.03674033]
 [0.5074369  0.45732632 0.03523682]
 [0.36128938 0.61270213 0.02600852]
 [0.22346056 0.7541404  0.02239905]
 [0.21853067 0.7611354  0.0203339 ]
 [0.26685083 0.7167637  0.01638545]
 [0.2215631  0.7575984  0.02083853]
 [0.27375698 0.69836277 0.02788026]]


In [18]:
# Echo the dataset root path (this variable shadows Python's built-in `dir`).
dir

'../input/feedback-prize-effectiveness'

In [19]:
# Alias for the predicted class probabilities; the submission cell reads this name.
output_numpy=probability

In [20]:
sub = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")

# Column i of `output_numpy` is the class whose id is i in labels_dict.
class_names = sorted(labels_dict, key=labels_dict.get)

# Predictions were produced essay by essay, in the same group traversal that
# process_essays uses, so rebuild the matching discourse_id for every row.
pred_ids = np.concatenate(
  [grps_test.get_group(essay)["discourse_id"].values for essay in grps_test.groups]
)
preds = pd.DataFrame(output_numpy, index=pred_ids, columns=class_names)

# Align on discourse_id instead of relying on the row order of the sample file.
preds = preds.reindex(sub["discourse_id"])
assert not preds.isna().any().any(), "sample_submission has discourse_ids without a prediction"

sub['Ineffective'] = preds["Ineffective"].values
sub['Adequate'] = preds["Adequate"].values
sub['Effective'] = preds["Effective"].values
sub

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.052969,0.378457,0.568574
1,5a88900e7dc1,0.013897,0.524264,0.461838
2,9790d835736b,0.03674,0.485639,0.477621
3,75ce6d68b67b,0.035237,0.507437,0.457326
4,93578d946723,0.026009,0.361289,0.612702
5,2e214524dbe3,0.022399,0.223461,0.75414
6,84812fc2ab9f,0.020334,0.218531,0.761135
7,c668ff840720,0.016385,0.266851,0.716764
8,739a6d00f44a,0.020839,0.221563,0.757598
9,bcfae2c9a244,0.02788,0.273757,0.698363


In [21]:
# tokenizer.save_pretrained("/content/drive/MyDrive/Kaggle Training Results/Predicting Effective Arguments/result/debertav3base/tokenizer")
# torch.save(model, "/content/drive/MyDrive/Kaggle Training Results/Predicting Effective Arguments/result/debertav3base/model/model.pth")