In [1]:
import os
import re
from collections import defaultdict

import bs4
import matplotlib.pyplot as pl
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report

# Directory that holds the dataset JSON files read in the next cell.
WORKING_DIR='.'
# Device string used later when moving the model and batch tensors.
# NOTE(review): assigned again, to the same value, in the training-setup cell.
DEVICE = 'cuda'

In [2]:
# Load the two labelled comment corpora. With orient='index' every top-level JSON key
# becomes a row label of the resulting DataFrame.
# Later cells read the columns 'label', 'text', 'problem', 'id' and 'father_id'.
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient = 'index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient = 'index')

In [3]:
# Matches an anchor tag whose text between "<a href" and "</a>" contains "/contest/"
# followed later by "/submission/". The quantifiers are greedy and "." does not cross
# newlines, exactly as in the previous pattern; only the (invalid in a normal string)
# "\/" escapes were dropped by switching to a raw string.
regex_link_ful = re.compile(r'<a href.*/contest/.*/submission/.*</a>')
# Matches one <code>...</code> element, possibly spanning several lines, non-greedily.
# ".*?" with re.DOTALL accepts the same strings as the former "(\s|.)*?" but without the
# ambiguous alternation that could backtrack exponentially on an unclosed <code> tag.
code_regex = re.compile(r'<code>.*?</code>', re.DOTALL)
def preprocess_for_transfomers(texts, problems):
  """Turn raw HTML comments into plain text suitable for a tokenizer.

  For every (text, problem) pair the function
    1. replaces each <code>...</code> element with the placeholder ' (code) ',
    2. replaces each submission link with ' (link to problem <problem>) ',
    3. strips the remaining markup and keeps only the text content.

  Args:
    texts: iterable of HTML strings.
    problems: iterable, aligned with `texts`, whose items are interpolated
      into the link placeholder.

  Returns:
    A list of plain-text strings, one per input pair (pairs are formed with
    `zip`, so the shorter iterable decides the length).
  """
  preprocessed_texts = []
  for t, p in zip(texts, problems):
    t_codes = code_regex.sub(' (code) ', t)
    link_placeholder = f' (link to problem {p}) '
    # A callable replacement is inserted literally, so a backslash or "\1"-like
    # sequence inside `p` cannot be misread as a regex template escape.
    t_link = regex_link_ful.sub(lambda _match: link_placeholder, t_codes)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
    # happens to be installed and emits a warning about the guess.
    # NOTE(review): if the cached embeddings were produced with another parser
    # (e.g. lxml), confirm the extracted text is unchanged.
    bs = bs4.BeautifulSoup(t_link, 'html.parser')
    preprocessed_texts.append(bs.text)

  return preprocessed_texts

# Apply the same two in-place steps to both corpora, educational set first:
#   * collapse every label other than 'Irrelevant' into the single class 'Relevant';
#   * attach the cleaned text as a new 'preprocessed_text' column.
for comments_df in (dataset_educ, dataset_div):
  is_not_irrelevant = comments_df['label'] != 'Irrelevant'
  comments_df.loc[is_not_irrelevant, 'label'] = 'Relevant'
  preprocessed_text = preprocess_for_transfomers(comments_df['text'], comments_df['problem'])
  comments_df["preprocessed_text"] = preprocessed_text

In [4]:
TREE_FATHER_PATH_LENGTH = 3

def compute_father_indices_pd(df, path_length=None):
  """Map every row of `df` to the index labels of its nearest ancestors.

  The frame must expose the columns `id` and `father_id`. For each row the
  function walks up the `father_id` chain for `path_length` steps and records
  the DataFrame index label of every ancestor found. A step whose father id is
  not the `id` of any row yields -1, and so do all following steps.

  Args:
    df: DataFrame with `id` and `father_id` columns.
    path_length: number of ancestor slots per row. Defaults to the module-level
      TREE_FATHER_PATH_LENGTH, read at call time.

  Returns:
    dict mapping each index label of `df` to a list of `path_length` entries
    ordered from the most distant ancestor to the direct father; missing
    ancestors are -1 and therefore sit at the front of the list.
  """
  if path_length is None:
    path_length = TREE_FATHER_PATH_LENGTH

  # One pass builds both lookups, replacing the per-step boolean-mask scan of the
  # whole frame (quadratic in the number of comments) with dictionary accesses.
  father_tree = {}      # comment id -> father id (last row wins on duplicate ids)
  index_of_id = {}      # comment id -> index label (first row wins, like `.index.values[0]`)
  for index_label, comment_id, father_id in zip(df.index.values, df["id"], df["father_id"]):
    father_tree[comment_id] = father_id
    index_of_id.setdefault(comment_id, index_label)

  comment_father_indices = {}
  for df_index, comment_id in zip(df.index, df["id"]):
    indices = []
    last_ind = comment_id
    for _ in range(path_length):
      father = father_tree[last_ind]
      if father not in father_tree:
        # Reached the top of the known tree: pad this slot. `last_ind` is left
        # untouched, so every remaining slot is padded as well.
        indices.append(-1)
        continue
      # The -1 guards only matter if some row literally has id -1; they are kept
      # so that such a frame is handled exactly as before.
      if last_ind != -1:
        last_ind = father
      indices.append(index_of_id[last_ind] if last_ind != -1 else -1)
    # Collected father-first; callers expect the most distant ancestor first.
    indices.reverse()
    comment_father_indices[df_index] = indices
  return comment_father_indices

# Ancestor lookup tables (index label -> list of ancestor index labels, -1 where missing),
# one per corpus; consumed later by BlogCommentDataset.
father_indices_educ = compute_father_indices_pd(dataset_educ)
father_indices_div = compute_father_indices_pd(dataset_div)

In [5]:
from embeddings_generation import TokenizedDataset, LayerEMBTokenEmbeddingGeneration
from embeddings_generation.utils import *

huggingface_model_name = "roberta-base"
# Keep only the part after the last "/" of the model name and add a per-corpus suffix;
# the resulting alias is the key handed to the persistence helpers.
_model_short_name = huggingface_model_name.split("/")[-1]
huggingface_model_name_alias = _model_short_name + '_educ'
huggingface_model_name_alias_div = _model_short_name +'_div'

# For each corpus: generate-and-persist the embeddings only when the helper reports
# that none are stored under the alias yet, then load them back.
# (The storage format and location are decided by the embeddings_generation helpers.)
_embeddings_by_alias = {}
for _alias, _frame in (
    (huggingface_model_name_alias, dataset_educ),
    (huggingface_model_name_alias_div, dataset_div),
):
    if not embedding_already_persisted(_alias):
        persist_embeddings(_frame["preprocessed_text"], huggingface_model_name, _alias, _frame.index)
    _embeddings_by_alias[_alias] = load_embeddings(_alias)

embeddings_educ = _embeddings_by_alias[huggingface_model_name_alias]
embeddings_div = _embeddings_by_alias[huggingface_model_name_alias_div]

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Per-comment tokenization statistics for the educational corpus, computed by the project
# helper determine_tokens_statistics (its column semantics are defined outside this notebook).
statistics = determine_tokens_statistics(dataset_educ["preprocessed_text"], huggingface_model_name)

# Row-wise ratio of words split at least twice to whole words.
# NOTE(review): in the recorded output nrTokens equals nrWholeWords and every split counter is 0
# for all rows, which looks like the helper is not detecting sub-word splits for this
# tokenizer - confirm before relying on these numbers.
statistics["ratioNotWholeWords"] = statistics["nrWordsSplitAtleastTwice"] / statistics["nrWholeWords"]

# Summary table (count/mean/std/quantiles) shown as the cell output.
statistics.describe()

Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,nrTokens,nrWholeWords,maxWordSplit,nrWordsSplitAtleastTwice,ratioNotWholeWords
count,937.0,937.0,937.0,937.0,937.0
mean,69.649947,69.649947,0.0,0.0,0.0
std,116.79456,116.79456,0.0,0.0,0.0
min,3.0,3.0,0.0,0.0,0.0
25%,17.0,17.0,0.0,0.0,0.0
50%,32.0,32.0,0.0,0.0,0.0
75%,71.0,71.0,0.0,0.0,0.0
max,1216.0,1216.0,0.0,0.0,0.0


In [7]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
RANDOM_SEED = 443
# Use the GPU when one is visible, otherwise fall back to the CPU so the notebook
# still runs end to end on a machine without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Number of groups assigned to the training and validation splits; every remaining
# group goes to the test split.
N_TRAIN_GROUPS = 10
N_VALIDATION_GROUPS = 3

# Seed every random number generator used below so the split and training are repeatable.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Rows are grouped by the part of their index label that precedes the first "?",
# and whole groups (never single rows) are assigned to a split.
# NOTE(review): presumably that prefix identifies the blog a comment belongs to - confirm.
groups = list(dataset_educ.groupby(lambda k : k.split("?")[0]))
random.shuffle(groups)
assert len(groups) > N_TRAIN_GROUPS + N_VALIDATION_GROUPS, (
    f"need more than {N_TRAIN_GROUPS + N_VALIDATION_GROUPS} groups to get a non-empty test split, "
    f"found {len(groups)}"
)

train_groups = groups[:N_TRAIN_GROUPS]
validation_groups = groups[N_TRAIN_GROUPS:N_TRAIN_GROUPS + N_VALIDATION_GROUPS]
test_groups = groups[N_TRAIN_GROUPS + N_VALIDATION_GROUPS:]


def _rows_of_groups(frame, selected_groups):
    """Return the rows of `frame` that belong to the given (key, sub-frame) groups, in group order."""
    row_labels = [idx for _, g in selected_groups for idx in g.index.tolist()]
    return frame.loc[row_labels]


train_educ_dataset = _rows_of_groups(dataset_educ, train_groups)
val_educ_dataset = _rows_of_groups(dataset_educ, validation_groups)
test_educ_dataset = _rows_of_groups(dataset_educ, test_groups)

from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training labels only; the other splits (and the div
# corpus) are transformed with it, which raises if they contain an unseen label.
labelEncoder = LabelEncoder()
train_encoded_labels = labelEncoder.fit_transform(train_educ_dataset["label"])
val_encoded_labels = labelEncoder.transform(val_educ_dataset["label"])
test_encoded_labels = labelEncoder.transform(test_educ_dataset["label"])


div_encoded_labels = labelEncoder.transform(dataset_div["label"])


In [8]:
class BlogCommentDataset(Dataset):
    """Torch dataset that serves one comment together with its ancestor comments.

    Each item is the tuple
        (comment vector, stacked ancestor vectors, ancestor mask, label)
    with every tensor moved to the global DEVICE.
    """

    def __init__(self, dataset:pd.DataFrame, embeddings:dict, father_indices:dict, num_last_layers_embeddings_agg ,labels):
        """Store the inputs; nothing is computed until an item is requested.

        Args:
            dataset: frame whose index labels identify the comments of this split.
            embeddings: lookup from index label to the stored embedding of a comment.
            father_indices: lookup from index label to its ancestor index labels (-1 = missing).
            num_last_layers_embeddings_agg: how many trailing entries of the embedding's
                second axis are averaged (presumably the last hidden layers - confirm).
            labels: encoded labels, addressed by position (same order as `dataset`).
        """
        self.dataset = dataset
        self.embeddings = embeddings
        self.father_indices = father_indices
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg
        self.labels = labels

    def __len__(self):
        """Number of comments in this split."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Build the tensors for the comment at position `index`."""
        row_label = self.dataset.index[index]
        own_vector = self._agg_emb(self.embeddings[row_label])

        ancestor_vectors = []
        missing_flags = []
        for ancestor_label in self.father_indices[row_label]:
            if ancestor_label == -1:
                # Missing ancestor: zero vector of the comment's size, flagged with 1.
                ancestor_vectors.append(torch.zeros(own_vector.shape[0]))
                missing_flags.append(1)
                continue
            ancestor_vectors.append(self._agg_emb(self.embeddings[ancestor_label]))
            missing_flags.append(0)

        stacked_ancestors = torch.stack(ancestor_vectors)
        mask_tensor = torch.tensor(missing_flags, dtype = torch.float32)
        label_tensor = torch.tensor(self.labels[index], dtype = torch.long)
        return (
            own_vector.to(DEVICE),
            stacked_ancestors.to(DEVICE),
            mask_tensor.to(DEVICE),
            label_tensor.to(DEVICE),
        )

    def _agg_emb(self, embedding):
        """Average the trailing slices of a stored embedding into one float32 vector.

        The embedding is converted to a float32 array, element 0 of its first axis is
        taken, and the last `num_last_layers_embeddings_agg` entries of the second axis
        are averaged; the result is returned as a torch tensor.
        """
        as_array = np.array(embedding, dtype=np.float32)
        trailing = as_array[0, -self.num_last_layers_embeddings_agg:, :]
        return torch.from_numpy(trailing.mean(0))

# Settings shared by every split: how many trailing embedding slices are averaged
# per comment, and how many comments go into one batch.
_NUM_LAST_LAYERS_AGG = 3
_BATCH_SIZE = 2

# Only the training loader shuffles; evaluation loaders keep the frame order so that
# predictions line up with the label columns used in the reports.
train_torch_dataset = BlogCommentDataset(
    train_educ_dataset, embeddings_educ, father_indices_educ, _NUM_LAST_LAYERS_AGG, train_encoded_labels
)
train_torch_dataloader = DataLoader(train_torch_dataset, batch_size=_BATCH_SIZE, shuffle=True)

val_torch_dataset = BlogCommentDataset(
    val_educ_dataset, embeddings_educ, father_indices_educ, _NUM_LAST_LAYERS_AGG, val_encoded_labels
)
val_torch_dataloader = DataLoader(val_torch_dataset, batch_size=_BATCH_SIZE, shuffle=False)

test_torch_dataset = BlogCommentDataset(
    test_educ_dataset, embeddings_educ, father_indices_educ, _NUM_LAST_LAYERS_AGG, test_encoded_labels
)
test_torch_dataloader = DataLoader(test_torch_dataset, batch_size=_BATCH_SIZE, shuffle=False)

div_dataset_torch = BlogCommentDataset(
    dataset_div, embeddings_div, father_indices_div, _NUM_LAST_LAYERS_AGG, div_encoded_labels
)
div_dataloader_torch = DataLoader(div_dataset_torch, batch_size=_BATCH_SIZE, shuffle=False)

In [9]:
from tqdm import tqdm
import math
class CommentClassificationModel(torch.nn.Module):
    """Two-layer classifier over a single comment vector.

    Pipeline: Linear(embedding_dim -> hidden_dim) -> ReLU -> Dropout -> Linear(hidden_dim -> nrLabels).
    The returned values are raw logits (no softmax is applied).

    Args:
        nrLabels: number of output classes.
        embedding_dim: size of the input comment vector (default 768, the former fixed value).
        hidden_dim: size of the hidden projection (default 128, the former fixed value).
        dropout: dropout probability applied after the ReLU (default 0.7, the former fixed value).
    """

    def __init__(self, nrLabels, embedding_dim=768, hidden_dim=128, dropout=0.7):
        super(CommentClassificationModel, self).__init__()

        # Attribute names and creation order are unchanged so that seeded initialisation
        # and previously saved checkpoints of this class stay compatible.
        self.comment_proj = torch.nn.Linear(embedding_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.output = torch.nn.Linear(hidden_dim, nrLabels)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, fathers_x=None, mask=None):
        """Return class logits for a batch of comment vectors.

        Args:
            x: tensor whose last dimension is `embedding_dim`.
            fathers_x: ancestor vectors; accepted for interface compatibility but
                currently not used by the model.
            mask: ancestor mask; accepted for interface compatibility but
                currently not used by the model.

        Returns:
            Tensor of logits whose last dimension is `nrLabels`.
        """
        com_proj = self.dropout(self.relu(self.comment_proj(x)))

        return self.output(com_proj)

comment_classification_Model = CommentClassificationModel(len(labelEncoder.classes_))
comment_classification_Model.to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(comment_classification_Model.parameters())
# The scheduler is stepped once per batch: the first cosine cycle spans five epochs'
# worth of batches and every following cycle is twice as long as the previous one.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,len(train_torch_dataloader) * 5, 2)

nr_epochs = 50
current_step = 0
best_model_loss = 1e9

# Create the checkpoint directory up front; torch.save does not create missing folders.
best_model_path = f'best_models/{huggingface_model_name_alias}.pkl'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(nr_epochs):
  # ---- training pass ----
  pbar_training = tqdm(train_torch_dataloader)
  training_average_loss = 0
  training_nr_batches = 0
  comment_classification_Model.train()

  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_training:
    optimizer.zero_grad()
    yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
    loss = criterion(yhat, labels_batch)
    loss.backward()
    optimizer.step()
    scheduler.step()

    # .item() yields a plain Python float, which is all the bookkeeping needs.
    batch_loss = loss.item()
    pbar_training.set_postfix({'loss': batch_loss})

    training_average_loss += batch_loss
    training_nr_batches+=1
    current_step+=1

  # ---- validation pass (no gradients, dropout disabled) ----
  pbar_validation = tqdm(val_torch_dataloader)

  validation_average_loss = 0
  validation_nr_batches = 0
  comment_classification_Model.eval()
  with torch.no_grad():
    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
      yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
      batch_loss = criterion(yhat, labels_batch).item()

      validation_average_loss += batch_loss
      validation_nr_batches+=1

      pbar_validation.set_postfix({'loss': batch_loss})

  # Losses are averaged per batch (a smaller final batch weighs as much as a full one).
  epoch_training_loss = training_average_loss / training_nr_batches
  epoch_validation_loss = validation_average_loss / validation_nr_batches
  print(f'Epoch {epoch + 1} has training loss: {epoch_training_loss}')
  print(f'Epoch {epoch + 1} has validation loss: {epoch_validation_loss}')

  if(epoch_validation_loss < best_model_loss):
     best_model_loss = epoch_validation_loss
     # Report the one-based epoch number, matching the two messages above.
     print(f'Best loss at epoch {epoch + 1}')
     # NOTE(review): this pickles the whole module object (not just a state_dict), so
     # loading requires the same class definition; kept because the evaluation cell
     # loads the checkpoint in this format.
     torch.save(comment_classification_Model, best_model_path)


100%|██████████| 296/296 [00:02<00:00, 123.26it/s, loss=0.5693228] 
100%|██████████| 93/93 [00:00<00:00, 231.36it/s, loss=0.6702366] 


Epoch 1 has training loss: 0.6850910549228256
Epoch 1 has validation loss: 0.6168536851483006
Best loss at epoch 0


100%|██████████| 296/296 [00:01<00:00, 176.40it/s, loss=0.50567013]
100%|██████████| 93/93 [00:00<00:00, 216.29it/s, loss=0.76873696]


Epoch 2 has training loss: 0.6569605767726898
Epoch 2 has validation loss: 0.6066223959768972
Best loss at epoch 1


100%|██████████| 296/296 [00:01<00:00, 173.81it/s, loss=0.08672331]
100%|██████████| 93/93 [00:00<00:00, 219.85it/s, loss=0.6873891] 


Epoch 3 has training loss: 0.6237334448944878
Epoch 3 has validation loss: 0.5699333945910136
Best loss at epoch 2


100%|██████████| 296/296 [00:01<00:00, 177.14it/s, loss=0.70523226]
100%|██████████| 93/93 [00:00<00:00, 225.17it/s, loss=0.84986323]


Epoch 4 has training loss: 0.6170048918071631
Epoch 4 has validation loss: 0.5727979039312691


100%|██████████| 296/296 [00:01<00:00, 157.70it/s, loss=0.77008206]
100%|██████████| 93/93 [00:00<00:00, 218.32it/s, loss=0.81181294]


Epoch 5 has training loss: 0.5894430906486672
Epoch 5 has validation loss: 0.5658012344952552
Best loss at epoch 4


100%|██████████| 296/296 [00:01<00:00, 153.51it/s, loss=0.039275695]
100%|██████████| 93/93 [00:00<00:00, 224.23it/s, loss=0.48802072]


Epoch 6 has training loss: 0.5991293467908494
Epoch 6 has validation loss: 0.5517493826086803
Best loss at epoch 5


100%|██████████| 296/296 [00:02<00:00, 145.67it/s, loss=0.192091]  
100%|██████████| 93/93 [00:00<00:00, 226.72it/s, loss=0.98267597]


Epoch 7 has training loss: 0.57613610521563
Epoch 7 has validation loss: 0.5432317833746633
Best loss at epoch 6


100%|██████████| 296/296 [00:02<00:00, 128.81it/s, loss=0.6286085] 
100%|██████████| 93/93 [00:00<00:00, 220.90it/s, loss=0.65987384]


Epoch 8 has training loss: 0.5643844814165622
Epoch 8 has validation loss: 0.5009627866969314
Best loss at epoch 7


100%|██████████| 296/296 [00:02<00:00, 132.95it/s, loss=0.31928378] 
100%|██████████| 93/93 [00:00<00:00, 226.83it/s, loss=1.0181398] 


Epoch 9 has training loss: 0.5379147915219938
Epoch 9 has validation loss: 0.5202202530157182


100%|██████████| 296/296 [00:02<00:00, 131.39it/s, loss=0.48308313] 
100%|██████████| 93/93 [00:00<00:00, 208.99it/s, loss=0.9241088] 


Epoch 10 has training loss: 0.540616458678316
Epoch 10 has validation loss: 0.5158832129291309


100%|██████████| 296/296 [00:02<00:00, 132.60it/s, loss=0.5330649]  
100%|██████████| 93/93 [00:00<00:00, 214.73it/s, loss=0.76973057]


Epoch 11 has training loss: 0.5274175206023092
Epoch 11 has validation loss: 0.48962536634456727
Best loss at epoch 10


100%|██████████| 296/296 [00:02<00:00, 130.95it/s, loss=0.58160925] 
100%|██████████| 93/93 [00:00<00:00, 201.29it/s, loss=0.8810196]  


Epoch 12 has training loss: 0.5135347782004926
Epoch 12 has validation loss: 0.4924563316087569


100%|██████████| 296/296 [00:02<00:00, 130.64it/s, loss=0.13766693] 
100%|██████████| 93/93 [00:00<00:00, 212.81it/s, loss=0.7777808] 


Epoch 13 has training loss: 0.5003091405661827
Epoch 13 has validation loss: 0.4789970223140973
Best loss at epoch 12


100%|██████████| 296/296 [00:02<00:00, 132.38it/s, loss=0.3573865]  
100%|██████████| 93/93 [00:00<00:00, 223.56it/s, loss=0.8187455] 


Epoch 14 has training loss: 0.5017525493167341
Epoch 14 has validation loss: 0.48033015365882586


100%|██████████| 296/296 [00:02<00:00, 129.26it/s, loss=1.684917]   
100%|██████████| 93/93 [00:00<00:00, 210.89it/s, loss=0.83230954]


Epoch 15 has training loss: 0.4922415228132662
Epoch 15 has validation loss: 0.4809256631840942


100%|██████████| 296/296 [00:02<00:00, 132.80it/s, loss=0.41943502] 
100%|██████████| 93/93 [00:00<00:00, 223.56it/s, loss=1.0303711] 


Epoch 16 has training loss: 0.5436070027345842
Epoch 16 has validation loss: 0.5036383951783822


100%|██████████| 296/296 [00:02<00:00, 133.10it/s, loss=1.0546988]  
100%|██████████| 93/93 [00:00<00:00, 222.27it/s, loss=0.62674534]


Epoch 17 has training loss: 0.5420990324912693
Epoch 17 has validation loss: 0.4711533056632165
Best loss at epoch 16


100%|██████████| 296/296 [00:02<00:00, 130.52it/s, loss=1.4251038]  
100%|██████████| 93/93 [00:00<00:00, 212.81it/s, loss=1.1413641] 


Epoch 18 has training loss: 0.5319904081674444
Epoch 18 has validation loss: 0.5204078120890484


100%|██████████| 296/296 [00:02<00:00, 131.38it/s, loss=0.16752963] 
100%|██████████| 93/93 [00:00<00:00, 211.36it/s, loss=0.5927351] 


Epoch 19 has training loss: 0.5166863098086135
Epoch 19 has validation loss: 0.4596117166101292
Best loss at epoch 18


100%|██████████| 296/296 [00:02<00:00, 131.76it/s, loss=0.028208137]
100%|██████████| 93/93 [00:00<00:00, 204.85it/s, loss=0.753239]  


Epoch 20 has training loss: 0.5141355811140021
Epoch 20 has validation loss: 0.46634337738637


100%|██████████| 296/296 [00:02<00:00, 132.22it/s, loss=0.5655774]  
100%|██████████| 93/93 [00:00<00:00, 217.30it/s, loss=0.7143229]  


Epoch 21 has training loss: 0.48760140229431864
Epoch 21 has validation loss: 0.45400203780461384
Best loss at epoch 20


100%|██████████| 296/296 [00:02<00:00, 132.19it/s, loss=0.14989112] 
100%|██████████| 93/93 [00:00<00:00, 222.07it/s, loss=0.9758048]  


Epoch 22 has training loss: 0.5007303825491486
Epoch 22 has validation loss: 0.48582186037173836


100%|██████████| 296/296 [00:02<00:00, 130.34it/s, loss=0.057589807]
100%|██████████| 93/93 [00:00<00:00, 212.34it/s, loss=0.9303167] 


Epoch 23 has training loss: 0.48997380322701223
Epoch 23 has validation loss: 0.4754762585445117


100%|██████████| 296/296 [00:02<00:00, 132.52it/s, loss=0.10015665] 
100%|██████████| 93/93 [00:00<00:00, 220.37it/s, loss=0.79916656] 


Epoch 24 has training loss: 0.4787224984752971
Epoch 24 has validation loss: 0.4635773816256113


100%|██████████| 296/296 [00:02<00:00, 133.33it/s, loss=0.54429996] 
100%|██████████| 93/93 [00:00<00:00, 209.51it/s, loss=0.8037132] 


Epoch 25 has training loss: 0.4875954196259782
Epoch 25 has validation loss: 0.45309158849219483
Best loss at epoch 24


100%|██████████| 296/296 [00:02<00:00, 129.55it/s, loss=0.38837114] 
100%|██████████| 93/93 [00:00<00:00, 212.94it/s, loss=0.6637124] 


Epoch 26 has training loss: 0.4805839199295922
Epoch 26 has validation loss: 0.44975712410705065
Best loss at epoch 25


100%|██████████| 296/296 [00:02<00:00, 133.03it/s, loss=0.42076614] 
100%|██████████| 93/93 [00:00<00:00, 212.33it/s, loss=0.90641767] 


Epoch 27 has training loss: 0.46050400004369785
Epoch 27 has validation loss: 0.4626642769823472


100%|██████████| 296/296 [00:02<00:00, 132.64it/s, loss=0.44410533] 
100%|██████████| 93/93 [00:00<00:00, 213.30it/s, loss=0.8107714]  


Epoch 28 has training loss: 0.4623805159841337
Epoch 28 has validation loss: 0.4485049367912354
Best loss at epoch 27


100%|██████████| 296/296 [00:02<00:00, 131.02it/s, loss=0.52973664] 
100%|██████████| 93/93 [00:00<00:00, 223.02it/s, loss=0.71890485] 


Epoch 29 has training loss: 0.4748385655379074
Epoch 29 has validation loss: 0.43906076192375154
Best loss at epoch 28


100%|██████████| 296/296 [00:02<00:00, 132.09it/s, loss=0.27516907]  
100%|██████████| 93/93 [00:00<00:00, 222.75it/s, loss=0.7991077]  


Epoch 30 has training loss: 0.43960921601722697
Epoch 30 has validation loss: 0.4457908385903925


100%|██████████| 296/296 [00:02<00:00, 126.82it/s, loss=0.3538632]  
100%|██████████| 93/93 [00:00<00:00, 220.34it/s, loss=0.76423377]


Epoch 31 has training loss: 0.45343136885580987
Epoch 31 has validation loss: 0.44080152254431476


100%|██████████| 296/296 [00:02<00:00, 133.21it/s, loss=0.47687942] 
100%|██████████| 93/93 [00:00<00:00, 221.96it/s, loss=0.78042793]


Epoch 32 has training loss: 0.44995628796970927
Epoch 32 has validation loss: 0.4426226785546669


100%|██████████| 296/296 [00:02<00:00, 132.56it/s, loss=1.3997416]  
100%|██████████| 93/93 [00:00<00:00, 221.42it/s, loss=0.7780402]  


Epoch 33 has training loss: 0.45252379074038285
Epoch 33 has validation loss: 0.44239023275753503


100%|██████████| 296/296 [00:02<00:00, 133.03it/s, loss=1.0883476]   
100%|██████████| 93/93 [00:00<00:00, 221.62it/s, loss=0.76919574] 


Epoch 34 has training loss: 0.4526256698095617
Epoch 34 has validation loss: 0.4422171331982138


100%|██████████| 296/296 [00:02<00:00, 131.65it/s, loss=0.79653645] 
100%|██████████| 93/93 [00:00<00:00, 209.93it/s, loss=0.76904035]


Epoch 35 has training loss: 0.4509426414387653
Epoch 35 has validation loss: 0.44227051714895876


100%|██████████| 296/296 [00:02<00:00, 131.67it/s, loss=0.39960745] 
100%|██████████| 93/93 [00:00<00:00, 203.95it/s, loss=0.80123484]


Epoch 36 has training loss: 0.4765065136913059
Epoch 36 has validation loss: 0.448544304457403


100%|██████████| 296/296 [00:02<00:00, 127.94it/s, loss=0.4932912]  
100%|██████████| 93/93 [00:00<00:00, 203.06it/s, loss=1.3556627]  


Epoch 37 has training loss: 0.4943659559074739
Epoch 37 has validation loss: 0.5336158663755463


100%|██████████| 296/296 [00:02<00:00, 130.68it/s, loss=1.3608578]  
100%|██████████| 93/93 [00:00<00:00, 220.90it/s, loss=1.1071072]  


Epoch 38 has training loss: 0.5054497741779463
Epoch 38 has validation loss: 0.4952762553127863


100%|██████████| 296/296 [00:02<00:00, 131.68it/s, loss=0.38599753] 
100%|██████████| 93/93 [00:00<00:00, 214.29it/s, loss=1.2647619]  


Epoch 39 has training loss: 0.4778623586297161
Epoch 39 has validation loss: 0.5123063393778378


100%|██████████| 296/296 [00:02<00:00, 133.66it/s, loss=1.3305094]  
100%|██████████| 93/93 [00:00<00:00, 218.83it/s, loss=0.8282013] 


Epoch 40 has training loss: 0.4945168005604599
Epoch 40 has validation loss: 0.4480588488000375


100%|██████████| 296/296 [00:02<00:00, 132.56it/s, loss=0.14037986]  
100%|██████████| 93/93 [00:00<00:00, 217.78it/s, loss=0.8355882] 


Epoch 41 has training loss: 0.4763111458576876
Epoch 41 has validation loss: 0.47151886603684834


100%|██████████| 296/296 [00:02<00:00, 131.69it/s, loss=0.034225386]
100%|██████████| 93/93 [00:00<00:00, 216.28it/s, loss=0.5189415]  


Epoch 42 has training loss: 0.4926217417106838
Epoch 42 has validation loss: 0.4275099977410288
Best loss at epoch 41


100%|██████████| 296/296 [00:02<00:00, 128.58it/s, loss=0.034794766]
100%|██████████| 93/93 [00:00<00:00, 209.46it/s, loss=0.7282309]  


Epoch 43 has training loss: 0.5088977308100643
Epoch 43 has validation loss: 0.43134796865765124


100%|██████████| 296/296 [00:02<00:00, 133.35it/s, loss=1.7061251]   
100%|██████████| 93/93 [00:00<00:00, 220.90it/s, loss=1.0398104]  


Epoch 44 has training loss: 0.47719341466310305
Epoch 44 has validation loss: 0.4728800591722291


100%|██████████| 296/296 [00:02<00:00, 132.08it/s, loss=0.14164707]  
100%|██████████| 93/93 [00:00<00:00, 218.82it/s, loss=0.8238709]  


Epoch 45 has training loss: 0.4749716395367491
Epoch 45 has validation loss: 0.43051047233604295


100%|██████████| 296/296 [00:02<00:00, 133.58it/s, loss=0.32158622]  
100%|██████████| 93/93 [00:00<00:00, 221.94it/s, loss=0.47758782] 


Epoch 46 has training loss: 0.45923284736203934
Epoch 46 has validation loss: 0.4227328207744386
Best loss at epoch 45


100%|██████████| 296/296 [00:02<00:00, 130.15it/s, loss=1.1285322]  
100%|██████████| 93/93 [00:00<00:00, 200.86it/s, loss=0.7645695]  


Epoch 47 has training loss: 0.498101189570514
Epoch 47 has validation loss: 0.4321225483911813


100%|██████████| 296/296 [00:02<00:00, 132.51it/s, loss=0.086757526] 
100%|██████████| 93/93 [00:00<00:00, 220.90it/s, loss=0.78648627] 


Epoch 48 has training loss: 0.4653608509515588
Epoch 48 has validation loss: 0.4270634790682184


100%|██████████| 296/296 [00:02<00:00, 133.53it/s, loss=0.13009055]  
100%|██████████| 93/93 [00:00<00:00, 219.34it/s, loss=0.90808547] 


Epoch 49 has training loss: 0.4493726382969652
Epoch 49 has validation loss: 0.4449135168826067


100%|██████████| 296/296 [00:02<00:00, 132.34it/s, loss=0.19786488]  
100%|██████████| 93/93 [00:00<00:00, 218.82it/s, loss=0.6726161]  

Epoch 50 has training loss: 0.46688850161090895
Epoch 50 has validation loss: 0.42194686832046635
Best loss at epoch 49





In [10]:
# Reload the best checkpoint written by the training loop.
#  * map_location lets a checkpoint saved on another device be loaded onto DEVICE;
#  * weights_only=False is needed because the file holds a fully pickled module, which
#    recent PyTorch releases refuse to unpickle by default.
# SECURITY: unpickling executes code from the file - only load checkpoints this notebook produced.
comment_classification_Model = torch.load(
    f'best_models/{huggingface_model_name_alias}.pkl',
    map_location=DEVICE,
    weights_only=False,
)
comment_classification_Model.to(DEVICE)
comment_classification_Model.eval()
predictions = []
pbar_validation = tqdm(val_torch_dataloader)
# Inference only: skip building the autograd graph for every batch.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# Concatenate the per-batch logits into one (rows, classes) NumPy array.
predictions = torch.cat(predictions).cpu().numpy()

  0%|          | 0/93 [00:00<?, ?it/s]

100%|██████████| 93/93 [00:00<00:00, 319.59it/s]


In [11]:
# Logits -> class probabilities -> most probable class id -> original string label.
logits_tensor = torch.from_numpy(predictions)
predictions_softmax = torch.softmax(logits_tensor, dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the validation split.
validation_report = classification_report(val_educ_dataset["label"], predicted_normal_labels)
print(validation_report)

              precision    recall  f1-score   support

  Irrelevant       0.76      0.68      0.71        74
    Relevant       0.80      0.86      0.83       112

    accuracy                           0.78       186
   macro avg       0.78      0.77      0.77       186
weighted avg       0.78      0.78      0.78       186



In [12]:
# Make sure dropout is disabled even if this cell is run without the validation cell.
comment_classification_Model.eval()
predictions = []
pbar_test = tqdm(test_torch_dataloader)
# Inference only: skip building the autograd graph for every batch.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_test:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# Concatenate the per-batch logits into one (rows, classes) NumPy array.
predictions = torch.cat(predictions).cpu().numpy()

100%|██████████| 80/80 [00:00<00:00, 355.56it/s]


In [13]:
# Logits -> class probabilities -> most probable class id -> original string label.
logits_tensor = torch.from_numpy(predictions)
predictions_softmax = torch.softmax(logits_tensor, dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the held-out test split.
test_report = classification_report(test_educ_dataset["label"], predicted_normal_labels)
print(test_report)

              precision    recall  f1-score   support

  Irrelevant       0.84      0.73      0.78        74
    Relevant       0.79      0.88      0.84        86

    accuracy                           0.81       160
   macro avg       0.82      0.81      0.81       160
weighted avg       0.82      0.81      0.81       160



In [14]:
# Make sure dropout is disabled even if this cell is run without the validation cell.
comment_classification_Model.eval()
predictions = []
pbar_div = tqdm(div_dataloader_torch)
# Inference only: skip building the autograd graph for every batch.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_div:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# Concatenate the per-batch logits into one (rows, classes) NumPy array.
predictions = torch.cat(predictions).cpu().numpy()

100%|██████████| 252/252 [00:00<00:00, 345.19it/s]


In [15]:
# Logits -> class probabilities (over the last axis) -> most probable class id -> string label.
logits_tensor = torch.from_numpy(predictions)
predictions_softmax = torch.softmax(logits_tensor, dim=-1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the second (div) corpus, which the model never trained on.
div_report = classification_report(dataset_div["label"], predicted_normal_labels)
print(div_report)

              precision    recall  f1-score   support

  Irrelevant       0.95      0.61      0.74       393
    Relevant       0.39      0.89      0.54       111

    accuracy                           0.67       504
   macro avg       0.67      0.75      0.64       504
weighted avg       0.83      0.67      0.70       504

