In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pl
import re
from sklearn.metrics import classification_report
import bs4
from collections import defaultdict

# Directory prefix used when building the paths of the dataset JSON files.
WORKING_DIR='.'
# Device identifier handed to `.to(...)` when tensors and the model are moved;
# it is assigned again (same value) in the seeding/splitting cell further down.
DEVICE = 'cuda'

In [None]:
# Load both annotated comment corpora; `orient='index'` makes the top-level
# JSON keys the row labels of the resulting frames.
dataset_educ, dataset_div = (
    pd.read_json(json_path, orient='index')
    for json_path in (
        f'{WORKING_DIR}/Dataset_educ_1.1.json',
        f'{WORKING_DIR}/Dataset_div2_final.json',
    )
)

In [None]:
# Anchor tag whose href points at a contest submission. Raw string: the old
# literal relied on the invalid escape `\/`, which newer Pythons warn about.
# NOTE(review): the `.*` parts are greedy, so several links on one line are
# swallowed as a single match - kept as-is to preserve existing behaviour.
regex_link_ful = re.compile(r'<a href.*/contest/.*/submission/.*</a>')

# Any <code>...</code> span, including multi-line ones. `re.DOTALL` replaces
# the former `(\s|.)*?`, whose overlapping alternatives backtracked
# exponentially on an unterminated <code> followed by whitespace.
code_regex = re.compile(r'<code>.*?</code>', re.DOTALL)


def preprocess_for_transfomers(texts, problems):
  """Turn raw HTML comments into plain text suitable for a transformer tokenizer.

  For every (text, problem) pair:
    1. each ``<code>...</code>`` span is replaced by the placeholder `` (code) ``;
    2. each submission link is replaced by `` (link to problem <problem>) ``;
    3. the remaining markup is stripped via BeautifulSoup's ``.text``.

  Args:
    texts: iterable of HTML strings.
    problems: iterable of problem identifiers, aligned with ``texts``.

  Returns:
    list of cleaned strings, one per input pair (``zip`` semantics: the
    shorter iterable determines the length).
  """
  preprocessed_texts = []
  for text, problem in zip(texts, problems):
    link_placeholder = f' (link to problem {problem}) '
    without_code = code_regex.sub(' (code) ', text)
    # A callable replacement is inserted verbatim, so a backslash inside the
    # problem identifier can never be misread as a regex template escape.
    without_links = regex_link_ful.sub(lambda _match: link_placeholder, without_code)
    # NOTE(review): no parser is named, so bs4 picks the best one installed
    # (and warns about it); pin one explicitly if outputs must be reproducible
    # across environments.
    soup = bs4.BeautifulSoup(without_links)
    preprocessed_texts.append(soup.text)

  return preprocessed_texts

# Apply the same two steps to both corpora, in the original order (educ, then div).
for comments_df in (dataset_educ, dataset_div):
  # Binary task: every label other than 'Irrelevant' is collapsed into 'Relevant'.
  comments_df.loc[comments_df['label'] != 'Irrelevant', 'label'] = 'Relevant'
  # Store the HTML-free version of each comment next to the raw text.
  preprocessed_text = preprocess_for_transfomers(comments_df['text'], comments_df['problem'])
  comments_df["preprocessed_text"] = preprocessed_text

In [None]:
# Default number of ancestors collected for every comment.
TREE_FATHER_PATH_LENGTH = 3

def compute_father_indices_pd(df, path_length=None):
  """Map every row label of ``df`` to the row labels of its closest ancestors.

  The comment tree is described by the ``id`` and ``father_id`` columns. For
  each comment the chain of fathers is followed for ``path_length`` steps; a
  step whose father is not a known comment id yields the sentinel ``-1``.

  Args:
    df: DataFrame with ``id`` and ``father_id`` columns.
    path_length: number of ancestors to collect; defaults to the module-level
      ``TREE_FATHER_PATH_LENGTH`` (read at call time).

  Returns:
    dict ``{row_label: [..., grandfather, father]}``. Each list has exactly
    ``path_length`` entries, ordered from the farthest ancestor to the direct
    father, with ``-1`` padding in front when the chain is shorter.
  """
  if path_length is None:
    path_length = TREE_FATHER_PATH_LENGTH

  # One pass builds both lookups. The former per-step `df[df.id == x]` scan
  # made the whole function quadratic in the number of comments.
  father_tree = {}   # comment id -> father id (last occurrence wins, as before)
  id_to_index = {}   # comment id -> row label (first occurrence wins, as before)
  for df_index, com_id, father_id in zip(df.index, df['id'], df['father_id']):
    father_tree[com_id] = father_id
    id_to_index.setdefault(com_id, df_index)

  comment_father_indices = {}
  for df_index, com_id in zip(df.index, df['id']):
    indices = []
    last_ind = com_id
    for _ in range(path_length):
      father = father_tree[last_ind]
      if father not in father_tree:
        # Chain ended (root comment or father missing from df): pad with -1.
        # `last_ind` is left unchanged, so later steps pad as well.
        indices.append(-1)
        continue
      # -1 doubles as the "no comment" sentinel and is never walked through.
      if last_ind != -1:
        last_ind = father
      indices.append(id_to_index[last_ind] if last_ind != -1 else -1)
    # Collected nearest-first; callers expect farthest-first.
    indices.reverse()
    comment_father_indices[df_index] = indices
  return comment_father_indices

# Ancestor lookup tables for both corpora, keyed by DataFrame row label.
father_indices_educ, father_indices_div = map(
    compute_father_indices_pd,
    (dataset_educ, dataset_div),
)

In [None]:
from embeddings_generation import TokenizedDataset, LayerEMBTokenEmbeddingGeneration
from embeddings_generation.utils import *

huggingface_model_name = "finetuned-128bert-base"


def _load_or_persist_embeddings(texts, alias, index):
    """Return the embeddings stored under ``alias``, persisting them first if absent.

    ``embedding_already_persisted`` / ``persist_embeddings`` / ``load_embeddings``
    come from the project's ``embeddings_generation.utils`` star import.
    """
    if not embedding_already_persisted(alias):
        persist_embeddings(texts, huggingface_model_name, alias, index)
    return load_embeddings(alias)


# Cache aliases: last path component of the model name plus a corpus suffix.
_model_basename = huggingface_model_name.split("/")[-1]
huggingface_model_name_alias = _model_basename + "_educ"
huggingface_model_name_alias_div = _model_basename + '_div'

embeddings_educ = _load_or_persist_embeddings(
    dataset_educ["preprocessed_text"], huggingface_model_name_alias, dataset_educ.index
)
embeddings_div = _load_or_persist_embeddings(
    dataset_div["preprocessed_text"], huggingface_model_name_alias_div, dataset_div.index
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Per-comment tokenisation statistics for the educ corpus.
statistics = determine_tokens_statistics(dataset_educ["preprocessed_text"], huggingface_model_name)

# Ratio between the number of words split at least twice and the number of whole words.
nr_split_words = statistics["nrWordsSplitAtleastTwice"]
nr_whole_words = statistics["nrWholeWords"]
statistics["ratioNotWholeWords"] = nr_split_words / nr_whole_words

# Last expression of the cell: rendered as the summary table.
statistics.describe()

Unnamed: 0,nrTokens,nrWholeWords,maxWordSplit,nrWordsSplitAtleastTwice,ratioNotWholeWords
count,937.0,937.0,937.0,937.0,937.0
mean,74.66809,70.043757,1.469584,3.155816,0.05639
std,129.464346,122.61402,1.337087,6.083399,0.071372
min,3.0,3.0,0.0,0.0,0.0
25%,17.0,16.0,0.0,0.0,0.0
50%,32.0,30.0,1.0,1.0,0.041667
75%,74.0,68.0,2.0,4.0,0.075758
max,1424.0,1373.0,11.0,93.0,0.571429


In [None]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
RANDOM_SEED = 443
# Fall back to the CPU when no GPU is visible instead of failing on the first `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Number of groups assigned to each split; whatever remains is the test set.
NR_TRAIN_GROUPS = 10
NR_VALIDATION_GROUPS = 3

# Seed every RNG involved so the shuffle below and model initialisation are repeatable.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Group rows by the part of the row label before the first "?" and split by
# whole groups, so rows sharing that prefix never straddle two splits.
groups = list(dataset_educ.groupby(lambda k : k.split("?")[0]))
random.shuffle(groups)

train_groups = groups[:NR_TRAIN_GROUPS]
validation_groups = groups[NR_TRAIN_GROUPS:NR_TRAIN_GROUPS + NR_VALIDATION_GROUPS]
test_groups = groups[NR_TRAIN_GROUPS + NR_VALIDATION_GROUPS:]


def _rows_of(groups_subset):
    """Return the rows of ``dataset_educ`` belonging to the given (key, frame) pairs, in group order."""
    return dataset_educ.loc[[idx for _, g in groups_subset for idx in g.index.tolist()]]


train_educ_dataset = _rows_of(train_groups)
val_educ_dataset = _rows_of(validation_groups)
test_educ_dataset = _rows_of(test_groups)

from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
# Fit the label vocabulary exactly once and only `transform` afterwards.
# Re-running `fit_transform` per split re-derives the mapping from whatever
# labels that split happens to contain, so a split missing one class would
# silently get different integer codes, and `classes_` would reflect only the
# last fit.
labelEncoder.fit(pd.concat([dataset_educ["label"], dataset_div["label"]]))

train_encoded_labels = labelEncoder.transform(train_educ_dataset["label"])
val_encoded_labels = labelEncoder.transform(val_educ_dataset["label"])
test_encoded_labels = labelEncoder.transform(test_educ_dataset["label"])

div_encoded_labels = labelEncoder.transform(dataset_div["label"])

In [None]:
class BlogCommentDataset(Dataset):
    """Torch dataset yielding a comment embedding together with its ancestors' embeddings.

    Each item is the tuple ``(embedding, embedding_fathers, mask, label)``,
    every element already moved to the global ``DEVICE``:

    * ``embedding`` - aggregated embedding of the comment itself;
    * ``embedding_fathers`` - stacked aggregated embeddings of the entries in
      ``father_indices[row_label]`` (zeros where the entry is ``-1``);
    * ``mask`` - float32 vector, ``1`` where the ancestor is missing, else ``0``;
    * ``label`` - ``labels[index]`` as a long tensor.
    """

    def __init__(self, dataset:pd.DataFrame, embeddings:dict, father_indices:dict, num_last_layers_embeddings_agg ,labels):
        # Number of trailing entries along axis 1 that `_agg_emb` averages.
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg
        self.father_indices = father_indices
        self.embeddings = embeddings
        self.dataset = dataset
        self.labels = labels

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, index):
        # Positional index -> DataFrame row label, which keys both lookup dicts.
        row_key = self.dataset.index[index]
        own_embedding = self._agg_emb(self.embeddings[row_key])

        ancestor_vectors, missing_flags = [], []
        for ancestor_key in self.father_indices[row_key]:
            is_missing = ancestor_key == -1
            if is_missing:
                # Placeholder with the same width as the comment embedding.
                vector = torch.zeros(own_embedding.shape[0])
            else:
                vector = self._agg_emb(self.embeddings[ancestor_key])
            ancestor_vectors.append(vector)
            missing_flags.append(1 if is_missing else 0)

        return (
            own_embedding.to(DEVICE),
            torch.stack(ancestor_vectors).to(DEVICE),
            torch.tensor(missing_flags, dtype = torch.float32).to(DEVICE),
            torch.tensor(self.labels[index], dtype = torch.long).to(DEVICE),
        )

    def _agg_emb(self, embedding):
        """Average the last ``num_last_layers_embeddings_agg`` slices of ``embedding[0]``.

        ``embedding`` must be indexable as ``[0, i, j]``.
        Presumably axis 1 enumerates model layers - confirm against the embedding generator.
        """
        as_array = np.asarray(embedding, dtype=np.float32)
        trailing = as_array[0, -self.num_last_layers_embeddings_agg:, :]
        return torch.from_numpy(np.mean(trailing, axis=0))

# Every split averages the last 3 embedding slices and is served in batches of 2;
# only the training loader reshuffles its samples.
train_torch_dataset = BlogCommentDataset(
    dataset=train_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=train_encoded_labels,
)
train_torch_dataloader = DataLoader(train_torch_dataset, batch_size=2, shuffle=True)

val_torch_dataset = BlogCommentDataset(
    dataset=val_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=val_encoded_labels,
)
val_torch_dataloader = DataLoader(val_torch_dataset, batch_size=2, shuffle=False)

test_torch_dataset = BlogCommentDataset(
    dataset=test_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=test_encoded_labels,
)
test_torch_dataloader = DataLoader(test_torch_dataset, batch_size=2, shuffle=False)

# The div corpus is used as a whole (no split) with its own embeddings and ancestors.
div_dataset_torch = BlogCommentDataset(
    dataset=dataset_div,
    embeddings=embeddings_div,
    father_indices=father_indices_div,
    num_last_layers_embeddings_agg=3,
    labels=div_encoded_labels,
)
div_dataloader_torch = DataLoader(div_dataset_torch, batch_size=2, shuffle=False)

In [None]:
from tqdm import tqdm
import math
class CommentClassificationModel(torch.nn.Module):
    """Single-hidden-layer classifier over one 768-dimensional comment embedding.

    ``forward`` accepts the ancestors' embeddings and their mask so that it can
    be fed the full dataset tuple, but this variant does not use them.
    """

    def __init__(self, nrLabels):
        super().__init__()

        # 768 -> 128 projection, ReLU, dropout (p=0.7), then 128 -> nrLabels logits.
        self.comment_proj = torch.nn.Linear(in_features=768, out_features=128)
        self.relu = torch.nn.ReLU()
        self.output = torch.nn.Linear(in_features=128, out_features=nrLabels)
        self.dropout = torch.nn.Dropout(p=0.7)

    def forward(self, x, fathers_x, mask):
        """Return unnormalised class scores for ``x``; ``fathers_x`` and ``mask`` are ignored."""
        hidden = self.comment_proj(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.output(hidden)


# One output unit per class known to the label encoder.
comment_classification_Model = CommentClassificationModel(nrLabels=len(labelEncoder.classes_))
comment_classification_Model = comment_classification_Model.to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(comment_classification_Model.parameters())
# First restart after 5 passes over the training loader (the period is counted
# in scheduler steps); each subsequent period is twice as long.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=len(train_torch_dataloader) * 5,
    T_mult=2,
)

nr_epochs = 50
current_step = 0
# Sentinel larger than any expected validation loss, so the first epoch always counts as best.
best_model_loss = 1e9

import os

# Checkpoint of the best model (lowest mean validation loss) seen so far.
# The directory is created up front: torch.save does not create missing
# parent directories and would otherwise fail at the end of the first epoch.
best_model_path = f'best_models/{huggingface_model_name_alias}.pkl'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(nr_epochs):
    # ---- training pass ----
    pbar_training = tqdm(train_torch_dataloader)
    training_average_loss = 0
    training_nr_batches = 0
    comment_classification_Model.train()

    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_training:
        optimizer.zero_grad()
        yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
        loss = criterion(yhat, labels_batch)
        loss.backward()
        optimizer.step()

        # `.item()` yields a plain Python float (no graph, no device tensor).
        batch_loss = loss.item()
        pbar_training.set_postfix({'loss': batch_loss})

        training_average_loss += batch_loss
        training_nr_batches += 1
        current_step += 1
        # The learning-rate schedule is advanced once per batch, not once per epoch.
        scheduler.step()

    # ---- validation pass (no gradients, dropout disabled) ----
    pbar_validation = tqdm(val_torch_dataloader)
    validation_average_loss = 0
    validation_nr_batches = 0
    comment_classification_Model.eval()

    with torch.no_grad():
        for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
            yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
            batch_loss = criterion(yhat, labels_batch).item()

            validation_average_loss += batch_loss
            validation_nr_batches += 1

            pbar_validation.set_postfix({'loss': batch_loss})

    epoch_training_loss = training_average_loss / training_nr_batches
    epoch_validation_loss = validation_average_loss / validation_nr_batches

    print(f'Epoch {epoch + 1} has training loss: {epoch_training_loss}')
    print(f'Epoch {epoch + 1} has validation loss: {epoch_validation_loss}')

    if epoch_validation_loss < best_model_loss:
        best_model_loss = epoch_validation_loss
        # Report the 1-based epoch number, consistent with the two messages above.
        print(f'Best loss at epoch {epoch + 1}')
        # The whole module is pickled (not just its state_dict) because the
        # evaluation cell reloads it with a bare torch.load.
        torch.save(comment_classification_Model, best_model_path)


  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 296/296 [00:03<00:00, 74.97it/s, loss=0.16136043] 
100%|██████████| 93/93 [00:00<00:00, 200.42it/s, loss=0.6379756] 


Epoch 1 has training loss: 0.7129244081474639
Epoch 1 has validation loss: 0.616388282147787
Best loss at epoch 0


100%|██████████| 296/296 [00:02<00:00, 138.38it/s, loss=0.48703867] 
100%|██████████| 93/93 [00:00<00:00, 203.50it/s, loss=0.77701664]


Epoch 2 has training loss: 0.6573054569244787
Epoch 2 has validation loss: 0.6109093831431481
Best loss at epoch 1


100%|██████████| 296/296 [00:02<00:00, 120.47it/s, loss=0.701263]   
100%|██████████| 93/93 [00:00<00:00, 200.86it/s, loss=0.59386444]


Epoch 3 has training loss: 0.602256837351298
Epoch 3 has validation loss: 0.5340876534420956
Best loss at epoch 2


100%|██████████| 296/296 [00:02<00:00, 113.37it/s, loss=0.84810317]
100%|██████████| 93/93 [00:00<00:00, 202.17it/s, loss=0.634805]  


Epoch 4 has training loss: 0.5918350840943891
Epoch 4 has validation loss: 0.5242083274549053
Best loss at epoch 3


100%|██████████| 296/296 [00:02<00:00, 121.46it/s, loss=0.8874818]  
100%|██████████| 93/93 [00:00<00:00, 202.17it/s, loss=0.63926435]


Epoch 5 has training loss: 0.5685229100337302
Epoch 5 has validation loss: 0.5200555369418155
Best loss at epoch 4


100%|██████████| 296/296 [00:02<00:00, 124.11it/s, loss=0.8431307]  
100%|██████████| 93/93 [00:00<00:00, 197.03it/s, loss=0.538495]  


Epoch 6 has training loss: 0.6040347217192018
Epoch 6 has validation loss: 0.5176892847784104
Best loss at epoch 5


100%|██████████| 296/296 [00:02<00:00, 122.36it/s, loss=0.13906695]  
100%|██████████| 93/93 [00:00<00:00, 198.71it/s, loss=0.74723834] 


Epoch 7 has training loss: 0.5707789430264195
Epoch 7 has validation loss: 0.4509643023853661
Best loss at epoch 6


100%|██████████| 296/296 [00:02<00:00, 122.16it/s, loss=0.62805194] 
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=0.6836497] 


Epoch 8 has training loss: 0.5646916485479655
Epoch 8 has validation loss: 0.5141978893549212


100%|██████████| 296/296 [00:02<00:00, 122.82it/s, loss=0.40034238] 
100%|██████████| 93/93 [00:00<00:00, 206.67it/s, loss=0.5863392] 


Epoch 9 has training loss: 0.5670455340673594
Epoch 9 has validation loss: 0.49649675182437386


100%|██████████| 296/296 [00:02<00:00, 124.32it/s, loss=0.31354624] 
100%|██████████| 93/93 [00:00<00:00, 205.30it/s, loss=0.6199184] 


Epoch 10 has training loss: 0.5435639200197231
Epoch 10 has validation loss: 0.4774146297125406


100%|██████████| 296/296 [00:02<00:00, 123.75it/s, loss=0.007888355] 
100%|██████████| 93/93 [00:00<00:00, 197.45it/s, loss=0.69775236]


Epoch 11 has training loss: 0.5342384137534159
Epoch 11 has validation loss: 0.44088948638208453
Best loss at epoch 10


100%|██████████| 296/296 [00:02<00:00, 122.06it/s, loss=0.029951625]  
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=0.83406675]


Epoch 12 has training loss: 0.5035232466799318
Epoch 12 has validation loss: 0.4484683096769356


100%|██████████| 296/296 [00:02<00:00, 124.42it/s, loss=0.9245161]  
100%|██████████| 93/93 [00:00<00:00, 190.57it/s, loss=0.6351869]  


Epoch 13 has training loss: 0.5143658429151401
Epoch 13 has validation loss: 0.43645868781873937
Best loss at epoch 12


100%|██████████| 296/296 [00:02<00:00, 123.95it/s, loss=0.6576016]   
100%|██████████| 93/93 [00:00<00:00, 188.64it/s, loss=0.6537262]  


Epoch 14 has training loss: 0.4878033442923019
Epoch 14 has validation loss: 0.43452975852915676
Best loss at epoch 13


100%|██████████| 296/296 [00:02<00:00, 122.62it/s, loss=0.8009503]  
100%|██████████| 93/93 [00:00<00:00, 204.40it/s, loss=0.6713809] 


Epoch 15 has training loss: 0.4983060519286507
Epoch 15 has validation loss: 0.43591125634929506


100%|██████████| 296/296 [00:02<00:00, 122.26it/s, loss=0.79454494]  
100%|██████████| 93/93 [00:00<00:00, 202.17it/s, loss=0.67480826]


Epoch 16 has training loss: 0.5635260124108754
Epoch 16 has validation loss: 0.47103411007312035


100%|██████████| 296/296 [00:02<00:00, 124.32it/s, loss=2.0864234]   
100%|██████████| 93/93 [00:00<00:00, 192.54it/s, loss=0.4885684]  


Epoch 17 has training loss: 0.5377939944942104
Epoch 17 has validation loss: 0.4387263123907389


100%|██████████| 296/296 [00:02<00:00, 124.37it/s, loss=0.5753339]   
100%|██████████| 93/93 [00:00<00:00, 199.57it/s, loss=0.6411978] 


Epoch 18 has training loss: 0.5251717880526338
Epoch 18 has validation loss: 0.4591797011834319


100%|██████████| 296/296 [00:02<00:00, 119.35it/s, loss=0.7282157]   
100%|██████████| 93/93 [00:00<00:00, 202.62it/s, loss=0.84062123] 


Epoch 19 has training loss: 0.5107808005521637
Epoch 19 has validation loss: 0.4903022026022275


100%|██████████| 296/296 [00:02<00:00, 118.26it/s, loss=4.502722]     
100%|██████████| 93/93 [00:00<00:00, 194.56it/s, loss=0.6871644]  


Epoch 20 has training loss: 0.5341974216264773
Epoch 20 has validation loss: 0.47373382651036783


100%|██████████| 296/296 [00:02<00:00, 121.51it/s, loss=0.3882738]   
100%|██████████| 93/93 [00:00<00:00, 179.88it/s, loss=0.627047]   


Epoch 21 has training loss: 0.5033630016102532
Epoch 21 has validation loss: 0.4212962298313536
Best loss at epoch 20


100%|██████████| 296/296 [00:02<00:00, 122.65it/s, loss=0.49324962]   
100%|██████████| 93/93 [00:00<00:00, 200.00it/s, loss=0.6197128]  


Epoch 22 has training loss: 0.5087090613149645
Epoch 22 has validation loss: 0.4470537984042719


100%|██████████| 296/296 [00:02<00:00, 122.52it/s, loss=0.63047916]  
100%|██████████| 93/93 [00:00<00:00, 200.43it/s, loss=0.59379774]


Epoch 23 has training loss: 0.5385091681598148
Epoch 23 has validation loss: 0.4540363877450907


100%|██████████| 296/296 [00:02<00:00, 121.91it/s, loss=0.2437502]    
100%|██████████| 93/93 [00:00<00:00, 200.43it/s, loss=0.43057477] 


Epoch 24 has training loss: 0.4968085501152165
Epoch 24 has validation loss: 0.423267616039162


100%|██████████| 296/296 [00:02<00:00, 123.54it/s, loss=0.51071566]   
100%|██████████| 93/93 [00:00<00:00, 190.19it/s, loss=0.6190388]  


Epoch 25 has training loss: 0.4981799095402666
Epoch 25 has validation loss: 0.4728039571795092


100%|██████████| 296/296 [00:02<00:00, 122.31it/s, loss=1.0728779e-05]
100%|██████████| 93/93 [00:00<00:00, 199.15it/s, loss=0.5496459]  


Epoch 26 has training loss: 0.49870737136410065
Epoch 26 has validation loss: 0.42794349185761904


100%|██████████| 296/296 [00:02<00:00, 122.67it/s, loss=0.08170515]  
100%|██████████| 93/93 [00:00<00:00, 202.17it/s, loss=0.45068032] 


Epoch 27 has training loss: 0.44235562237323345
Epoch 27 has validation loss: 0.41693240055884484
Best loss at epoch 26


100%|██████████| 296/296 [00:02<00:00, 122.62it/s, loss=0.60849774]  
100%|██████████| 93/93 [00:00<00:00, 196.62it/s, loss=0.5016738]  


Epoch 28 has training loss: 0.48349729365164984
Epoch 28 has validation loss: 0.42824238292392225


100%|██████████| 296/296 [00:02<00:00, 124.42it/s, loss=1.2673901]    
100%|██████████| 93/93 [00:00<00:00, 188.26it/s, loss=0.49754113] 


Epoch 29 has training loss: 0.4832983433548579
Epoch 29 has validation loss: 0.41805956718481835


100%|██████████| 296/296 [00:02<00:00, 122.97it/s, loss=0.11206427]  
100%|██████████| 93/93 [00:00<00:00, 197.45it/s, loss=0.54388595] 


Epoch 30 has training loss: 0.4623532970624699
Epoch 30 has validation loss: 0.4342615607304759


100%|██████████| 296/296 [00:02<00:00, 120.87it/s, loss=0.4637677]   
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=0.45993406] 


Epoch 31 has training loss: 0.46078895602432884
Epoch 31 has validation loss: 0.41685383551083105
Best loss at epoch 30


100%|██████████| 296/296 [00:02<00:00, 121.91it/s, loss=0.0145609835] 
100%|██████████| 93/93 [00:00<00:00, 200.86it/s, loss=0.45294595] 


Epoch 32 has training loss: 0.4276265939743344
Epoch 32 has validation loss: 0.4115544624316196
Best loss at epoch 31


100%|██████████| 296/296 [00:02<00:00, 124.11it/s, loss=0.8227696]   
100%|██████████| 93/93 [00:00<00:00, 188.64it/s, loss=0.48424733] 


Epoch 33 has training loss: 0.4393736409328011
Epoch 33 has validation loss: 0.4114850720225443
Best loss at epoch 32


100%|██████████| 296/296 [00:02<00:00, 123.38it/s, loss=0.49951786]   
100%|██████████| 93/93 [00:00<00:00, 193.75it/s, loss=0.4810291]  


Epoch 34 has training loss: 0.44392485877442256
Epoch 34 has validation loss: 0.4113704930009541
Best loss at epoch 33


100%|██████████| 296/296 [00:02<00:00, 117.79it/s, loss=0.12463202]   
100%|██████████| 93/93 [00:00<00:00, 203.50it/s, loss=0.47965524] 


Epoch 35 has training loss: 0.4458312856119134
Epoch 35 has validation loss: 0.4113894531313049


100%|██████████| 296/296 [00:02<00:00, 121.36it/s, loss=0.5004114]    
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=0.62824875] 


Epoch 36 has training loss: 0.5330674478513223
Epoch 36 has validation loss: 0.4612580488526052


100%|██████████| 296/296 [00:02<00:00, 124.21it/s, loss=0.02859723]   
100%|██████████| 93/93 [00:00<00:00, 193.35it/s, loss=0.7884525]   


Epoch 37 has training loss: 0.47775722686057287
Epoch 37 has validation loss: 0.46314946294171355


100%|██████████| 296/296 [00:02<00:00, 124.79it/s, loss=0.7630646]   
100%|██████████| 93/93 [00:00<00:00, 202.61it/s, loss=0.45623365] 


Epoch 38 has training loss: 0.4819221347579376
Epoch 38 has validation loss: 0.40888891404404515
Best loss at epoch 37


100%|██████████| 296/296 [00:02<00:00, 122.01it/s, loss=0.5134822]   
100%|██████████| 93/93 [00:00<00:00, 197.04it/s, loss=0.42129767] 


Epoch 39 has training loss: 0.518354371450629
Epoch 39 has validation loss: 0.43851412038859583


100%|██████████| 296/296 [00:02<00:00, 120.92it/s, loss=0.6966687]    
100%|██████████| 93/93 [00:00<00:00, 198.30it/s, loss=0.60919464]


Epoch 40 has training loss: 0.4709827327412476
Epoch 40 has validation loss: 0.4583144347203435


100%|██████████| 296/296 [00:02<00:00, 121.41it/s, loss=0.3085317]    
100%|██████████| 93/93 [00:00<00:00, 202.61it/s, loss=0.5325262]  


Epoch 41 has training loss: 0.5118408859572896
Epoch 41 has validation loss: 0.4207180068147198


100%|██████████| 296/296 [00:02<00:00, 125.00it/s, loss=0.83194745]   
100%|██████████| 93/93 [00:00<00:00, 196.63it/s, loss=0.6346029]  


Epoch 42 has training loss: 0.5473365575568891
Epoch 42 has validation loss: 0.47956803953775795


100%|██████████| 296/296 [00:02<00:00, 119.60it/s, loss=0.1996479]    
100%|██████████| 93/93 [00:00<00:00, 200.43it/s, loss=0.6600434]  


Epoch 43 has training loss: 0.5179986253987028
Epoch 43 has validation loss: 0.45353065660443675


100%|██████████| 296/296 [00:02<00:00, 117.79it/s, loss=0.004525064]  
100%|██████████| 93/93 [00:00<00:00, 205.30it/s, loss=0.7615627]  


Epoch 44 has training loss: 0.5090126870797355
Epoch 44 has validation loss: 0.48803672434822204


100%|██████████| 296/296 [00:02<00:00, 121.56it/s, loss=0.503354]     
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=0.6099763]  


Epoch 45 has training loss: 0.5054213696233066
Epoch 45 has validation loss: 0.4809672383050765


100%|██████████| 296/296 [00:02<00:00, 123.44it/s, loss=0.049730636] 
100%|██████████| 93/93 [00:00<00:00, 197.45it/s, loss=0.6533344]  


Epoch 46 has training loss: 0.5041291241204079
Epoch 46 has validation loss: 0.506791624771331


100%|██████████| 296/296 [00:02<00:00, 122.21it/s, loss=0.0011261319] 
100%|██████████| 93/93 [00:00<00:00, 199.13it/s, loss=0.59380245]  


Epoch 47 has training loss: 0.47922457850969247
Epoch 47 has validation loss: 0.43850011553525203


100%|██████████| 296/296 [00:02<00:00, 121.21it/s, loss=0.11231053]   
100%|██████████| 93/93 [00:00<00:00, 197.03it/s, loss=0.5847374] 


Epoch 48 has training loss: 0.5230285808831423
Epoch 48 has validation loss: 0.46063615554463


100%|██████████| 296/296 [00:02<00:00, 122.26it/s, loss=0.44470772]   
100%|██████████| 93/93 [00:00<00:00, 198.29it/s, loss=0.58110005] 


Epoch 49 has training loss: 0.512807426970441
Epoch 49 has validation loss: 0.4430687071465116


100%|██████████| 296/296 [00:02<00:00, 123.33it/s, loss=0.08091102]   
100%|██████████| 93/93 [00:00<00:00, 195.38it/s, loss=0.48066282] 

Epoch 50 has training loss: 0.5151622776345963
Epoch 50 has validation loss: 0.4429555295250549





In [None]:
# Reload the best checkpoint written by the training loop.
# NOTE(review): the file is a fully pickled module; unpickling executes code,
# so only load checkpoints produced by this notebook. Recent PyTorch releases
# default to `weights_only=True` and then need `weights_only=False` here.
# `map_location` lets a checkpoint saved on a GPU be opened on the current DEVICE.
comment_classification_Model = torch.load(
    f'best_models/{huggingface_model_name_alias}.pkl',
    map_location=DEVICE,
)
comment_classification_Model.to(DEVICE)
comment_classification_Model.eval()

predictions = []
pbar_validation = tqdm(val_torch_dataloader)
# Inference only: skip autograd bookkeeping instead of building a graph per
# batch and detaching afterwards.
with torch.no_grad():
    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
        batch_logits = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
        predictions.append(batch_logits.cpu().numpy())

# One row of logits per validation sample, in loader order.
predictions = np.concatenate(predictions, axis=0)

  0%|          | 0/93 [00:00<?, ?it/s]

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 93/93 [00:00<00:00, 319.59it/s]


In [None]:
from sklearn.metrics import classification_report

# Logits -> per-class probabilities (row-wise softmax).
predictions_softmax = torch.softmax(torch.from_numpy(predictions), dim=1).numpy()

# Most probable class per row, mapped back to the original string labels.
predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.reshape(-1))

print(classification_report(val_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.88      0.62      0.73        74
    Relevant       0.79      0.95      0.86       112

    accuracy                           0.82       186
   macro avg       0.84      0.78      0.80       186
weighted avg       0.83      0.82      0.81       186



In [None]:
# Make the cell independent of the mode left behind by earlier cells.
comment_classification_Model.eval()

predictions = []
pbar_test = tqdm(test_torch_dataloader)
# Inference only: no autograd graph is needed for the forward passes.
with torch.no_grad():
    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_test:
        batch_logits = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
        predictions.append(batch_logits.cpu().numpy())

# One row of logits per test sample, in loader order.
predictions = np.concatenate(predictions, axis=0)

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 80/80 [00:00<00:00, 284.70it/s]


In [None]:
from sklearn.metrics import classification_report

# Row-wise softmax turns the stored logits into class probabilities.
predictions_softmax = torch.softmax(torch.from_numpy(predictions), dim=1).numpy()

# Pick the most probable class and decode it to its string label.
predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.reshape(-1))

print(classification_report(test_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.91      0.72      0.80        74
    Relevant       0.79      0.94      0.86        86

    accuracy                           0.84       160
   macro avg       0.85      0.83      0.83       160
weighted avg       0.85      0.84      0.83       160



In [None]:
# Make the cell independent of the mode left behind by earlier cells.
comment_classification_Model.eval()

predictions = []
pbar_div = tqdm(div_dataloader_torch)
# Inference only: no autograd graph is needed for the forward passes.
with torch.no_grad():
    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_div:
        batch_logits = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
        predictions.append(batch_logits.cpu().numpy())

# One row of logits per div sample, in loader order.
predictions = np.concatenate(predictions, axis=0)

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 252/252 [00:00<00:00, 285.07it/s]


In [None]:
from sklearn.metrics import classification_report

# Row-wise softmax turns the stored logits into class probabilities.
predictions_softmax = torch.softmax(torch.from_numpy(predictions), dim=1).numpy()

# Pick the most probable class and decode it to its string label.
predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.reshape(-1))

print(classification_report(dataset_div["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.96      0.60      0.74       393
    Relevant       0.39      0.92      0.55       111

    accuracy                           0.67       504
   macro avg       0.68      0.76      0.65       504
weighted avg       0.84      0.67      0.70       504

