In [11]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pl
import re
from sklearn.metrics import classification_report
import bs4
from collections import defaultdict

WORKING_DIR='.'
DEVICE = 'cuda'

In [12]:
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient = 'index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient = 'index')

In [13]:
regex_link_ful = re.compile('<a href.*\/contest/.*/submission/.*<\/a>')
code_regex = re.compile('<code>(\s|.)*?<\/code>')
def preprocess_for_transfomers(texts, problems):
  preprocessed_texts = []
  for t, p in zip(texts, problems):
    t_codes = code_regex.sub(' (code) ', t)
    t_link = regex_link_ful.sub(f' (link to problem {p}) ', t_codes)
    bs = bs4.BeautifulSoup(t_link)
    preprocessed_texts.append(bs.text)
  
  return preprocessed_texts

dataset_educ.loc[~(dataset_educ['label'] == 'Irrelevant'), 'label'] = 'Relevant'
preprocessed_text = preprocess_for_transfomers(dataset_educ['text'], dataset_educ['problem'])
dataset_educ["preprocessed_text"] = preprocessed_text

dataset_div.loc[~(dataset_div['label'] == 'Irrelevant'), 'label'] = 'Relevant'
preprocessed_text = preprocess_for_transfomers(dataset_div['text'], dataset_div['problem'])
dataset_div["preprocessed_text"] = preprocessed_text

In [14]:
TREE_FATHER_PATH_LENGTH = 3

def compute_father_indices_pd(df):
  father_tree = {}

  for _,com in df.iterrows():
    father_tree[com.id] = com.father_id

  comment_father_indices = {}

  for df_index,com in df.iterrows():
      indices = []
      last_ind = com.id
      for i in range(TREE_FATHER_PATH_LENGTH):
        if(father_tree[last_ind] not in father_tree):
          indices.append(-1)
          continue
        if(last_ind != -1):
          last_ind = father_tree[last_ind]
        
        if(last_ind != -1):
          indices.append(df[df.id == last_ind].index.values[0])
        else:
          indices.append(last_ind)
      indices.reverse()
      comment_father_indices[df_index] = indices
  return comment_father_indices

father_indices_educ = compute_father_indices_pd(dataset_educ)
father_indices_div = compute_father_indices_pd(dataset_div)

In [15]:
from embeddings_generation import TokenizedDataset, LayerEMBTokenEmbeddingGeneration
from embeddings_generation.utils import *

huggingface_model_name = "roberta-base"
huggingface_model_name_alias = huggingface_model_name.split("/")[-1] +'_educ'

if(not embedding_already_persisted(huggingface_model_name_alias)):
    persist_embeddings(dataset_educ["preprocessed_text"], huggingface_model_name, huggingface_model_name_alias, dataset_educ.index)
    
embeddings_educ = load_embeddings(huggingface_model_name_alias)

huggingface_model_name_alias_div = huggingface_model_name.split("/")[-1] +'_div'
if(not embedding_already_persisted(huggingface_model_name_alias_div)):
    persist_embeddings(dataset_div["preprocessed_text"], huggingface_model_name, huggingface_model_name_alias_div, dataset_div.index)
embeddings_div = load_embeddings(huggingface_model_name_alias_div)

In [16]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
RANDOM_SEED = 443
DEVICE = "cuda"

torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

groups = list(dataset_educ.groupby(lambda k : k.split("?")[0]))
random.shuffle(groups)

train_groups = groups[:10]
validation_groups = groups[10:13]
test_groups = groups[13:]

train_educ_dataset= dataset_educ.loc[[idx for _, g in train_groups for idx in g.index.tolist()]]
val_educ_dataset = dataset_educ.loc[[idx for _, g in validation_groups for idx in g.index.tolist()]]
test_educ_dataset = dataset_educ.loc[[idx for _, g in test_groups for idx in g.index.tolist()]]

from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
train_encoded_labels = labelEncoder.fit_transform(train_educ_dataset["label"])
val_encoded_labels = labelEncoder.fit_transform(val_educ_dataset["label"])
test_encoded_labels = labelEncoder.fit_transform(test_educ_dataset["label"])

div_encoded_labels = labelEncoder.fit_transform(dataset_div["label"])

In [17]:
from augmentation_generation import get_weak_augmented_text
from augmentation_generation.utils import persist_augmentations, load_augmentations
NUMBER_AUGMENTATIONS = 6

huggingface_model_name_aug_alias = huggingface_model_name_alias + f'_aug_{NUMBER_AUGMENTATIONS}'
if(not embedding_already_persisted(huggingface_model_name_aug_alias)):
    augmented_train_text = get_weak_augmented_text(train_educ_dataset['preprocessed_text'], NUMBER_AUGMENTATIONS)
    augmented_train_text = [a_text for a_text_list in augmented_train_text for a_text in a_text_list]
    augmented_indices = []

    for idx in train_educ_dataset.index:
        for aug_idx in range(NUMBER_AUGMENTATIONS):
            augmented_indices.append(idx + '__' + str(aug_idx))

    persist_embeddings(augmented_train_text, huggingface_model_name, huggingface_model_name_aug_alias, augmented_indices)
    persist_augmentations(augmented_train_text, huggingface_model_name_aug_alias + "__text", augmented_indices)
    augmented_father_indices = {}
    for k in train_educ_dataset.index:
        v = father_indices_educ[k]
        for aug_ind in range(NUMBER_AUGMENTATIONS):
            aug_k = k + "__" + str(aug_ind)
            aug_s_f_indices = []
            for f_ind in father_indices_educ[k]:
                if(f_ind == -1):
                    aug_s_f_indices.append(-1)
                else:
                    random_aug_index = np.random.choice(range(NUMBER_AUGMENTATIONS))
                    aug_s_f_indices.append(f_ind + "__" + str(random_aug_index))
            augmented_father_indices[aug_k] = aug_s_f_indices

    persist_augmentations(augmented_father_indices.values(), huggingface_model_name_aug_alias + "__fathers", augmented_father_indices.keys())

augmented_train_embeddings = load_embeddings(huggingface_model_name_aug_alias)
augmented_train_text = load_augmentations(huggingface_model_name_aug_alias + "__text")
augmented_father_indices = load_augmentations(huggingface_model_name_aug_alias + "__fathers")

train_educ_aug_dataset = train_educ_dataset.copy()
train_educ_aug_dataset["aug_text"] = None

train_educ_aug_dataset_dict = train_educ_aug_dataset.to_dict(orient='index')

for aug_key in augmented_train_embeddings.keys():
    data = train_educ_dataset.loc[aug_key.split("__")[0]].copy()
    data["aug_text"] = augmented_train_text[aug_key]
    train_educ_aug_dataset_dict[aug_key] = data.to_dict()

train_educ_aug_dataset = pd.DataFrame.from_dict(train_educ_aug_dataset_dict, orient = 'index')
train_aug_encoded_labels = labelEncoder.fit_transform(train_educ_aug_dataset["label"])


In [18]:
for k,v in augmented_train_embeddings.items():
    embeddings_educ[k] = v
for k,v in augmented_father_indices.items():
    father_indices_educ[k] = v

In [19]:
class BlogCommentDataset(Dataset):
    def __init__(self, dataset:pd.DataFrame, embeddings:dict, father_indices:dict, num_last_layers_embeddings_agg ,labels):
        self.dataset = dataset
        self.embeddings = embeddings
        self.father_indices = father_indices
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg
        self.labels = labels
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, index):
        df_index = self.dataset.index[index]
        embedding = self._agg_emb(self.embeddings[df_index])
        fathers = self.father_indices[df_index]

        embedding_fathers = []
        masks = []
        for f_idx in fathers:
            if(f_idx == -1):
                embedding_fathers.append(torch.zeros(embedding.shape[0]))
                masks.append(1)
            else:
                embedding_fathers.append(self._agg_emb(self.embeddings[f_idx]))
                masks.append(0)

        embedding_fathers = torch.stack(embedding_fathers)
        return embedding.to(DEVICE), embedding_fathers.to(DEVICE), torch.tensor(masks, dtype = torch.float32).to(DEVICE), torch.tensor(self.labels[index], dtype = torch.long).to(DEVICE)
    
    def _agg_emb(self, embedding):
        embedding = np.array(embedding, dtype=np.float32)
        embedding = embedding[0, -self.num_last_layers_embeddings_agg:, :].mean(0)
        return torch.from_numpy(embedding)

train_torch_dataset = BlogCommentDataset(train_educ_aug_dataset, embeddings_educ, father_indices_educ, 3, train_aug_encoded_labels)
train_torch_dataloader = DataLoader(train_torch_dataset, 2, shuffle=True)

val_torch_dataset = BlogCommentDataset(val_educ_dataset, embeddings_educ, father_indices_educ, 3, val_encoded_labels)
val_torch_dataloader = DataLoader(val_torch_dataset, 2, shuffle=False)

test_torch_dataset = BlogCommentDataset(test_educ_dataset, embeddings_educ, father_indices_educ, 3, test_encoded_labels)
test_torch_dataloader = DataLoader(test_torch_dataset, 2, shuffle=False)

div_dataset_torch = BlogCommentDataset(dataset_div, embeddings_div, father_indices_div, 3, div_encoded_labels)
div_dataloader_torch = DataLoader(div_dataset_torch, 2, shuffle=False)

In [20]:
from tqdm import tqdm
import math
class CommentClassificationModel(torch.nn.Module):
    def __init__(self, nrLabels):
        super(CommentClassificationModel, self).__init__()
        self.k = torch.nn.Linear(768, 128)
        self.q = torch.nn.Linear(768, 128)
        self.v = torch.nn.Linear(768, 128) 

        self.comment_proj = torch.nn.Linear(768, 128) 
        self.relu = torch.nn.ReLU()
        self.output = torch.nn.Linear(128, nrLabels) 
        self.hidden1 = torch.nn.Linear(128 * 2, 128)  
        self.dropout = torch.nn.Dropout(0.6)  

    def forward(self, x, fathers_x, mask):
        key = self.k(x)
        queries = self.q(fathers_x)
        values = self.v(fathers_x)

        key = torch.unsqueeze(key, -1)

        e_t = torch.bmm(queries, key)  / math.sqrt(128)
        e_t = torch.squeeze(e_t, -1)

        e_t = e_t + mask * -2e9
        a_t = torch.nn.Softmax()(e_t)

        a_t = torch.unsqueeze(a_t, -1)
        average_att = torch.bmm(a_t.permute(0, 2, 1), values)
        average_att = average_att.squeeze(1)
        average_att = average_att

        com_proj = self.comment_proj(x)

        h1 = self.dropout(self.relu(torch.cat([average_att, com_proj], -1)))
        h2 = self.dropout(self.relu(self.hidden1(h1)))

        return self.output(h2)

comment_classification_Model = CommentClassificationModel(len(labelEncoder.classes_))
comment_classification_Model.to(DEVICE)



criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(comment_classification_Model.parameters())
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,len(train_torch_dataloader) * 5, 2)
nr_epochs = 50
current_step = 0
best_model_loss = 1e9
for epoch in range(nr_epochs):
  pbar_training = tqdm(train_torch_dataloader)
  training_average_loss = 0
  training_nr_batches = 0
  comment_classification_Model.train()
  iters = len(pbar_training)
  
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_training:
    optimizer.zero_grad()
    yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
    loss = criterion(yhat, labels_batch)
    loss.backward()
    optimizer.step()
    pbar_training.set_postfix({'loss': loss.cpu().detach().numpy()})

    training_average_loss += loss.cpu().detach().numpy()
    training_nr_batches+=1
    current_step+=1
    scheduler.step()

  pbar_validation = tqdm(val_torch_dataloader)

  validation_average_loss = 0
  validation_nr_batches = 0
  comment_classification_Model.eval()
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
    with torch.no_grad():
      yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
      loss = criterion(yhat, labels_batch)

      validation_average_loss += loss.cpu().detach().numpy()
      validation_nr_batches+=1

      pbar_validation.set_postfix({'loss': loss.cpu().detach().numpy()})
      
  print(f'Epoch {epoch + 1} has training loss: {training_average_loss / training_nr_batches}')
  print(f'Epoch {epoch + 1} has validation loss: {validation_average_loss / validation_nr_batches}')

  if(validation_average_loss / validation_nr_batches < best_model_loss):
     best_model_loss = validation_average_loss / validation_nr_batches
     print(f'Best loss at epoch {epoch}')
     torch.save(comment_classification_Model, f'best_models/{huggingface_model_name_aug_alias}.pkl')


  0%|          | 0/2069 [00:00<?, ?it/s]

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 2069/2069 [00:17<00:00, 117.39it/s, loss=0.046730448]
100%|██████████| 93/93 [00:00<00:00, 195.37it/s, loss=0.61024123] 


Epoch 1 has training loss: 0.6329784024472598
Epoch 1 has validation loss: 0.5154596518885385
Best loss at epoch 0


100%|██████████| 2069/2069 [00:16<00:00, 122.30it/s, loss=0.19067053]  
100%|██████████| 93/93 [00:00<00:00, 196.20it/s, loss=0.9150687] 


Epoch 2 has training loss: 0.5411846857044644
Epoch 2 has validation loss: 0.5055979445817009
Best loss at epoch 1


100%|██████████| 2069/2069 [00:17<00:00, 117.47it/s, loss=0.83393735] 
100%|██████████| 93/93 [00:00<00:00, 202.17it/s, loss=0.8424709] 


Epoch 3 has training loss: 0.5130117024931293
Epoch 3 has validation loss: 0.4535832258882702
Best loss at epoch 2


100%|██████████| 2069/2069 [00:16<00:00, 122.20it/s, loss=1.5653766]   
100%|██████████| 93/93 [00:00<00:00, 203.50it/s, loss=0.859815]  


Epoch 4 has training loss: 0.4822380084041623
Epoch 4 has validation loss: 0.4556656507074192


100%|██████████| 2069/2069 [00:17<00:00, 120.96it/s, loss=1.4341284]  
100%|██████████| 93/93 [00:00<00:00, 196.62it/s, loss=0.9156595]  


Epoch 5 has training loss: 0.4693768980340409
Epoch 5 has validation loss: 0.45421733300612177


100%|██████████| 2069/2069 [00:17<00:00, 120.64it/s, loss=0.00021217002]
100%|██████████| 93/93 [00:00<00:00, 200.01it/s, loss=1.12305]   


Epoch 6 has training loss: 0.5166684461274859
Epoch 6 has validation loss: 0.5097721792977824


100%|██████████| 2069/2069 [00:16<00:00, 122.04it/s, loss=1.7714907]   
100%|██████████| 93/93 [00:00<00:00, 184.16it/s, loss=1.1325566] 


Epoch 7 has training loss: 0.499275424096199
Epoch 7 has validation loss: 0.48351529140466004


100%|██████████| 2069/2069 [00:17<00:00, 121.37it/s, loss=1.0323224]    
100%|██████████| 93/93 [00:00<00:00, 200.00it/s, loss=0.64423335] 


Epoch 8 has training loss: 0.4885072064952813
Epoch 8 has validation loss: 0.4266463197737692
Best loss at epoch 7


100%|██████████| 2069/2069 [00:17<00:00, 117.66it/s, loss=1.4004345]    
100%|██████████| 93/93 [00:00<00:00, 202.18it/s, loss=1.0947354]  


Epoch 9 has training loss: 0.4639858078269713
Epoch 9 has validation loss: 0.46754161479397205


100%|██████████| 2069/2069 [00:16<00:00, 122.09it/s, loss=0.0068159937] 
100%|██████████| 93/93 [00:00<00:00, 195.38it/s, loss=0.663597]   


Epoch 10 has training loss: 0.4520333637905404
Epoch 10 has validation loss: 0.41600639934455025
Best loss at epoch 9


100%|██████████| 2069/2069 [00:17<00:00, 116.28it/s, loss=0.6024997]    
100%|██████████| 93/93 [00:00<00:00, 195.37it/s, loss=0.9865117]  


Epoch 11 has training loss: 0.43448219182494807
Epoch 11 has validation loss: 0.4519348720624362


100%|██████████| 2069/2069 [00:17<00:00, 119.44it/s, loss=0.25704142]   
100%|██████████| 93/93 [00:00<00:00, 196.20it/s, loss=1.0033939]  


Epoch 12 has training loss: 0.4126372154441388
Epoch 12 has validation loss: 0.4470329885242722


100%|██████████| 2069/2069 [00:17<00:00, 120.24it/s, loss=0.00019190853] 
100%|██████████| 93/93 [00:00<00:00, 197.45it/s, loss=0.7393923]  


Epoch 13 has training loss: 0.40606832675119703
Epoch 13 has validation loss: 0.42645342298165295


100%|██████████| 2069/2069 [00:17<00:00, 120.30it/s, loss=0.9638459]    
100%|██████████| 93/93 [00:00<00:00, 201.74it/s, loss=0.83160865] 


Epoch 14 has training loss: 0.39286617748639086
Epoch 14 has validation loss: 0.43705851703005033


100%|██████████| 2069/2069 [00:16<00:00, 121.82it/s, loss=9.6316464e-05]
100%|██████████| 93/93 [00:00<00:00, 197.04it/s, loss=0.817937]   


Epoch 15 has training loss: 0.39435727115056396
Epoch 15 has validation loss: 0.4348507435098591


100%|██████████| 2069/2069 [00:17<00:00, 117.28it/s, loss=1.2278481e-05] 
100%|██████████| 93/93 [00:00<00:00, 194.97it/s, loss=1.0812604]  


Epoch 16 has training loss: 0.46503536644138604
Epoch 16 has validation loss: 0.45423829153252726


100%|██████████| 2069/2069 [00:17<00:00, 121.42it/s, loss=0.7237747]    
100%|██████████| 93/93 [00:00<00:00, 185.63it/s, loss=0.8723322]  


Epoch 17 has training loss: 0.46839781591434754
Epoch 17 has validation loss: 0.44723978933012326


100%|██████████| 2069/2069 [00:16<00:00, 121.79it/s, loss=0.049937956]  
100%|██████████| 93/93 [00:00<00:00, 195.79it/s, loss=1.2111782]  


Epoch 18 has training loss: 0.46834261784402126
Epoch 18 has validation loss: 0.469485991277046


100%|██████████| 2069/2069 [00:17<00:00, 121.06it/s, loss=0.6848122]    
100%|██████████| 93/93 [00:00<00:00, 187.12it/s, loss=0.743589]  


Epoch 19 has training loss: 0.4624214649834944
Epoch 19 has validation loss: 0.43020650158293283


100%|██████████| 2069/2069 [00:17<00:00, 121.01it/s, loss=0.0468582]    
100%|██████████| 93/93 [00:00<00:00, 193.75it/s, loss=0.7497124]  


Epoch 20 has training loss: 0.44073841052103757
Epoch 20 has validation loss: 0.4261224631693665


100%|██████████| 2069/2069 [00:16<00:00, 122.80it/s, loss=0.025080148]  
100%|██████████| 93/93 [00:00<00:00, 183.08it/s, loss=0.83633524]


Epoch 21 has training loss: 0.44302648242743503
Epoch 21 has validation loss: 0.42954178613870936


100%|██████████| 2069/2069 [00:17<00:00, 120.85it/s, loss=0.039344233]  
100%|██████████| 93/93 [00:00<00:00, 206.21it/s, loss=0.775309]   


Epoch 22 has training loss: 0.43928485641014603
Epoch 22 has validation loss: 0.41349904838798757
Best loss at epoch 21


100%|██████████| 2069/2069 [00:17<00:00, 121.02it/s, loss=1.0605841]    
100%|██████████| 93/93 [00:00<00:00, 200.43it/s, loss=0.71841407] 


Epoch 23 has training loss: 0.42422237370680416
Epoch 23 has validation loss: 0.4198568281864389


100%|██████████| 2069/2069 [00:17<00:00, 119.92it/s, loss=1.4778235]    
100%|██████████| 93/93 [00:00<00:00, 196.62it/s, loss=0.6668193]  


Epoch 24 has training loss: 0.4059566288181583
Epoch 24 has validation loss: 0.39352160143423176
Best loss at epoch 23


100%|██████████| 2069/2069 [00:16<00:00, 122.17it/s, loss=4.2915253e-06]
100%|██████████| 93/93 [00:00<00:00, 200.44it/s, loss=0.9116162]  


Epoch 25 has training loss: 0.4047614070473713
Epoch 25 has validation loss: 0.43053368286704213


100%|██████████| 2069/2069 [00:16<00:00, 121.99it/s, loss=0.53016496]   
100%|██████████| 93/93 [00:00<00:00, 186.37it/s, loss=0.43191597] 


Epoch 26 has training loss: 0.3839980271962116
Epoch 26 has validation loss: 0.38928979783448997
Best loss at epoch 25


100%|██████████| 2069/2069 [00:17<00:00, 118.09it/s, loss=0.0043971282] 
100%|██████████| 93/93 [00:00<00:00, 200.00it/s, loss=0.7102919]  


Epoch 27 has training loss: 0.37184477121908904
Epoch 27 has validation loss: 0.4092029131628493


100%|██████████| 2069/2069 [00:17<00:00, 121.60it/s, loss=2.4621608]    
100%|██████████| 93/93 [00:00<00:00, 195.39it/s, loss=0.66956913] 


Epoch 28 has training loss: 0.35918907789124815
Epoch 28 has validation loss: 0.4016524632411298


100%|██████████| 2069/2069 [00:17<00:00, 118.43it/s, loss=0.032454833]   
100%|██████████| 93/93 [00:00<00:00, 182.71it/s, loss=0.7484741]  


Epoch 29 has training loss: 0.3484961231518187
Epoch 29 has validation loss: 0.43458493085958627


100%|██████████| 2069/2069 [00:17<00:00, 120.35it/s, loss=0.00022146634]
100%|██████████| 93/93 [00:00<00:00, 203.95it/s, loss=0.6329495]  


Epoch 30 has training loss: 0.34753220779079275
Epoch 30 has validation loss: 0.4271072796796278


100%|██████████| 2069/2069 [00:17<00:00, 120.74it/s, loss=0.1737582]    
100%|██████████| 93/93 [00:00<00:00, 198.74it/s, loss=0.60708344] 


Epoch 31 has training loss: 0.32764581636636136
Epoch 31 has validation loss: 0.4202563592022465


100%|██████████| 2069/2069 [00:17<00:00, 120.47it/s, loss=1.5059582]    
100%|██████████| 93/93 [00:00<00:00, 193.35it/s, loss=0.5950135]  


Epoch 32 has training loss: 0.3180299409059541
Epoch 32 has validation loss: 0.4203564209402889


100%|██████████| 2069/2069 [00:16<00:00, 122.05it/s, loss=0.59527916]   
100%|██████████| 93/93 [00:00<00:00, 195.38it/s, loss=0.5801713]  


Epoch 33 has training loss: 0.30923456149854506
Epoch 33 has validation loss: 0.4267657397174707


100%|██████████| 2069/2069 [00:17<00:00, 119.94it/s, loss=0.032503996]  
100%|██████████| 93/93 [00:00<00:00, 187.89it/s, loss=0.6249304]  


Epoch 34 has training loss: 0.31019441793117747
Epoch 34 has validation loss: 0.4369011488092202


100%|██████████| 2069/2069 [00:17<00:00, 120.95it/s, loss=0.9065957]     
100%|██████████| 93/93 [00:00<00:00, 194.97it/s, loss=0.62383014] 


Epoch 35 has training loss: 0.3044760051119572
Epoch 35 has validation loss: 0.4367267416129189


100%|██████████| 2069/2069 [00:16<00:00, 122.25it/s, loss=0.50583535]   
100%|██████████| 93/93 [00:00<00:00, 191.76it/s, loss=0.5549625] 


Epoch 36 has training loss: 0.42229303429655063
Epoch 36 has validation loss: 0.42844822281791195


100%|██████████| 2069/2069 [00:17<00:00, 120.19it/s, loss=0.0038183653] 
100%|██████████| 93/93 [00:00<00:00, 201.29it/s, loss=0.5980085]  


Epoch 37 has training loss: 0.41300081109650383
Epoch 37 has validation loss: 0.39163089138972734


100%|██████████| 2069/2069 [00:17<00:00, 121.13it/s, loss=0.9691363]    
100%|██████████| 93/93 [00:00<00:00, 191.75it/s, loss=0.48146075] 


Epoch 38 has training loss: 0.4241822880634243
Epoch 38 has validation loss: 0.4202878222430247


100%|██████████| 2069/2069 [00:16<00:00, 122.09it/s, loss=1.0513179]     
100%|██████████| 93/93 [00:00<00:00, 196.62it/s, loss=0.90819585] 


Epoch 39 has training loss: 0.4263243692473342
Epoch 39 has validation loss: 0.4175748481385114


100%|██████████| 2069/2069 [00:17<00:00, 115.86it/s, loss=2.430762]     
100%|██████████| 93/93 [00:00<00:00, 197.87it/s, loss=0.64406073] 


Epoch 40 has training loss: 0.414577360275084
Epoch 40 has validation loss: 0.4001615631884773


100%|██████████| 2069/2069 [00:16<00:00, 122.38it/s, loss=0.16704556]   
100%|██████████| 93/93 [00:00<00:00, 200.86it/s, loss=1.3396175]  


Epoch 41 has training loss: 0.4115724513499189
Epoch 41 has validation loss: 0.49156797809431035


100%|██████████| 2069/2069 [00:17<00:00, 118.69it/s, loss=2.0471919]    
100%|██████████| 93/93 [00:00<00:00, 204.39it/s, loss=0.7327337]  


Epoch 42 has training loss: 0.4096713336074445
Epoch 42 has validation loss: 0.41345403988352014


100%|██████████| 2069/2069 [00:17<00:00, 121.43it/s, loss=0.01331909]   
100%|██████████| 93/93 [00:00<00:00, 198.72it/s, loss=0.38506144] 


Epoch 43 has training loss: 0.40528110378245125
Epoch 43 has validation loss: 0.40944938902412675


100%|██████████| 2069/2069 [00:17<00:00, 121.03it/s, loss=0.04870962]   
100%|██████████| 93/93 [00:00<00:00, 181.29it/s, loss=0.5919844]  


Epoch 44 has training loss: 0.40377248866653903
Epoch 44 has validation loss: 0.39345014411754076


100%|██████████| 2069/2069 [00:17<00:00, 118.62it/s, loss=0.120460436]  
100%|██████████| 93/93 [00:00<00:00, 198.30it/s, loss=1.1966572]  


Epoch 45 has training loss: 0.3916618477673487
Epoch 45 has validation loss: 0.4364128135224823


100%|██████████| 2069/2069 [00:17<00:00, 121.28it/s, loss=0.2914714]    
100%|██████████| 93/93 [00:00<00:00, 198.72it/s, loss=0.6626829]  


Epoch 46 has training loss: 0.39103037711847155
Epoch 46 has validation loss: 0.3901931390126177


100%|██████████| 2069/2069 [00:17<00:00, 118.66it/s, loss=0.33881065]   
100%|██████████| 93/93 [00:00<00:00, 182.71it/s, loss=0.78542185] 


Epoch 47 has training loss: 0.37854486611506705
Epoch 47 has validation loss: 0.4257280732833537


100%|██████████| 2069/2069 [00:17<00:00, 120.51it/s, loss=1.0717589]    
100%|██████████| 93/93 [00:00<00:00, 200.43it/s, loss=0.46490565] 


Epoch 48 has training loss: 0.36651314700548115
Epoch 48 has validation loss: 0.3878204003376983
Best loss at epoch 47


100%|██████████| 2069/2069 [00:16<00:00, 121.87it/s, loss=1.3602097]    
100%|██████████| 93/93 [00:00<00:00, 187.50it/s, loss=0.82811654] 


Epoch 49 has training loss: 0.3679430160663749
Epoch 49 has validation loss: 0.41996089108688855


100%|██████████| 2069/2069 [00:17<00:00, 115.30it/s, loss=0.04827489]   
100%|██████████| 93/93 [00:00<00:00, 196.62it/s, loss=0.8825129]  

Epoch 50 has training loss: 0.3666120948900162
Epoch 50 has validation loss: 0.41131433017391655





In [21]:
comment_classification_Model= torch.load(f'best_models/{huggingface_model_name_aug_alias}.pkl')
comment_classification_Model.to(DEVICE)
comment_classification_Model.eval()
predictions = []
pbar_validation = tqdm(val_torch_dataloader)
for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
  predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

predictions = np.stack([e for b in predictions for e in b.cpu().detach().numpy()])

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 93/93 [00:00<00:00, 318.49it/s]


In [22]:
predictions_softmax = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions)).detach().numpy()

predictions_indices = np.argmax(predictions_softmax, axis = 1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

print(classification_report(val_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.85      0.69      0.76        74
    Relevant       0.82      0.92      0.87       112

    accuracy                           0.83       186
   macro avg       0.83      0.80      0.81       186
weighted avg       0.83      0.83      0.82       186



In [23]:
predictions = []
pbar_test = tqdm(test_torch_dataloader)
for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_test:
  predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

predictions = np.stack([e for b in predictions for e in b.cpu().detach().numpy()])

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 80/80 [00:00<00:00, 307.72it/s]


In [24]:
predictions_softmax = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions)).detach().numpy()

predictions_indices = np.argmax(predictions_softmax, axis = 1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

print(classification_report(test_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.88      0.80      0.84        74
    Relevant       0.84      0.91      0.87        86

    accuracy                           0.86       160
   macro avg       0.86      0.85      0.85       160
weighted avg       0.86      0.86      0.86       160



In [25]:
predictions = []
pbar_div = tqdm(div_dataloader_torch)
for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_div:
  predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

predictions = np.stack([e for b in predictions for e in b.cpu().detach().numpy()])

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 252/252 [00:00<00:00, 280.30it/s]


In [26]:
predictions_softmax = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions)).detach().numpy()

predictions_indices = np.argmax(predictions_softmax, axis = 1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

print(classification_report(dataset_div["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.97      0.59      0.73       393
    Relevant       0.39      0.93      0.55       111

    accuracy                           0.66       504
   macro avg       0.68      0.76      0.64       504
weighted avg       0.84      0.66      0.69       504

