In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pl
import re
from sklearn.metrics import classification_report
import bs4
from collections import defaultdict

# Directory that the dataset JSON files are read from (relative to the notebook).
WORKING_DIR='.'
# Torch device name. NOTE(review): DEVICE is assigned again in a later cell
# (next to RANDOM_SEED), so this first assignment is effectively a placeholder.
DEVICE = 'cuda'

In [17]:
# Both corpora are read with orient='index', i.e. the top-level JSON keys
# become the DataFrame row labels.
educ_json_path = f'{WORKING_DIR}/Dataset_educ_1.1.json'
div_json_path = f'{WORKING_DIR}/Dataset_div2_final.json'

dataset_educ = pd.read_json(educ_json_path, orient='index')
dataset_div = pd.read_json(div_json_path, orient='index')

In [18]:
# Matches one complete anchor element whose opening tag contains a
# ".../contest/.../submission/..." URL. The attribute part is limited to
# "[^>]*" and the link text is matched lazily, so two submission links on the
# same line are replaced separately instead of swallowing the text between them.
regex_link_ful = re.compile(r'<a href[^>]*/contest/[^>]*/submission/[^>]*>.*?</a>')
# Matches a <code>...</code> element, including contents that span several lines.
# DOTALL replaces the former "(\s|.)*?" alternation, which matched the same
# strings but backtracks heavily on long inputs.
code_regex = re.compile(r'<code>.*?</code>', re.DOTALL)


def preprocess_for_transfomers(texts, problems):
  """Turn raw HTML comments into plain text suitable for a tokenizer.

  For every (text, problem) pair:
    1. each <code>...</code> element is replaced by the placeholder ' (code) ';
    2. each submission link is replaced by ' (link to problem <problem>) ';
    3. the remaining markup is stripped with BeautifulSoup and only the text
       content is kept.

  Args:
    texts: iterable of HTML strings.
    problems: iterable, parallel to ``texts``, of the problem identifier to
      mention in the link placeholder.

  Returns:
    list[str]: one cleaned string per input pair, in input order.
  """
  preprocessed_texts = []
  for text, problem in zip(texts, problems):
    link_placeholder = f' (link to problem {problem}) '
    without_code = code_regex.sub(' (code) ', text)
    # A callable replacement keeps backslashes or "\g<...>" sequences inside
    # the problem identifier from being interpreted as regex template escapes.
    without_links = regex_link_ful.sub(lambda _match: link_placeholder, without_code)
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), which makes the output environment-dependent.
    soup = bs4.BeautifulSoup(without_links, 'html.parser')
    preprocessed_texts.append(soup.text)

  return preprocessed_texts

# Binarise the labels (everything that is not 'Irrelevant' becomes 'Relevant')
# and attach the cleaned comment text, first for the educational corpus and
# then for the div corpus.
for frame in (dataset_educ, dataset_div):
  not_irrelevant = ~frame['label'].eq('Irrelevant')
  frame.loc[not_irrelevant, 'label'] = 'Relevant'

  preprocessed_text = preprocess_for_transfomers(frame['text'], frame['problem'])
  frame["preprocessed_text"] = preprocessed_text

In [19]:
# Default number of ancestor slots recorded for every comment.
TREE_FATHER_PATH_LENGTH = 3


def compute_father_indices_pd(df, path_length=None):
  """Map every row of ``df`` to the row labels of its nearest ancestors.

  The frame must have an ``id`` column and a ``father_id`` column. For each row
  the chain id -> father_id -> father's father_id ... is followed for
  ``path_length`` steps. Each step contributes the row label (``df.index``
  value) of the ancestor reached, or ``-1`` when the current comment's father
  id does not occur in the ``id`` column. The collected list is reversed before
  being stored, so the direct father is the LAST element.

  Lookups use dictionaries built in a single pass instead of filtering the
  frame for every ancestor, which keeps the whole computation linear in the
  number of rows.

  Args:
    df: DataFrame with ``id`` and ``father_id`` columns.
    path_length: number of ancestor slots per comment; defaults to
      ``TREE_FATHER_PATH_LENGTH`` when omitted.

  Returns:
    dict mapping each row label of ``df`` to a list of ``path_length`` entries
    (row labels or ``-1``).
  """
  if path_length is None:
    path_length = TREE_FATHER_PATH_LENGTH

  father_tree = {}   # comment id -> father id (last occurrence wins)
  index_of_id = {}   # comment id -> row label of its first occurrence
  for df_index, comment_id, father_id in zip(df.index, df['id'], df['father_id']):
    father_tree[comment_id] = father_id
    index_of_id.setdefault(comment_id, df_index)

  comment_father_indices = {}
  for df_index, comment_id in zip(df.index, df['id']):
    indices = []
    last_ind = comment_id
    for _ in range(path_length):
      father_id = father_tree[last_ind]
      if father_id not in father_tree:
        # No known father: pad this slot; last_ind stays put, so every
        # remaining slot is padded as well.
        indices.append(-1)
        continue
      # -1 is treated as a sentinel id and is never walked through.
      if last_ind != -1:
        last_ind = father_id
      indices.append(index_of_id[last_ind] if last_ind != -1 else -1)
    indices.reverse()
    comment_father_indices[df_index] = indices
  return comment_father_indices

# Ancestor row labels for every comment, computed once per corpus.
father_indices_educ, father_indices_div = (
    compute_father_indices_pd(frame) for frame in (dataset_educ, dataset_div)
)

In [20]:
from embeddings_generation import TokenizedDataset, LayerEMBTokenEmbeddingGeneration
from embeddings_generation.utils import *

huggingface_model_name = "bert-base-cased"
# Last path component of the model name, used as the prefix of the aliases
# under which embeddings are persisted.
model_short_name = huggingface_model_name.split("/")[-1]

# Educational corpus: compute the embeddings only when they are not persisted
# yet, then load them. (The three helpers are presumably provided by the
# star-import from embeddings_generation.utils.)
huggingface_model_name_alias_educ = model_short_name +'_educ'
educ_embeddings_cached = embedding_already_persisted(huggingface_model_name_alias_educ)
if not educ_embeddings_cached:
    persist_embeddings(
        dataset_educ["preprocessed_text"],
        huggingface_model_name,
        huggingface_model_name_alias_educ,
        dataset_educ.index,
    )
embeddings_educ = load_embeddings(huggingface_model_name_alias_educ)

# Div corpus: same procedure under its own alias.
huggingface_model_name_alias_div = model_short_name +'_div'
div_embeddings_cached = embedding_already_persisted(huggingface_model_name_alias_div)
if not div_embeddings_cached:
    persist_embeddings(
        dataset_div["preprocessed_text"],
        huggingface_model_name,
        huggingface_model_name_alias_div,
        dataset_div.index,
    )
embeddings_div = load_embeddings(huggingface_model_name_alias_div)

In [21]:
# Tokenisation statistics of the cleaned educational comments for the chosen model.
statistics = determine_tokens_statistics(dataset_educ["preprocessed_text"], huggingface_model_name)

# Ratio of the "split at least twice" word count to the whole-word count, per row.
split_word_counts = statistics["nrWordsSplitAtleastTwice"]
whole_word_counts = statistics["nrWholeWords"]
statistics["ratioNotWholeWords"] = split_word_counts / whole_word_counts

# Last expression of the cell: rendered as the summary table.
statistics.describe()

Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,nrTokens,nrWholeWords,maxWordSplit,nrWordsSplitAtleastTwice,ratioNotWholeWords
count,937.0,937.0,937.0,937.0,937.0
mean,74.66809,70.043757,1.469584,3.155816,0.05639
std,129.464346,122.61402,1.337087,6.083399,0.071372
min,3.0,3.0,0.0,0.0,0.0
25%,17.0,16.0,0.0,0.0,0.0
50%,32.0,30.0,1.0,1.0,0.041667
75%,74.0,68.0,2.0,4.0,0.075758
max,1424.0,1373.0,11.0,93.0,0.571429


In [22]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
# Seed shared by every random number generator used in the notebook.
RANDOM_SEED = 443
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA instead of failing on `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed torch, numpy and the stdlib generator (the latter drives the group shuffle).
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Group the rows by the part of their index label that precedes the first "?",
# so that all rows sharing that prefix end up in the same split.
groups = list(dataset_educ.groupby(lambda row_label: row_label.split("?")[0]))
random.shuffle(groups)

# First 10 shuffled groups train, next 3 validate, the rest test.
train_groups, validation_groups, test_groups = groups[:10], groups[10:13], groups[13:]


def _rows_of_groups(group_list):
  """Return the dataset_educ rows of the given (key, frame) groups, in group order."""
  row_labels = []
  for _, group_frame in group_list:
    row_labels.extend(group_frame.index.tolist())
  return dataset_educ.loc[row_labels]


train_educ_dataset = _rows_of_groups(train_groups)
val_educ_dataset = _rows_of_groups(validation_groups)
test_educ_dataset = _rows_of_groups(test_groups)

from sklearn.preprocessing import LabelEncoder

# Fit the encoder ONCE, on the training labels, and only `transform` the other
# splits. Re-fitting on every split (fit_transform) rebuilds the class -> integer
# mapping from whatever labels that split contains, so a split missing a class
# would silently get different integer codes than the ones the model was
# trained with. With `transform`, an unseen label raises a ValueError instead.
labelEncoder = LabelEncoder()
train_encoded_labels = labelEncoder.fit_transform(train_educ_dataset["label"])
val_encoded_labels = labelEncoder.transform(val_educ_dataset["label"])
test_encoded_labels = labelEncoder.transform(test_educ_dataset["label"])

div_encoded_labels = labelEncoder.transform(dataset_div["label"])


In [23]:
class BlogCommentDataset(Dataset):
    """Torch dataset yielding a comment embedding together with its ancestors' embeddings.

    Args:
        dataset: DataFrame whose index labels are the lookup keys into
            ``embeddings`` and ``father_indices``.
        embeddings: mapping from row label to a stored embedding; each value is
            indexed as ``[0, -n:, :]``, so it must be array-like with at least
            three axes.
        father_indices: mapping from row label to a list of ancestor row labels,
            with ``-1`` marking a missing ancestor.
        num_last_layers_embeddings_agg: ``n`` -- how many trailing entries along
            the second axis of a stored embedding are averaged.
        labels: integer-indexable sequence of class ids, positionally aligned
            with ``dataset``.
    """

    def __init__(self, dataset:pd.DataFrame, embeddings:dict, father_indices:dict, num_last_layers_embeddings_agg ,labels):
        self.dataset = dataset
        self.embeddings = embeddings
        self.father_indices = father_indices
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg
        self.labels = labels

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(embedding, father_embeddings, mask, label)`` for position ``index``.

        ``mask`` holds 1.0 for every ancestor slot that is missing (its
        embedding is an all-zero vector) and 0.0 for every real ancestor.
        All four tensors are moved to ``DEVICE``.
        """
        row_label = self.dataset.index[index]
        comment_vector = self._agg_emb(self.embeddings[row_label])

        father_vectors = []
        missing_flags = []
        for father_label in self.father_indices[row_label]:
            is_missing = father_label == -1
            if is_missing:
                # Placeholder with the same length as the comment vector.
                father_vector = torch.zeros(comment_vector.shape[0])
            else:
                father_vector = self._agg_emb(self.embeddings[father_label])
            father_vectors.append(father_vector)
            missing_flags.append(1 if is_missing else 0)

        stacked_fathers = torch.stack(father_vectors)
        mask_tensor = torch.tensor(missing_flags, dtype = torch.float32)
        label_tensor = torch.tensor(self.labels[index], dtype = torch.long)
        return (
            comment_vector.to(DEVICE),
            stacked_fathers.to(DEVICE),
            mask_tensor.to(DEVICE),
            label_tensor.to(DEVICE),
        )

    def _agg_emb(self, embedding):
        """Average the last ``n`` entries along axis 1 of element 0 of ``embedding``.

        The input is converted to a float32 array first; the result is returned
        as a torch tensor.
        """
        as_array = np.array(embedding, dtype=np.float32)
        last_entries = as_array[0, -self.num_last_layers_embeddings_agg:, :]
        averaged = last_entries.mean(axis=0)
        return torch.from_numpy(averaged)

def _dataset_and_loader(frame, embeddings, father_indices, labels, shuffle):
    """Build a BlogCommentDataset (3 aggregated entries) and a batch-size-2 DataLoader over it."""
    torch_dataset = BlogCommentDataset(frame, embeddings, father_indices, 3, labels)
    return torch_dataset, DataLoader(torch_dataset, 2, shuffle=shuffle)


# Only the training loader shuffles; the evaluation loaders keep row order so
# that predictions line up with the label columns.
train_torch_dataset, train_torch_dataloader = _dataset_and_loader(
    train_educ_dataset, embeddings_educ, father_indices_educ, train_encoded_labels, True)

val_torch_dataset, val_torch_dataloader = _dataset_and_loader(
    val_educ_dataset, embeddings_educ, father_indices_educ, val_encoded_labels, False)

test_torch_dataset, test_torch_dataloader = _dataset_and_loader(
    test_educ_dataset, embeddings_educ, father_indices_educ, test_encoded_labels, False)

div_dataset_torch, div_dataloader_torch = _dataset_and_loader(
    dataset_div, embeddings_div, father_indices_div, div_encoded_labels, False)


In [24]:
from tqdm import tqdm
import math
class CommentClassificationModel(torch.nn.Module):
    """Classify a comment from its embedding and an attention summary of its ancestors.

    The comment embedding is projected to a vector that is scored against a
    projection of every ancestor embedding (scaled dot product); the softmax of
    the scores weights a second projection of the ancestors. That weighted sum
    is concatenated with a projection of the comment itself and passed through
    ReLU/dropout, one hidden layer, and the output layer.

    Args:
        nrLabels: number of output classes.
        embedding_dim: size of the input embeddings (default 768).
        hidden_dim: size of all internal projections (default 128).
        dropout: dropout probability applied after both ReLUs (default 0.7).
    """

    def __init__(self, nrLabels, embedding_dim=768, hidden_dim=128, dropout=0.7):
        super().__init__()
        # Attribute names and creation order are kept as they were, so
        # state_dict keys and seeded initialisation stay the same.
        self.k = torch.nn.Linear(embedding_dim, hidden_dim)
        self.q = torch.nn.Linear(embedding_dim, hidden_dim)
        self.v = torch.nn.Linear(embedding_dim, hidden_dim)

        self.comment_proj = torch.nn.Linear(embedding_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.output = torch.nn.Linear(hidden_dim, nrLabels)
        self.hidden1 = torch.nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, fathers_x, mask):
        """Compute class logits.

        Args:
            x: comment embeddings, shape (batch, embedding_dim).
            fathers_x: ancestor embeddings, shape (batch, n_fathers, embedding_dim).
            mask: shape (batch, n_fathers); a 1 removes that ancestor from the
                attention (its score is pushed to about -2e9), a 0 keeps it.

        Returns:
            Tensor of shape (batch, nrLabels) with unnormalised logits.
        """
        key = self.k(x).unsqueeze(-1)          # (batch, hidden, 1)
        queries = self.q(fathers_x)            # (batch, n_fathers, hidden)
        values = self.v(fathers_x)             # (batch, n_fathers, hidden)

        # Scaled dot product between every ancestor and the comment. The scale
        # is read from the layer so it always matches the projection size.
        e_t = torch.bmm(queries, key) / math.sqrt(self.k.out_features)
        e_t = torch.squeeze(e_t, -1)           # (batch, n_fathers)

        e_t = e_t + mask * -2e9
        # Normalise over the ancestor axis. The explicit dim replaces the
        # deprecated implicit-dim Softmax() call, which emitted a warning.
        a_t = torch.softmax(e_t, dim=-1)

        # (batch, 1, n_fathers) @ (batch, n_fathers, hidden) -> (batch, hidden)
        average_att = torch.bmm(a_t.unsqueeze(1), values).squeeze(1)

        com_proj = self.comment_proj(x)

        h1 = self.dropout(self.relu(torch.cat([average_att, com_proj], -1)))
        h2 = self.dropout(self.relu(self.hidden1(h1)))

        return self.output(h2)

import os

comment_classification_Model = CommentClassificationModel(len(labelEncoder.classes_))
comment_classification_Model.to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(comment_classification_Model.parameters())
# The scheduler is stepped once per batch, so the first restart period spans
# 5 epochs' worth of batches and every following period is twice as long.
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,len(train_torch_dataloader) * 5, 2)

nr_epochs = 50
current_step = 0  # total number of optimisation steps taken so far
best_model_loss = 1e9

# Checkpoint of the model with the lowest mean validation loss. The directory
# is created up front so the first torch.save cannot fail on a missing folder.
best_model_path = f'best_models/{huggingface_model_name_alias_educ}.pkl'
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(nr_epochs):
  # ---- training pass ----
  pbar_training = tqdm(train_torch_dataloader)
  training_average_loss = 0
  training_nr_batches = 0
  comment_classification_Model.train()

  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_training:
    optimizer.zero_grad()
    yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
    loss = criterion(yhat, labels_batch)
    loss.backward()
    optimizer.step()

    batch_loss = loss.item()
    pbar_training.set_postfix({'loss': batch_loss})

    training_average_loss += batch_loss
    training_nr_batches+=1
    current_step+=1
    scheduler.step()

  # ---- validation pass (no gradients, dropout disabled) ----
  pbar_validation = tqdm(val_torch_dataloader)

  validation_average_loss = 0
  validation_nr_batches = 0
  comment_classification_Model.eval()
  with torch.no_grad():
    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
      yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
      batch_loss = criterion(yhat, labels_batch).item()

      validation_average_loss += batch_loss
      validation_nr_batches+=1

      pbar_validation.set_postfix({'loss': batch_loss})

  epoch_training_loss = training_average_loss / training_nr_batches
  epoch_validation_loss = validation_average_loss / validation_nr_batches
  print(f'Epoch {epoch + 1} has training loss: {epoch_training_loss}')
  print(f'Epoch {epoch + 1} has validation loss: {epoch_validation_loss}')

  if(epoch_validation_loss < best_model_loss):
     best_model_loss = epoch_validation_loss
     # Report the epoch 1-based, like the two loss messages above.
     print(f'Best loss at epoch {epoch + 1}')
     # The whole module object is pickled (not just its state_dict) because
     # the evaluation cell restores it with a plain torch.load.
     torch.save(comment_classification_Model, best_model_path)


  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 296/296 [00:02<00:00, 99.08it/s, loss=0.15534961]  
100%|██████████| 93/93 [00:00<00:00, 161.31it/s, loss=1.1306872] 


Epoch 1 has training loss: 0.7010409776624795
Epoch 1 has validation loss: 0.5750652455514477
Best loss at epoch 0


100%|██████████| 296/296 [00:02<00:00, 102.25it/s, loss=0.33772773]
100%|██████████| 93/93 [00:00<00:00, 200.86it/s, loss=1.2401571] 


Epoch 2 has training loss: 0.6284196352384783
Epoch 2 has validation loss: 0.5602136046014806
Best loss at epoch 1


100%|██████████| 296/296 [00:02<00:00, 112.84it/s, loss=0.66164345]
100%|██████████| 93/93 [00:00<00:00, 186.37it/s, loss=1.4111724] 


Epoch 3 has training loss: 0.5617702651164822
Epoch 3 has validation loss: 0.5225999855226086
Best loss at epoch 2


100%|██████████| 296/296 [00:02<00:00, 121.96it/s, loss=0.6860862]  
100%|██████████| 93/93 [00:00<00:00, 184.89it/s, loss=1.6520042] 


Epoch 4 has training loss: 0.5474903023411596
Epoch 4 has validation loss: 0.5307240674412379


100%|██████████| 296/296 [00:02<00:00, 126.48it/s, loss=1.1308538]  
100%|██████████| 93/93 [00:00<00:00, 206.19it/s, loss=1.5818055] 


Epoch 5 has training loss: 0.5247799241097292
Epoch 5 has validation loss: 0.5256343585669353


100%|██████████| 296/296 [00:02<00:00, 120.77it/s, loss=1.0523369]   
100%|██████████| 93/93 [00:00<00:00, 197.87it/s, loss=1.5133276] 


Epoch 6 has training loss: 0.5569585062566565
Epoch 6 has validation loss: 0.5214623864940418
Best loss at epoch 5


100%|██████████| 296/296 [00:02<00:00, 123.92it/s, loss=0.16535588] 
100%|██████████| 93/93 [00:00<00:00, 207.13it/s, loss=1.6234503]  


Epoch 7 has training loss: 0.5515761799871217
Epoch 7 has validation loss: 0.5045509229424179
Best loss at epoch 6


100%|██████████| 296/296 [00:02<00:00, 122.73it/s, loss=0.044755783] 
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=2.0964937]  


Epoch 8 has training loss: 0.5470245970732445
Epoch 8 has validation loss: 0.48742846467642376
Best loss at epoch 7


100%|██████████| 296/296 [00:02<00:00, 124.84it/s, loss=0.21394774]  
100%|██████████| 93/93 [00:00<00:00, 191.75it/s, loss=1.1017473] 


Epoch 9 has training loss: 0.5250986771274244
Epoch 9 has validation loss: 0.45635699488783393
Best loss at epoch 8


100%|██████████| 296/296 [00:02<00:00, 125.92it/s, loss=0.60153365]  
100%|██████████| 93/93 [00:00<00:00, 188.65it/s, loss=2.2514489] 


Epoch 10 has training loss: 0.49567613592158966
Epoch 10 has validation loss: 0.4802569070009775


100%|██████████| 296/296 [00:02<00:00, 118.44it/s, loss=0.014180139] 
100%|██████████| 93/93 [00:00<00:00, 206.21it/s, loss=2.7589393]  


Epoch 11 has training loss: 0.4874141839800113
Epoch 11 has validation loss: 0.4884515103874027


100%|██████████| 296/296 [00:02<00:00, 123.61it/s, loss=0.024922026]  
100%|██████████| 93/93 [00:00<00:00, 205.75it/s, loss=2.8317666]  


Epoch 12 has training loss: 0.4567471088568273
Epoch 12 has validation loss: 0.5021072522967412


100%|██████████| 296/296 [00:02<00:00, 123.52it/s, loss=0.01360466]  
100%|██████████| 93/93 [00:00<00:00, 207.69it/s, loss=2.5819545]  


Epoch 13 has training loss: 0.45297003641600303
Epoch 13 has validation loss: 0.47642148065791334


100%|██████████| 296/296 [00:02<00:00, 125.88it/s, loss=0.5697381]   
100%|██████████| 93/93 [00:00<00:00, 191.01it/s, loss=2.769261]   


Epoch 14 has training loss: 0.45125635928874347
Epoch 14 has validation loss: 0.48490231844686693


100%|██████████| 296/296 [00:02<00:00, 125.82it/s, loss=0.637296]    
100%|██████████| 93/93 [00:00<00:00, 207.59it/s, loss=2.7153883]  


Epoch 15 has training loss: 0.4691786275469821
Epoch 15 has validation loss: 0.4827300383238703


100%|██████████| 296/296 [00:02<00:00, 124.27it/s, loss=0.62457764]  
100%|██████████| 93/93 [00:00<00:00, 208.05it/s, loss=3.0175538]  


Epoch 16 has training loss: 0.5061072673199453
Epoch 16 has validation loss: 0.5079726589623318


100%|██████████| 296/296 [00:02<00:00, 117.74it/s, loss=1.0693511]   
100%|██████████| 93/93 [00:00<00:00, 203.51it/s, loss=2.0052426]  


Epoch 17 has training loss: 0.5041497984331382
Epoch 17 has validation loss: 0.4399970736334561
Best loss at epoch 16


100%|██████████| 296/296 [00:02<00:00, 118.19it/s, loss=0.0014387743]
100%|██████████| 93/93 [00:00<00:00, 199.99it/s, loss=1.4954208]  


Epoch 18 has training loss: 0.495294419858633
Epoch 18 has validation loss: 0.46198363473979376


100%|██████████| 296/296 [00:02<00:00, 122.96it/s, loss=0.552311]     
100%|██████████| 93/93 [00:00<00:00, 173.44it/s, loss=1.8751409]  


Epoch 19 has training loss: 0.512207301357894
Epoch 19 has validation loss: 0.47628934797580524


100%|██████████| 296/296 [00:02<00:00, 125.33it/s, loss=2.3234684]   
100%|██████████| 93/93 [00:00<00:00, 204.41it/s, loss=2.67065]    


Epoch 20 has training loss: 0.5018496914732119
Epoch 20 has validation loss: 0.48310086942247804


100%|██████████| 296/296 [00:02<00:00, 124.32it/s, loss=0.89629465]   
100%|██████████| 93/93 [00:00<00:00, 209.94it/s, loss=4.2515826]  


Epoch 21 has training loss: 0.4564908883650439
Epoch 21 has validation loss: 0.5369744400884355


100%|██████████| 296/296 [00:02<00:00, 124.31it/s, loss=0.17892194]   
100%|██████████| 93/93 [00:00<00:00, 204.45it/s, loss=2.6119952]  


Epoch 22 has training loss: 0.4803300871511507
Epoch 22 has validation loss: 0.4736728422263617


100%|██████████| 296/296 [00:02<00:00, 125.58it/s, loss=0.72124994]   
100%|██████████| 93/93 [00:00<00:00, 189.41it/s, loss=2.4371622]  


Epoch 23 has training loss: 0.472026033800019
Epoch 23 has validation loss: 0.44310032372032443


100%|██████████| 296/296 [00:02<00:00, 125.96it/s, loss=0.003809696]  
100%|██████████| 93/93 [00:00<00:00, 207.59it/s, loss=2.2180784]  


Epoch 24 has training loss: 0.4794815924362681
Epoch 24 has validation loss: 0.44951406044144465


100%|██████████| 296/296 [00:02<00:00, 122.85it/s, loss=0.22623411]  
100%|██████████| 93/93 [00:00<00:00, 201.73it/s, loss=2.0489354]  


Epoch 25 has training loss: 0.4388977346924229
Epoch 25 has validation loss: 0.4547668231290675


100%|██████████| 296/296 [00:02<00:00, 122.93it/s, loss=0.00012373159]
100%|██████████| 93/93 [00:00<00:00, 203.79it/s, loss=3.4050086]  


Epoch 26 has training loss: 0.420444409691241
Epoch 26 has validation loss: 0.49001946674800045


100%|██████████| 296/296 [00:02<00:00, 123.91it/s, loss=0.40696886]   
100%|██████████| 93/93 [00:00<00:00, 204.40it/s, loss=3.1810048]   


Epoch 27 has training loss: 0.4070177283964613
Epoch 27 has validation loss: 0.4552805860857329


100%|██████████| 296/296 [00:02<00:00, 125.33it/s, loss=0.16250803]   
100%|██████████| 93/93 [00:00<00:00, 184.16it/s, loss=4.2883205]   


Epoch 28 has training loss: 0.4073090403737321
Epoch 28 has validation loss: 0.5182264065000689


100%|██████████| 296/296 [00:02<00:00, 120.49it/s, loss=1.3057923]    
100%|██████████| 93/93 [00:00<00:00, 208.52it/s, loss=3.8203688]   


Epoch 29 has training loss: 0.4136672447689178
Epoch 29 has validation loss: 0.49668928842255544


100%|██████████| 296/296 [00:02<00:00, 121.71it/s, loss=0.0056231176] 
100%|██████████| 93/93 [00:00<00:00, 190.97it/s, loss=4.115969]    


Epoch 30 has training loss: 0.3887847015363097
Epoch 30 has validation loss: 0.4965651785355232


100%|██████████| 296/296 [00:02<00:00, 119.21it/s, loss=0.29741216]    
100%|██████████| 93/93 [00:00<00:00, 197.45it/s, loss=3.350976]    


Epoch 31 has training loss: 0.3938839109467985
Epoch 31 has validation loss: 0.4635801535708109


100%|██████████| 296/296 [00:02<00:00, 120.57it/s, loss=2.145765e-06] 
100%|██████████| 93/93 [00:00<00:00, 193.75it/s, loss=3.582765]    


Epoch 32 has training loss: 0.3703948392061543
Epoch 32 has validation loss: 0.47405791499524286


100%|██████████| 296/296 [00:02<00:00, 111.95it/s, loss=0.28139243]   
100%|██████████| 93/93 [00:00<00:00, 191.35it/s, loss=3.68287]     


Epoch 33 has training loss: 0.37366221408971084
Epoch 33 has validation loss: 0.4799335696991102


100%|██████████| 296/296 [00:02<00:00, 118.21it/s, loss=0.5800561]    
100%|██████████| 93/93 [00:00<00:00, 183.07it/s, loss=3.7350342]   


Epoch 34 has training loss: 0.3630548143968818
Epoch 34 has validation loss: 0.4818136901225424


100%|██████████| 296/296 [00:02<00:00, 122.26it/s, loss=0.18625669]   
100%|██████████| 93/93 [00:00<00:00, 195.38it/s, loss=3.7542753]   


Epoch 35 has training loss: 0.35488266040336175
Epoch 35 has validation loss: 0.4824349173865411


100%|██████████| 296/296 [00:02<00:00, 121.61it/s, loss=0.48760173]   
100%|██████████| 93/93 [00:00<00:00, 200.00it/s, loss=1.1406423]  


Epoch 36 has training loss: 0.44068436928522087
Epoch 36 has validation loss: 0.44283420678668767


100%|██████████| 296/296 [00:02<00:00, 119.45it/s, loss=0.0005324853] 
100%|██████████| 93/93 [00:00<00:00, 204.40it/s, loss=5.167835]    


Epoch 37 has training loss: 0.4444212982856378
Epoch 37 has validation loss: 0.7223185912195232


100%|██████████| 296/296 [00:02<00:00, 120.62it/s, loss=0.00029476112]
100%|██████████| 93/93 [00:00<00:00, 189.40it/s, loss=1.6522337]   


Epoch 38 has training loss: 0.48089809421272806
Epoch 38 has validation loss: 0.439934442439417
Best loss at epoch 37


100%|██████████| 296/296 [00:02<00:00, 122.57it/s, loss=0.532972]     
100%|██████████| 93/93 [00:00<00:00, 180.93it/s, loss=1.5573497]   


Epoch 39 has training loss: 0.45939516411252485
Epoch 39 has validation loss: 0.423666414233946
Best loss at epoch 38


100%|██████████| 296/296 [00:02<00:00, 122.01it/s, loss=0.008601048]  
100%|██████████| 93/93 [00:00<00:00, 194.97it/s, loss=1.6014044]   


Epoch 40 has training loss: 0.4485250306842298
Epoch 40 has validation loss: 0.4407648784834491


100%|██████████| 296/296 [00:02<00:00, 117.79it/s, loss=0.466689]     
100%|██████████| 93/93 [00:00<00:00, 190.96it/s, loss=1.4352965]  


Epoch 41 has training loss: 0.4524857785673592
Epoch 41 has validation loss: 0.44893660880024394


100%|██████████| 296/296 [00:02<00:00, 116.67it/s, loss=0.12974153]   
100%|██████████| 93/93 [00:00<00:00, 190.97it/s, loss=2.9723027]   


Epoch 42 has training loss: 0.44306777802902964
Epoch 42 has validation loss: 0.4705720708013252


100%|██████████| 296/296 [00:02<00:00, 119.98it/s, loss=0.016169835]  
100%|██████████| 93/93 [00:00<00:00, 192.95it/s, loss=3.3271623]   


Epoch 43 has training loss: 0.40528214310375377
Epoch 43 has validation loss: 0.4752431970567853


100%|██████████| 296/296 [00:02<00:00, 117.97it/s, loss=8.106199e-06] 
100%|██████████| 93/93 [00:00<00:00, 200.44it/s, loss=4.271014]    


Epoch 44 has training loss: 0.4234511486008426
Epoch 44 has validation loss: 0.567935133948096


100%|██████████| 296/296 [00:02<00:00, 122.47it/s, loss=0.022786565]  
100%|██████████| 93/93 [00:00<00:00, 183.80it/s, loss=3.894105]   


Epoch 45 has training loss: 0.42204212349761916
Epoch 45 has validation loss: 0.5262278846204681


100%|██████████| 296/296 [00:02<00:00, 121.14it/s, loss=0.024232278]  
100%|██████████| 93/93 [00:00<00:00, 192.95it/s, loss=3.091237]   


Epoch 46 has training loss: 0.4462241844686423
Epoch 46 has validation loss: 0.4614053626735044


100%|██████████| 296/296 [00:02<00:00, 119.21it/s, loss=0.22421157]   
100%|██████████| 93/93 [00:00<00:00, 196.62it/s, loss=5.9079676]   


Epoch 47 has training loss: 0.41369915089934395
Epoch 47 has validation loss: 0.5874620031672455


100%|██████████| 296/296 [00:02<00:00, 115.90it/s, loss=0.0031437282]  
100%|██████████| 93/93 [00:00<00:00, 187.88it/s, loss=4.549488]    


Epoch 48 has training loss: 0.40319608993783734
Epoch 48 has validation loss: 0.5564330116049537


100%|██████████| 296/296 [00:02<00:00, 100.82it/s, loss=0.5521482]    
100%|██████████| 93/93 [00:00<00:00, 193.34it/s, loss=4.3664455]   


Epoch 49 has training loss: 0.4123327397838823
Epoch 49 has validation loss: 0.534199126899713


100%|██████████| 296/296 [00:02<00:00, 107.91it/s, loss=0.66348684]  
100%|██████████| 93/93 [00:00<00:00, 203.50it/s, loss=4.937436]   

Epoch 50 has training loss: 0.38758453969303075
Epoch 50 has validation loss: 0.5565479415542727





In [25]:
# Restore the best checkpoint written by the training loop.
# NOTE(review): torch.load unpickles a complete module object here, which can
# execute arbitrary code -- only load checkpoints produced by this notebook.
# Recent PyTorch releases may additionally require weights_only=False for
# whole-module pickles; confirm against the installed version.
comment_classification_Model= torch.load(f'best_models/{huggingface_model_name_alias_educ}.pkl')
comment_classification_Model.to(DEVICE)
comment_classification_Model.eval()

predictions = []
pbar_validation = tqdm(val_torch_dataloader)
# Inference only: without no_grad every batch output would keep its autograd
# graph alive until the list is converted below.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# One row of logits per validation sample, in loader order.
predictions = torch.cat(predictions).cpu().numpy()

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 93/93 [00:00<00:00, 270.35it/s]


In [26]:
from sklearn.metrics import classification_report

# Logits -> class probabilities -> most probable class id -> label string.
validation_logits = torch.from_numpy(predictions)
predictions_softmax = torch.softmax(validation_logits, dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

# Per-class precision/recall/F1 on the validation split.
print(classification_report(val_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.76      0.69      0.72        74
    Relevant       0.81      0.86      0.83       112

    accuracy                           0.79       186
   macro avg       0.78      0.77      0.78       186
weighted avg       0.79      0.79      0.79       186



In [27]:
# Make sure dropout is disabled even when this cell is run on its own.
comment_classification_Model.eval()

predictions = []
pbar_test = tqdm(test_torch_dataloader)
# Inference only: without no_grad every batch output would keep its autograd
# graph alive until the list is converted below.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_test:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# One row of logits per test sample, in loader order.
predictions = torch.cat(predictions).cpu().numpy()

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 80/80 [00:05<00:00, 15.19it/s]


In [28]:
from sklearn.metrics import classification_report

# Logits -> class probabilities -> most probable class id -> label string.
test_logits = torch.from_numpy(predictions)
predictions_softmax = torch.softmax(test_logits, dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

# Per-class precision/recall/F1 on the held-out test split.
print(classification_report(test_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.85      0.69      0.76        74
    Relevant       0.77      0.90      0.83        86

    accuracy                           0.80       160
   macro avg       0.81      0.79      0.79       160
weighted avg       0.81      0.80      0.80       160



In [29]:
# Make sure dropout is disabled even when this cell is run on its own.
comment_classification_Model.eval()

predictions = []
pbar_div = tqdm(div_dataloader_torch)
# Inference only: without no_grad every batch output would keep its autograd
# graph alive until the list is converted below.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_div:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# One row of logits per div-corpus sample, in loader order.
predictions = torch.cat(predictions).cpu().numpy()

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 252/252 [00:02<00:00, 89.60it/s] 


In [30]:
from sklearn.metrics import classification_report

# Logits -> class probabilities -> most probable class id -> label string.
div_logits = torch.from_numpy(predictions)
predictions_softmax = torch.softmax(div_logits, dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

# Per-class precision/recall/F1 on the div corpus (model trained on the educational corpus).
print(classification_report(dataset_div["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.97      0.58      0.72       393
    Relevant       0.38      0.93      0.54       111

    accuracy                           0.65       504
   macro avg       0.67      0.75      0.63       504
weighted avg       0.84      0.65      0.68       504

