In [1]:
import os
import re
from collections import defaultdict

import bs4
import matplotlib.pyplot as pl
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report

# Directory that the dataset JSON files are read from.
WORKING_DIR='.'
# Torch device name; it is assigned again in the cell that seeds the random generators.
DEVICE = 'cuda'

In [2]:
# Load the two comment corpora. orient='index' turns every top-level JSON key into a row label.
# Later cells read the 'text', 'problem', 'label', 'id' and 'father_id' columns of both frames.
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient = 'index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient = 'index')

In [3]:
# Matches an anchor tag whose href contains /contest/.../submission/..., up to a closing </a>.
# Raw string: the pattern is unchanged, but '\/' is no longer an invalid string escape.
# NOTE(review): every '.*' is greedy, so several submission links on the same line are
# swallowed by one match together with the text between them — confirm this is intended.
regex_link_ful = re.compile(r'<a href.*\/contest/.*/submission/.*<\/a>')
# Matches one <code>...</code> element, newlines included. DOTALL makes '.' match any
# character, which selects the same text as the former '(\s|.)*?' without the ambiguous
# alternation (a whitespace character could be matched by either branch).
code_regex = re.compile(r'<code>.*?<\/code>', re.DOTALL)
def preprocess_for_transfomers(texts, problems):
  """Turn raw HTML comments into plain text for the embedding model.

  For every (text, problem) pair:
    1. each <code>...</code> element is replaced by ' (code) ';
    2. each submission link is replaced by ' (link to problem <problem>) ';
    3. the remaining markup is stripped with BeautifulSoup.

  Args:
    texts: iterable of HTML strings.
    problems: iterable of problem identifiers, aligned with ``texts``.

  Returns:
    A list of plain-text strings, one per input pair (``zip`` stops at the
    shorter of the two inputs).
  """
  preprocessed_texts = []
  for t, p in zip(texts, problems):
    t_codes = code_regex.sub(' (code) ', t)
    link_placeholder = f' (link to problem {p}) '
    # A callable replacement is inserted verbatim, so a backslash or '\g<...>' inside the
    # problem identifier cannot be misread as a regex template.
    t_link = regex_link_ful.sub(lambda _match: link_placeholder, t_codes)
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser is
    # installed (and warns), so the extracted text could differ between machines.
    bs = bs4.BeautifulSoup(t_link, 'html.parser')
    preprocessed_texts.append(bs.get_text())

  return preprocessed_texts

# Apply the same two steps to both corpora, educational set first:
#   * collapse every label other than 'Irrelevant' into 'Relevant';
#   * store the markup-free text in a new "preprocessed_text" column.
for corpus in (dataset_educ, dataset_div):
  corpus.loc[corpus['label'].ne('Irrelevant'), 'label'] = 'Relevant'
  preprocessed_text = preprocess_for_transfomers(corpus['text'], corpus['problem'])
  corpus["preprocessed_text"] = preprocessed_text

In [4]:
# Default number of ancestors collected for every comment.
TREE_FATHER_PATH_LENGTH = 3

def compute_father_indices_pd(df, path_length=None):
  """Map every row label of ``df`` to the row labels of its nearest ancestors.

  ``df`` must provide an ``id`` and a ``father_id`` column. For each row the
  chain father -> grandfather -> ... is followed for at most ``path_length``
  steps. The walk stops as soon as the father id is not the id of any row in
  ``df`` (or an id equal to the ``-1`` sentinel is met).

  Args:
    df: DataFrame with ``id`` and ``father_id`` columns.
    path_length: number of ancestors to collect; ``None`` (default) uses the
      current value of ``TREE_FATHER_PATH_LENGTH``.

  Returns:
    dict mapping each row label to a list of exactly ``path_length`` entries,
    ordered from the farthest ancestor to the direct father. Missing ancestors
    are ``-1`` and therefore sit at the front of the list.
  """
  if path_length is None:
    path_length = TREE_FATHER_PATH_LENGTH

  ids = list(df['id'])
  row_labels = list(df.index)

  # comment id -> father id (if an id is duplicated, the last row wins).
  father_tree = dict(zip(ids, df['father_id']))

  # comment id -> label of the first row carrying that id. This dictionary replaces a
  # full-frame boolean scan per ancestor, which made the function quadratic in len(df).
  label_of_id = {}
  for row_label, comment_id in zip(row_labels, ids):
    label_of_id.setdefault(comment_id, row_label)

  comment_father_indices = {}
  for row_label, comment_id in zip(row_labels, ids):
    chain = []
    current = comment_id
    for _ in range(path_length):
      father = father_tree[current]
      # -1 is the "no ancestor" sentinel; an unknown father id also ends the walk.
      if current == -1 or father == -1 or father not in father_tree:
        break
      current = father
      chain.append(label_of_id[current])
    # Pad to a fixed length, then flip so the direct father comes last.
    chain.extend([-1] * (path_length - len(chain)))
    chain.reverse()
    comment_father_indices[row_label] = chain
  return comment_father_indices

# Ancestor look-up tables for both corpora (educational set first, then div).
father_indices_educ, father_indices_div = map(
  compute_father_indices_pd, (dataset_educ, dataset_div)
)

In [5]:
from embeddings_generation import TokenizedDataset, LayerEMBTokenEmbeddingGeneration
from embeddings_generation.utils import *

# Model identifier handed to the embedding helpers imported from embeddings_generation.utils.
huggingface_model_name = "roberta-128-base"
# Storage alias for the educational corpus: last "/"-separated part of the model name + "_educ".
huggingface_model_name_alias = huggingface_model_name.split("/")[-1] + "_educ"

# Embeddings are generated only when the helper reports none stored under this alias.
# NOTE(review): the check is keyed by the alias alone, so a change in the preprocessing
# presumably does not refresh already persisted embeddings — confirm against the helper.
if(not embedding_already_persisted(huggingface_model_name_alias)):
    persist_embeddings(dataset_educ["preprocessed_text"], huggingface_model_name, huggingface_model_name_alias, dataset_educ.index)
    
# Later cells index the loaded object by DataFrame row label.
embeddings_educ = load_embeddings(huggingface_model_name_alias)

# Same procedure for the div corpus, stored under its own "_div" alias.
huggingface_model_name_alias_div = huggingface_model_name.split("/")[-1] +'_div'
if(not embedding_already_persisted(huggingface_model_name_alias_div)):
    persist_embeddings(dataset_div["preprocessed_text"], huggingface_model_name, huggingface_model_name_alias_div, dataset_div.index)
embeddings_div = load_embeddings(huggingface_model_name_alias_div)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at roberta-128-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-128-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for p

In [6]:
# Tokenisation statistics of the educational corpus, one row per comment.
statistics = determine_tokens_statistics(dataset_educ["preprocessed_text"], huggingface_model_name)

# Words split at least twice, relative to the number of whole words.
# NOTE(review): in the recorded output nrTokens equals nrWholeWords and every split count
# is 0 — confirm that the helper recognises sub-word pieces for this tokenizer.
statistics["ratioNotWholeWords"] = statistics["nrWordsSplitAtleastTwice"].div(statistics["nrWholeWords"])

statistics.describe()

Unnamed: 0,nrTokens,nrWholeWords,maxWordSplit,nrWordsSplitAtleastTwice,ratioNotWholeWords
count,937.0,937.0,937.0,937.0,937.0
mean,69.649947,69.649947,0.0,0.0,0.0
std,116.79456,116.79456,0.0,0.0,0.0
min,3.0,3.0,0.0,0.0,0.0
25%,17.0,17.0,0.0,0.0,0.0
50%,32.0,32.0,0.0,0.0,0.0
75%,71.0,71.0,0.0,0.0,0.0
max,1216.0,1216.0,0.0,0.0,0.0


In [7]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
RANDOM_SEED = 443
# Use the GPU when one is available and fall back to the CPU otherwise, so the notebook
# also runs on machines without CUDA (a hard-coded "cuda" fails at the first .to(DEVICE)).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every random generator used below: torch (model initialisation, dropout, DataLoader
# shuffling), numpy, and the stdlib generator that shuffles the groups.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Rows are grouped by the part of their label that precedes the first "?". Whole groups are
# assigned to a split, so rows sharing that prefix never straddle train/validation/test.
groups = list(dataset_educ.groupby(lambda k : k.split("?")[0]))
random.shuffle(groups)

# After shuffling: first 10 groups train, next 3 validate, the remainder is the test set.
train_groups, validation_groups, test_groups = groups[:10], groups[10:13], groups[13:]

def _rows_of(selected_groups):
    """Return the rows of ``dataset_educ`` belonging to the given (key, frame) pairs, in order."""
    row_labels = []
    for _, group_frame in selected_groups:
        row_labels.extend(group_frame.index.tolist())
    return dataset_educ.loc[row_labels]

train_educ_dataset = _rows_of(train_groups)
val_educ_dataset = _rows_of(validation_groups)
test_educ_dataset = _rows_of(test_groups)

from sklearn.preprocessing import LabelEncoder
# Fit the encoder exactly once, on the two label values produced by the relabelling step,
# and only transform afterwards. Re-fitting on every split would let the label -> integer
# mapping depend on which classes happen to occur in that split, and would leave the
# encoder (used later for the output size and for decoding predictions) fitted on
# whichever split came last.
labelEncoder = LabelEncoder()
labelEncoder.fit(["Irrelevant", "Relevant"])

# transform() raises ValueError on a label outside the fitted classes instead of
# silently assigning it a code.
train_encoded_labels = labelEncoder.transform(train_educ_dataset["label"])
val_encoded_labels = labelEncoder.transform(val_educ_dataset["label"])
test_encoded_labels = labelEncoder.transform(test_educ_dataset["label"])

div_encoded_labels = labelEncoder.transform(dataset_div["label"])

In [8]:
class BlogCommentDataset(Dataset):
    """Dataset that pairs every comment embedding with the embeddings of its ancestors.

    Each item is the tuple ``(embedding, ancestor_embeddings, mask, label)``, all moved
    to ``DEVICE``. ``mask`` holds 1.0 where an ancestor slot is padding (``-1`` in
    ``father_indices``) and 0.0 where a real ancestor embedding is present.
    """

    def __init__(self, dataset:pd.DataFrame, embeddings:dict, father_indices:dict, num_last_layers_embeddings_agg ,labels):
        # Mapping from row label to the row labels of the ancestors (-1 marks a missing one).
        self.father_indices = father_indices
        # Mapping from row label to the stored embedding of that row.
        self.embeddings = embeddings
        # How many trailing entries of the second embedding axis are averaged.
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg
        # Integer labels, positionally aligned with the rows of ``dataset``.
        self.labels = labels
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        row_label = self.dataset.index[index]
        own_embedding = self._agg_emb(self.embeddings[row_label])

        ancestor_vectors = []
        padding_flags = []
        for ancestor_label in self.father_indices[row_label]:
            is_padding = ancestor_label == -1
            if is_padding:
                # Missing ancestor: zero vector of the same width as the comment embedding.
                vector = torch.zeros(own_embedding.shape[0])
            else:
                vector = self._agg_emb(self.embeddings[ancestor_label])
            ancestor_vectors.append(vector)
            padding_flags.append(1 if is_padding else 0)

        stacked_ancestors = torch.stack(ancestor_vectors)
        mask = torch.tensor(padding_flags, dtype = torch.float32)
        label = torch.tensor(self.labels[index], dtype = torch.long)
        return own_embedding.to(DEVICE), stacked_ancestors.to(DEVICE), mask.to(DEVICE), label.to(DEVICE)

    def _agg_emb(self, embedding):
        """Average the last ``num_last_layers_embeddings_agg`` slices of axis 1 of entry 0.

        The stored embedding is assumed to be laid out as (1, layers, hidden) — TODO confirm.
        """
        as_float32 = np.array(embedding, dtype=np.float32)
        trailing_layers = as_float32[0, -self.num_last_layers_embeddings_agg:, :]
        pooled = trailing_layers.mean(0)
        return torch.from_numpy(pooled)

# Every dataset averages the last 3 embedding layers and every loader yields mini-batches
# of 2 rows. Only the training loader shuffles.
train_torch_dataset = BlogCommentDataset(
    dataset=train_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=train_encoded_labels,
)
train_torch_dataloader = DataLoader(train_torch_dataset, batch_size=2, shuffle=True)

val_torch_dataset = BlogCommentDataset(
    dataset=val_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=val_encoded_labels,
)
val_torch_dataloader = DataLoader(val_torch_dataset, batch_size=2, shuffle=False)

test_torch_dataset = BlogCommentDataset(
    dataset=test_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=test_encoded_labels,
)
test_torch_dataloader = DataLoader(test_torch_dataset, batch_size=2, shuffle=False)

# The div corpus is used as a whole, with its own embeddings and ancestor table.
div_dataset_torch = BlogCommentDataset(
    dataset=dataset_div,
    embeddings=embeddings_div,
    father_indices=father_indices_div,
    num_last_layers_embeddings_agg=3,
    labels=div_encoded_labels,
)
div_dataloader_torch = DataLoader(div_dataset_torch, batch_size=2, shuffle=False)

In [9]:
from tqdm import tqdm
import math
class CommentClassificationModel(torch.nn.Module):
    """Classify a comment from its embedding and an attention summary of its ancestors.

    The comment embedding is projected by ``self.k`` and the ancestor embeddings by
    ``self.q`` / ``self.v``. Scaled dot products between the two give one score per
    ancestor, padded ancestors are masked out, and the softmax-weighted sum of the
    value projections is concatenated with a separate projection of the comment before
    a small feed-forward head.

    Args:
        nrLabels: number of output classes.
        embedding_dim: width of the input embeddings (default 768).
        hidden_dim: width of the projections and of the hidden layer (default 128).
        dropout: dropout probability used in the head (default 0.6).

    Attribute names and the order in which the layers are created are kept as they
    were: the names are the parameter names stored in checkpoints, and the creation
    order determines the initial weights drawn under a fixed seed.
    """

    def __init__(self, nrLabels, embedding_dim=768, hidden_dim=128, dropout=0.6):
        super(CommentClassificationModel, self).__init__()
        self.k = torch.nn.Linear(embedding_dim, hidden_dim)
        self.q = torch.nn.Linear(embedding_dim, hidden_dim)
        self.v = torch.nn.Linear(embedding_dim, hidden_dim)

        self.comment_proj = torch.nn.Linear(embedding_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.output = torch.nn.Linear(hidden_dim, nrLabels)
        self.hidden1 = torch.nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, fathers_x, mask):
        """Return unnormalised class scores.

        Args:
            x: comment embeddings, 2-D (batch, embedding_dim).
            fathers_x: ancestor embeddings, 3-D (batch, n_ancestors, embedding_dim).
            mask: (batch, n_ancestors); 1 marks a padded ancestor slot, 0 a real one.

        Returns:
            Tensor of shape (batch, nrLabels) with the logits.
        """
        key = self.k(x)
        queries = self.q(fathers_x)
        values = self.v(fathers_x)

        key = torch.unsqueeze(key, -1)

        # Scaled dot product between every ancestor projection and the comment projection.
        # The scale is read from the layer itself so that it follows hidden_dim and also
        # works for model objects created before hidden_dim became configurable.
        e_t = torch.bmm(queries, key) / math.sqrt(self.k.out_features)
        e_t = torch.squeeze(e_t, -1)

        # A large negative score drives the softmax weight of padded slots to zero.
        e_t = e_t + mask * -2e9
        # Normalise over the ancestor axis. Naming the axis gives the same result as the
        # former implicit choice for 2-D input and removes the deprecation warning.
        a_t = torch.softmax(e_t, dim=-1)

        a_t = torch.unsqueeze(a_t, -1)
        average_att = torch.bmm(a_t.permute(0, 2, 1), values)
        average_att = average_att.squeeze(1)

        com_proj = self.comment_proj(x)

        h1 = self.dropout(self.relu(torch.cat([average_att, com_proj], -1)))
        h2 = self.dropout(self.relu(self.hidden1(h1)))

        return self.output(h2)

# One output unit per class known to the label encoder.
comment_classification_Model = CommentClassificationModel(len(labelEncoder.classes_))
comment_classification_Model.to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(comment_classification_Model.parameters())
# The scheduler is stepped once per batch below, so the first cosine cycle spans five
# epochs' worth of batches and every restart doubles the cycle length (T_mult = 2).
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,len(train_torch_dataloader) * 5, 2)

nr_epochs = 50
current_step = 0
best_model_loss = 1e9
best_model_path = f'best_models/{huggingface_model_name_alias}.pkl'
# torch.save does not create missing directories; make sure the target folder exists.
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for epoch in range(nr_epochs):
  # ---- training pass ----
  pbar_training = tqdm(train_torch_dataloader)
  training_average_loss = 0.0
  training_nr_batches = 0
  comment_classification_Model.train()

  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_training:
    optimizer.zero_grad()
    yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
    loss = criterion(yhat, labels_batch)
    loss.backward()
    optimizer.step()

    # .item() yields a plain Python float; no detour through a numpy array is needed.
    batch_loss = loss.item()
    pbar_training.set_postfix({'loss': batch_loss})

    training_average_loss += batch_loss
    training_nr_batches+=1
    current_step+=1
    scheduler.step()

  # ---- validation pass (no gradients, dropout disabled) ----
  pbar_validation = tqdm(val_torch_dataloader)

  validation_average_loss = 0.0
  validation_nr_batches = 0
  comment_classification_Model.eval()
  with torch.no_grad():
    for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
      yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
      batch_loss = criterion(yhat, labels_batch).item()

      validation_average_loss += batch_loss
      validation_nr_batches+=1

      pbar_validation.set_postfix({'loss': batch_loss})

  training_epoch_loss = training_average_loss / training_nr_batches
  validation_epoch_loss = validation_average_loss / validation_nr_batches
  print(f'Epoch {epoch + 1} has training loss: {training_epoch_loss}')
  print(f'Epoch {epoch + 1} has validation loss: {validation_epoch_loss}')

  # Keep the checkpoint with the lowest mean validation loss seen so far.
  if(validation_epoch_loss < best_model_loss):
     best_model_loss = validation_epoch_loss
     # Report the 1-based epoch number, consistent with the two messages above.
     print(f'Best loss at epoch {epoch + 1}')
     # The whole module object is pickled because the evaluation cell reloads it with torch.load.
     torch.save(comment_classification_Model, best_model_path)


  a_t = torch.nn.Softmax()(e_t)
  0%|          | 0/296 [00:00<?, ?it/s, loss=0.6294312] 

100%|██████████| 296/296 [00:02<00:00, 119.26it/s, loss=0.43024558]
100%|██████████| 93/93 [00:00<00:00, 195.79it/s, loss=0.5988634] 


Epoch 1 has training loss: 0.6967282313350085
Epoch 1 has validation loss: 0.6388682601272418
Best loss at epoch 0


100%|██████████| 296/296 [00:02<00:00, 115.50it/s, loss=0.5576991] 
100%|██████████| 93/93 [00:00<00:00, 194.56it/s, loss=0.75110555]


Epoch 2 has training loss: 0.6555286470699955
Epoch 2 has validation loss: 0.6193550158572453
Best loss at epoch 1


100%|██████████| 296/296 [00:02<00:00, 121.78it/s, loss=0.8905525] 
100%|██████████| 93/93 [00:00<00:00, 189.40it/s, loss=0.47291055]


Epoch 3 has training loss: 0.6038451375490105
Epoch 3 has validation loss: 0.5064702432963156
Best loss at epoch 2


100%|██████████| 296/296 [00:02<00:00, 119.28it/s, loss=0.9328258]  
100%|██████████| 93/93 [00:00<00:00, 194.97it/s, loss=0.51450664]


Epoch 4 has training loss: 0.5242821458299216
Epoch 4 has validation loss: 0.46118304413813416
Best loss at epoch 3


100%|██████████| 296/296 [00:02<00:00, 120.16it/s, loss=0.6580543]  
100%|██████████| 93/93 [00:00<00:00, 188.64it/s, loss=0.5712142] 


Epoch 5 has training loss: 0.5032970505967937
Epoch 5 has validation loss: 0.4611269742731125
Best loss at epoch 4


100%|██████████| 296/296 [00:02<00:00, 121.80it/s, loss=0.99258757] 
100%|██████████| 93/93 [00:00<00:00, 198.35it/s, loss=0.526408]  


Epoch 6 has training loss: 0.4925573758680273
Epoch 6 has validation loss: 0.43581380262490244
Best loss at epoch 5


100%|██████████| 296/296 [00:02<00:00, 120.13it/s, loss=0.05644633] 
100%|██████████| 93/93 [00:00<00:00, 197.04it/s, loss=0.31574118] 


Epoch 7 has training loss: 0.4769298205957622
Epoch 7 has validation loss: 0.42428047856896794
Best loss at epoch 6


100%|██████████| 296/296 [00:02<00:00, 120.73it/s, loss=0.19680944] 
100%|██████████| 93/93 [00:00<00:00, 202.16it/s, loss=0.6232935]  


Epoch 8 has training loss: 0.4373471903413333
Epoch 8 has validation loss: 0.41668051122737826
Best loss at epoch 7


100%|██████████| 296/296 [00:02<00:00, 121.22it/s, loss=0.16772762] 
100%|██████████| 93/93 [00:00<00:00, 186.54it/s, loss=0.3215848]  


Epoch 9 has training loss: 0.4583663620712311
Epoch 9 has validation loss: 0.38664295015636313
Best loss at epoch 8


100%|██████████| 296/296 [00:02<00:00, 120.60it/s, loss=0.13164648]  
100%|██████████| 93/93 [00:00<00:00, 174.81it/s, loss=0.521152]   


Epoch 10 has training loss: 0.40484228673331263
Epoch 10 has validation loss: 0.40592034101005525


100%|██████████| 296/296 [00:02<00:00, 118.55it/s, loss=0.06359656] 
100%|██████████| 93/93 [00:00<00:00, 178.16it/s, loss=0.2980622]  


Epoch 11 has training loss: 0.398438085518727
Epoch 11 has validation loss: 0.38365838021760984
Best loss at epoch 10


100%|██████████| 296/296 [00:02<00:00, 120.13it/s, loss=0.010561171]
100%|██████████| 93/93 [00:00<00:00, 196.19it/s, loss=0.47966516] 


Epoch 12 has training loss: 0.36855509923456387
Epoch 12 has validation loss: 0.4122782576108171


100%|██████████| 296/296 [00:02<00:00, 119.16it/s, loss=0.21209286] 
100%|██████████| 93/93 [00:00<00:00, 195.94it/s, loss=0.27468356] 


Epoch 13 has training loss: 0.36134921931630204
Epoch 13 has validation loss: 0.3763802816330265
Best loss at epoch 12


100%|██████████| 296/296 [00:02<00:00, 125.59it/s, loss=1.1859283]  
100%|██████████| 93/93 [00:00<00:00, 201.63it/s, loss=0.29737085] 


Epoch 14 has training loss: 0.3405523642369017
Epoch 14 has validation loss: 0.3789379596109352


100%|██████████| 296/296 [00:02<00:00, 125.93it/s, loss=1.034175]    
100%|██████████| 93/93 [00:00<00:00, 197.03it/s, loss=0.30709094] 


Epoch 15 has training loss: 0.3590758592771316
Epoch 15 has validation loss: 0.3804145807460431


100%|██████████| 296/296 [00:02<00:00, 125.86it/s, loss=1.065258]    
100%|██████████| 93/93 [00:00<00:00, 197.03it/s, loss=0.12187627] 


Epoch 16 has training loss: 0.4018668642761563
Epoch 16 has validation loss: 0.44226396824383446


100%|██████████| 296/296 [00:02<00:00, 126.07it/s, loss=0.45264563]  
100%|██████████| 93/93 [00:00<00:00, 204.85it/s, loss=0.22715124] 


Epoch 17 has training loss: 0.3920605522602076
Epoch 17 has validation loss: 0.38593861249385664


100%|██████████| 296/296 [00:02<00:00, 111.19it/s, loss=0.037898872] 
100%|██████████| 93/93 [00:00<00:00, 199.56it/s, loss=0.44049323] 


Epoch 18 has training loss: 0.4032572735088399
Epoch 18 has validation loss: 0.4077574075750446


100%|██████████| 296/296 [00:02<00:00, 119.26it/s, loss=2.2780461]   
100%|██████████| 93/93 [00:00<00:00, 207.59it/s, loss=0.57628465] 


Epoch 19 has training loss: 0.38123729478644924
Epoch 19 has validation loss: 0.4344770784379654


100%|██████████| 296/296 [00:02<00:00, 119.92it/s, loss=3.0703049]    
100%|██████████| 93/93 [00:00<00:00, 203.51it/s, loss=0.35504287] 


Epoch 20 has training loss: 0.39466778781265804
Epoch 20 has validation loss: 0.372889557893398
Best loss at epoch 19


100%|██████████| 296/296 [00:02<00:00, 125.49it/s, loss=0.12990119]  
100%|██████████| 93/93 [00:00<00:00, 206.67it/s, loss=0.63739705]  


Epoch 21 has training loss: 0.3927336761415653
Epoch 21 has validation loss: 0.465885795998357


100%|██████████| 296/296 [00:02<00:00, 125.89it/s, loss=0.07990321]  
100%|██████████| 93/93 [00:00<00:00, 197.45it/s, loss=0.22026342]  


Epoch 22 has training loss: 0.39323523009984085
Epoch 22 has validation loss: 0.3731752023231038


100%|██████████| 296/296 [00:02<00:00, 118.18it/s, loss=0.83604723]  
100%|██████████| 93/93 [00:00<00:00, 200.87it/s, loss=0.26679778] 


Epoch 23 has training loss: 0.3876357579981748
Epoch 23 has validation loss: 0.3742213706956595


100%|██████████| 296/296 [00:02<00:00, 121.13it/s, loss=0.41137043]  
100%|██████████| 93/93 [00:00<00:00, 203.56it/s, loss=0.26190016] 


Epoch 24 has training loss: 0.3521475365945465
Epoch 24 has validation loss: 0.367760770468502
Best loss at epoch 23


100%|██████████| 296/296 [00:02<00:00, 125.65it/s, loss=0.29128328]   
100%|██████████| 93/93 [00:00<00:00, 193.93it/s, loss=0.83143413] 


Epoch 25 has training loss: 0.328447080849499
Epoch 25 has validation loss: 0.44405636971475937


100%|██████████| 296/296 [00:02<00:00, 119.89it/s, loss=0.00027879167]
100%|██████████| 93/93 [00:00<00:00, 203.48it/s, loss=0.23989168] 


Epoch 26 has training loss: 0.3305062342631425
Epoch 26 has validation loss: 0.40372616099694403


100%|██████████| 296/296 [00:02<00:00, 122.07it/s, loss=0.078734845] 
100%|██████████| 93/93 [00:00<00:00, 200.43it/s, loss=0.23677564] 


Epoch 27 has training loss: 0.3247468171392865
Epoch 27 has validation loss: 0.37721662338502626


100%|██████████| 296/296 [00:02<00:00, 118.68it/s, loss=0.010810265] 
100%|██████████| 93/93 [00:00<00:00, 187.51it/s, loss=0.19154608]  


Epoch 28 has training loss: 0.3154851847133663
Epoch 28 has validation loss: 0.3778439166740344


100%|██████████| 296/296 [00:02<00:00, 119.89it/s, loss=0.9804181]   
100%|██████████| 93/93 [00:00<00:00, 199.99it/s, loss=0.3308792]   


Epoch 29 has training loss: 0.3022465091041365
Epoch 29 has validation loss: 0.4000002464620016


100%|██████████| 296/296 [00:02<00:00, 122.31it/s, loss=0.02735839]  
100%|██████████| 93/93 [00:00<00:00, 204.43it/s, loss=0.27506873]  


Epoch 30 has training loss: 0.2900186526824876
Epoch 30 has validation loss: 0.39574567604840044


100%|██████████| 296/296 [00:02<00:00, 117.93it/s, loss=0.01868036]   
100%|██████████| 93/93 [00:00<00:00, 183.44it/s, loss=0.22826493]  


Epoch 31 has training loss: 0.2672977350929294
Epoch 31 has validation loss: 0.3939628173291723


100%|██████████| 296/296 [00:02<00:00, 118.50it/s, loss=0.045538258]  
100%|██████████| 93/93 [00:00<00:00, 191.38it/s, loss=0.2492629]   


Epoch 32 has training loss: 0.2808012358579396
Epoch 32 has validation loss: 0.3905574614607719


100%|██████████| 296/296 [00:02<00:00, 120.63it/s, loss=0.30491605]   
100%|██████████| 93/93 [00:00<00:00, 202.25it/s, loss=0.25866708]  


Epoch 33 has training loss: 0.28948332798500886
Epoch 33 has validation loss: 0.39168439242708425


100%|██████████| 296/296 [00:02<00:00, 116.27it/s, loss=0.17999291]   
100%|██████████| 93/93 [00:00<00:00, 198.28it/s, loss=0.264121]    


Epoch 34 has training loss: 0.28268117767426726
Epoch 34 has validation loss: 0.3930936834489506


100%|██████████| 296/296 [00:02<00:00, 119.33it/s, loss=0.32621935]   
100%|██████████| 93/93 [00:00<00:00, 184.16it/s, loss=0.26240873]  


Epoch 35 has training loss: 0.27653215808022896
Epoch 35 has validation loss: 0.3933848946790902


100%|██████████| 296/296 [00:02<00:00, 118.66it/s, loss=0.28260243]  
100%|██████████| 93/93 [00:00<00:00, 196.60it/s, loss=0.2174033]  


Epoch 36 has training loss: 0.3599310796105144
Epoch 36 has validation loss: 0.3784741396484997


100%|██████████| 296/296 [00:02<00:00, 124.16it/s, loss=0.00017605662] 
100%|██████████| 93/93 [00:00<00:00, 204.52it/s, loss=0.26171422] 


Epoch 37 has training loss: 0.34909186091298017
Epoch 37 has validation loss: 0.3617229946208016
Best loss at epoch 36


100%|██████████| 296/296 [00:02<00:00, 125.02it/s, loss=0.113753706] 
100%|██████████| 93/93 [00:00<00:00, 195.37it/s, loss=0.15543438]  


Epoch 38 has training loss: 0.3548523123958935
Epoch 38 has validation loss: 0.40741539104611324


100%|██████████| 296/296 [00:02<00:00, 125.29it/s, loss=0.46239066]   
100%|██████████| 93/93 [00:00<00:00, 203.07it/s, loss=0.18931992] 


Epoch 39 has training loss: 0.374315019608607
Epoch 39 has validation loss: 0.4065117324647364


100%|██████████| 296/296 [00:02<00:00, 122.81it/s, loss=0.24216805]   
100%|██████████| 93/93 [00:00<00:00, 195.36it/s, loss=0.21514946] 


Epoch 40 has training loss: 0.36920878846344624
Epoch 40 has validation loss: 0.3800595585427057


100%|██████████| 296/296 [00:02<00:00, 122.39it/s, loss=0.1264524]    
100%|██████████| 93/93 [00:00<00:00, 190.19it/s, loss=0.29086348]  


Epoch 41 has training loss: 0.33241748948989264
Epoch 41 has validation loss: 0.410923509851038


100%|██████████| 296/296 [00:02<00:00, 124.30it/s, loss=0.1761471]    
100%|██████████| 93/93 [00:00<00:00, 201.75it/s, loss=0.324228]    


Epoch 42 has training loss: 0.3492529916453055
Epoch 42 has validation loss: 0.37622797519709633


100%|██████████| 296/296 [00:02<00:00, 125.39it/s, loss=0.17316768]   
100%|██████████| 93/93 [00:00<00:00, 204.30it/s, loss=0.24502942] 


Epoch 43 has training loss: 0.3337815567029375
Epoch 43 has validation loss: 0.3767334167014367


100%|██████████| 296/296 [00:02<00:00, 126.21it/s, loss=0.08224913]   
100%|██████████| 93/93 [00:00<00:00, 205.31it/s, loss=0.34444135] 


Epoch 44 has training loss: 0.32899899569531194
Epoch 44 has validation loss: 0.39826456520966785


100%|██████████| 296/296 [00:02<00:00, 125.67it/s, loss=0.054311268]  
100%|██████████| 93/93 [00:00<00:00, 197.46it/s, loss=0.3364433]   


Epoch 45 has training loss: 0.33445907454265855
Epoch 45 has validation loss: 0.37148784009379243


100%|██████████| 296/296 [00:02<00:00, 125.85it/s, loss=0.0011614966] 
100%|██████████| 93/93 [00:00<00:00, 201.74it/s, loss=0.41886574] 


Epoch 46 has training loss: 0.318428639571283
Epoch 46 has validation loss: 0.3713548834565827


100%|██████████| 296/296 [00:02<00:00, 125.80it/s, loss=0.034179077]  
100%|██████████| 93/93 [00:00<00:00, 204.40it/s, loss=0.64597404] 


Epoch 47 has training loss: 0.3139184421639535
Epoch 47 has validation loss: 0.42267639894959746


100%|██████████| 296/296 [00:02<00:00, 125.06it/s, loss=0.15115117]   
100%|██████████| 93/93 [00:00<00:00, 199.14it/s, loss=0.4045418]  


Epoch 48 has training loss: 0.32195773320150645
Epoch 48 has validation loss: 0.382055815518333


100%|██████████| 296/296 [00:02<00:00, 125.17it/s, loss=0.19334863]   
100%|██████████| 93/93 [00:00<00:00, 197.82it/s, loss=0.3585338]  


Epoch 49 has training loss: 0.3257794246196313
Epoch 49 has validation loss: 0.3710971082190192


100%|██████████| 296/296 [00:02<00:00, 118.52it/s, loss=0.0082794055] 
100%|██████████| 93/93 [00:00<00:00, 203.07it/s, loss=0.23611782]  

Epoch 50 has training loss: 0.30431771248379086
Epoch 50 has validation loss: 0.39268589193209186





In [10]:
# Reload the best checkpoint written by the training loop.
# * map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
# * The file holds a fully pickled module, so weights_only=False is required on torch
#   releases whose torch.load defaults to weights_only=True.
# NOTE(security): unpickling executes arbitrary code — only load checkpoints you created yourself.
comment_classification_Model= torch.load(f'best_models/{huggingface_model_name_alias}.pkl', map_location=DEVICE, weights_only=False)
comment_classification_Model.to(DEVICE)
comment_classification_Model.eval()
predictions = []
pbar_validation = tqdm(val_torch_dataloader)
# Inference only: without no_grad every stored output would keep its autograd graph alive.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# Join the per-batch logits into one (n_rows, n_classes) numpy array.
predictions = torch.cat(predictions).cpu().numpy()

  0%|          | 0/93 [00:00<?, ?it/s]

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 93/93 [00:00<00:00, 284.39it/s]


In [11]:
# Validation split: logits -> class probabilities -> most likely class -> original label strings.
predictions_softmax = torch.softmax(torch.from_numpy(predictions), dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

# classification_report is already imported in the first cell.
print(classification_report(val_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.82      0.76      0.79        74
    Relevant       0.85      0.89      0.87       112

    accuracy                           0.84       186
   macro avg       0.84      0.82      0.83       186
weighted avg       0.84      0.84      0.84       186



In [12]:
# Logits of the reloaded model on the held-out test split.
predictions = []
pbar_test = tqdm(test_torch_dataloader)
# Inference only: without no_grad every stored output would keep its autograd graph alive.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_test:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# Join the per-batch logits into one (n_rows, n_classes) numpy array.
predictions = torch.cat(predictions).cpu().numpy()

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 80/80 [00:00<00:00, 311.29it/s]


In [13]:
# Test split: logits -> class probabilities -> most likely class -> original label strings.
predictions_softmax = torch.softmax(torch.from_numpy(predictions), dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

# classification_report is already imported in the first cell.
print(classification_report(test_educ_dataset["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.85      0.84      0.84        74
    Relevant       0.86      0.87      0.87        86

    accuracy                           0.86       160
   macro avg       0.86      0.85      0.86       160
weighted avg       0.86      0.86      0.86       160



In [14]:
# Logits of the reloaded model on the complete div corpus.
predictions = []
pbar_div = tqdm(div_dataloader_torch)
# Inference only: without no_grad every stored output would keep its autograd graph alive.
with torch.no_grad():
  for embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_div:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch))

# Join the per-batch logits into one (n_rows, n_classes) numpy array.
predictions = torch.cat(predictions).cpu().numpy()

  a_t = torch.nn.Softmax()(e_t)
100%|██████████| 252/252 [00:00<00:00, 317.38it/s]


In [15]:
# Div corpus: logits -> class probabilities -> most likely class -> original label strings.
predictions_softmax = torch.softmax(torch.from_numpy(predictions), dim=1).numpy()

predictions_indices = predictions_softmax.argmax(axis=1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

# classification_report is already imported in the first cell.
print(classification_report(dataset_div["label"], predicted_normal_labels))

              precision    recall  f1-score   support

  Irrelevant       0.96      0.68      0.80       393
    Relevant       0.45      0.90      0.60       111

    accuracy                           0.73       504
   macro avg       0.70      0.79      0.70       504
weighted avg       0.85      0.73      0.75       504

