In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pl
import re
from sklearn.metrics import classification_report
import bs4
from collections import defaultdict

# Base directory that the dataset file paths in the loading cell are built from.
WORKING_DIR='.'
# Torch device name.
# NOTE(review): DEVICE is reassigned to "cpu" in the seeding/split cell further down,
# before any visible use of it, so the 'cuda' value set here is not what later cells see.
DEVICE = 'cuda'

In [2]:
# Multiplier for the unlabelled pool: the subset-selection cell requests
# len(train) * times_unlabelled rows, and the training cell draws
# times_unlabelled // 2 unlabelled batches per labelled batch.
times_unlabelled = 4

# Each JSON file is read with orient='index', i.e. the top-level keys become the row index.
dataset_educ = pd.read_json(f'{WORKING_DIR}/Dataset_educ_1.1.json', orient = 'index')
dataset_div = pd.read_json(f'{WORKING_DIR}/Dataset_div2_final.json', orient = 'index')
dataset_unlabelled = pd.read_json(f'{WORKING_DIR}/Dataset_unlabelled.json', orient = 'index')


In [3]:
# Tempered segment: consumes characters lazily but can never run past a closing
# </a>, so a match always stays inside a single anchor element.
_WITHIN_ANCHOR = r'(?:(?!</a>).)*?'

# Matches one <a href ...>...</a> element that mentions /contest/.../submission/...
# The previous greedy `.*` version started at the first "<a href" of a line and ran
# to the last "</a>", deleting unrelated links and the text between them.
regex_link_ful = re.compile(
    r'<a href' + _WITHIN_ANCHOR + r'/contest/' + _WITHIN_ANCHOR + r'/submission/' + _WITHIN_ANCHOR + r'</a>'
)

# Matches a <code>...</code> element, including ones that span several lines.
# DOTALL + `.*?` accepts exactly the same text as the old `(\s|.)*?`, but without
# the ambiguous alternation that backtracks exponentially on an unterminated
# "<code>" followed by whitespace.
code_regex = re.compile(r'<code>.*?</code>', re.DOTALL)

def preprocess_for_transfomers(texts):
  """Turn raw HTML comment bodies into plain text for the transformer backbone.

  For every text: each <code>...</code> element is replaced by ' (code) ', each
  anchor pointing at a contest submission is replaced by ' (link to problem) ',
  and the remaining markup is stripped with BeautifulSoup.

  Args:
    texts: iterable of HTML strings.

  Returns:
    A list with one plain-text string per input, in input order.
  """
  preprocessed_texts = []
  for t in texts:
    t_codes = code_regex.sub(' (code) ', t)
    t_link = regex_link_ful.sub(' (link to problem) ', t_codes)
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed (and warns), which makes the output environment-dependent.
    bs = bs4.BeautifulSoup(t_link, 'html.parser')
    preprocessed_texts.append(bs.text)

  return preprocessed_texts

# Binarise the labelled datasets: every label other than 'Irrelevant' becomes 'Relevant'.
for _labelled_frame in (dataset_educ, dataset_div):
  _labelled_frame.loc[_labelled_frame['label'] != 'Irrelevant', 'label'] = 'Relevant'

# Attach the cleaned, plain-text version of each comment as a new column.
for _frame in (dataset_educ, dataset_div, dataset_unlabelled):
  preprocessed_text = preprocess_for_transfomers(_frame['text'])
  _frame["preprocessed_text"] = preprocessed_text

In [4]:
TREE_FATHER_PATH_LENGTH = 3

def compute_father_indices_pd(df, path_length=None):
  """Map every row of `df` to the row labels of its closest ancestors.

  `df` must have an `id` column and a `father_id` column. For each row the chain
  of fathers is followed for `path_length` steps. Each step contributes the row
  label (index value) of the ancestor, or -1 when the father is not present in
  `df`. The list is returned farthest ancestor first, so the direct father is
  the last element and any -1 padding sits at the front.

  Args:
    df: DataFrame with `id` and `father_id` columns.
    path_length: number of ancestors per row; defaults to TREE_FATHER_PATH_LENGTH
      (read at call time).

  Returns:
    dict: row label -> list of `path_length` entries (row labels or -1).
  """
  if path_length is None:
    path_length = TREE_FATHER_PATH_LENGTH

  comment_ids = df['id'].tolist()
  # id -> father id; with duplicate ids the last row wins.
  father_tree = dict(zip(comment_ids, df['father_id'].tolist()))

  # id -> row label of the first row carrying that id. One pass here replaces a
  # full `df[df.id == x]` scan per ancestor lookup (O(n) instead of O(n^2)).
  row_label_of_id = {}
  for row_label, comment_id in zip(df.index, comment_ids):
    row_label_of_id.setdefault(comment_id, row_label)

  comment_father_indices = {}
  for row_label, comment_id in zip(df.index, comment_ids):
    indices = []
    last_ind = comment_id
    for _ in range(path_length):
      if father_tree[last_ind] not in father_tree:
        # Chain ends here; last_ind is left unchanged, so every remaining step
        # also lands in this branch and pads with -1.
        indices.append(-1)
        continue
      if last_ind != -1:
        last_ind = father_tree[last_ind]

      if last_ind != -1:
        indices.append(row_label_of_id[last_ind])
      else:
        indices.append(last_ind)
    indices.reverse()
    comment_father_indices[row_label] = indices
  return comment_father_indices

# One ancestor map per dataset: row label -> list of TREE_FATHER_PATH_LENGTH
# entries (ancestor row labels, or -1 where no ancestor exists in that frame).
father_indices_educ = compute_father_indices_pd(dataset_educ)
father_indices_div = compute_father_indices_pd(dataset_div)
father_indices_unlabelled = compute_father_indices_pd(dataset_unlabelled)

In [5]:
from embeddings_generation.utils import *

# Backbone checkpoint and the two cache aliases derived from its base name.
huggingface_model_name = "finetuned-128bert-base"
_model_basename = huggingface_model_name.split("/")[-1]
huggingface_model_name_alias = _model_basename + "_educ"
huggingface_model_name_alias_div = _model_basename + '_div'

# Educational dataset: compute and persist the embeddings only when no cache exists, then load.
_educ_cached = embedding_already_persisted(huggingface_model_name_alias)
if not _educ_cached:
    persist_embeddings(
        dataset_educ["preprocessed_text"],
        huggingface_model_name,
        huggingface_model_name_alias,
        dataset_educ.index,
    )
embeddings_educ = load_embeddings(huggingface_model_name_alias)

# Div dataset: same cache-or-compute flow under its own alias.
_div_cached = embedding_already_persisted(huggingface_model_name_alias_div)
if not _div_cached:
    persist_embeddings(
        dataset_div["preprocessed_text"],
        huggingface_model_name,
        huggingface_model_name_alias_div,
        dataset_div.index,
    )
embeddings_div = load_embeddings(huggingface_model_name_alias_div)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader
import numpy as np
import random
RANDOM_SEED = 443
# NOTE(review): this overrides the DEVICE = 'cuda' set in the first cell; every
# later `.to(DEVICE)` therefore targets the CPU.
DEVICE = "cpu"

# Seed every RNG involved so the shuffle below (and model init later) is repeatable.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Rows are grouped by the part of their index label before the first "?" and the
# split is made on whole groups, so one group never straddles two splits.
groups = list(dataset_educ.groupby(lambda k : k.split("?")[0]))
random.shuffle(groups)

# First 10 shuffled groups train, next 3 validate, the remainder test.
train_groups = groups[:10]
validation_groups = groups[10:13]
test_groups = groups[13:]

train_educ_dataset = dataset_educ.loc[[idx for _, g in train_groups for idx in g.index.tolist()]]
val_educ_dataset = dataset_educ.loc[[idx for _, g in validation_groups for idx in g.index.tolist()]]
test_educ_dataset = dataset_educ.loc[[idx for _, g in test_groups for idx in g.index.tolist()]]

from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
# Fit ONCE on the full label vocabulary and only `transform` afterwards.
# Re-fitting per split (fit_transform on each) silently changes the
# label -> integer mapping whenever a split is missing one of the classes,
# which would make train/val/test/div targets mutually inconsistent.
labelEncoder.fit(pd.concat([dataset_educ["label"], dataset_div["label"]]))

train_encoded_labels = labelEncoder.transform(train_educ_dataset["label"])
val_encoded_labels = labelEncoder.transform(val_educ_dataset["label"])
test_encoded_labels = labelEncoder.transform(test_educ_dataset["label"])

div_encoded_labels = labelEncoder.transform(dataset_div["label"])

# Number of real label classes (the classifier later adds one extra output on top).
num_classes = len(labelEncoder.classes_)

In [7]:
def choose_valid_subset(df_unlabelled:pd.DataFrame, num, father_indices=None):
    """Pick at most `num` rows of `df_unlabelled`, pulling in their ancestors.

    Rows are visited in a fixed pseudo-random order (random_state=42). Each
    visited row is selected and then its ancestors are added, nearest first,
    for as long as the budget `num` allows. Afterwards every selected row gets
    a copy of its ancestor list in which ancestors that were not selected are
    replaced by -1.

    Args:
        df_unlabelled: frame to sample from; its index labels are the keys of
            `father_indices`.
        num: maximum number of rows to select.
        father_indices: row label -> ancestor list (farthest first, -1 padded).
            Defaults to the module-level `father_indices_unlabelled`.

    Returns:
        (subset, fathers): the selected rows in selection order, and a dict
        row label -> ancestor list restricted to the selected rows.
    """
    if father_indices is None:
        father_indices = father_indices_unlabelled

    random_df = df_unlabelled.sample(frac = 1, random_state = 42)
    # Ancestors may already have been dropped from the frame by an earlier call.
    available = set(df_unlabelled.index)

    # A dict is used as an insertion-ordered set so the row order of the result
    # is reproducible (iteration order of a set of strings depends on hashing).
    selected_indices = {}
    for index in random_df.iloc[:num].index:
        if len(selected_indices) >= num:
            break
        selected_indices[index] = None

        # Nearest ancestor first. Stop at the first missing ancestor or when the
        # budget is used up. (The previous `< num` test was inverted: it stopped
        # while there was still room and only added ancestors once the set was
        # already full, overshooting `num`.)
        for f_idx in father_indices[index][::-1]:
            if f_idx == -1 or len(selected_indices) >= num:
                break
            if f_idx not in available:
                break
            selected_indices[f_idx] = None

    random_fathers = {}
    for idx in selected_indices:
        random_fathers[idx] = [
            f_idx if f_idx in selected_indices else -1
            for f_idx in father_indices[idx]
        ]

    return df_unlabelled.loc[list(selected_indices)], random_fathers

# Carve out a pool the size of the training split; it is fed to
# BlogCommentErroneousDataset further down.
dataset_erroneous, father_indices_erroneous = choose_valid_subset(dataset_unlabelled, len(train_educ_dataset))
# Remove those rows in place so the two pools do not overlap.
# NOTE(review): together with the rebinding below, this makes the cell
# non-idempotent; re-running it without reloading dataset_unlabelled shrinks the pool again.
dataset_unlabelled.drop(index= dataset_erroneous.index, inplace=True)

# Keep times_unlabelled rows per training row as the unlabelled pool, and replace
# the ancestor map with the one restricted to the kept rows.
dataset_unlabelled, father_indices_unlabelled = choose_valid_subset(dataset_unlabelled, len(train_educ_dataset) * times_unlabelled)

In [8]:
from transformers import AutoModel, AutoTokenizer
from textattack.augmentation import EasyDataAugmenter, CharSwapAugmenter, WordNetAugmenter

# textattack augmenters shared by the two augmentation helpers below.
easydata_augmenter = EasyDataAugmenter(transformations_per_example= 1)
charswap_augmenter = CharSwapAugmenter()
wordnet_augmenter = WordNetAugmenter()

def weak_augmentation(text):
    """Return `text` after a WordNet augmentation followed by a character-swap augmentation.

    In both steps the first candidate returned by the augmenter is used.
    """
    wordnet_variant = wordnet_augmenter.augment(text)[0]
    charswap_candidates = charswap_augmenter.augment(wordnet_variant)
    return charswap_candidates[0]

def strong_augmentation(text):
    """Return the first candidate produced by the EasyDataAugmenter for `text`."""
    candidates = easydata_augmenter.augment(text)
    return candidates[0]

# Tokenizer and backbone from the same checkpoint; the backbone exposes all hidden states.
backbone_tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)
backbone_model = AutoModel.from_pretrained(huggingface_model_name, output_hidden_states = True)
backbone_model = backbone_model.to(DEVICE)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Xzzyaa23\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Xzzyaa23\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of the model checkpoint at finetuned-128bert-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a mod

In [9]:
class BlogCommentDataset(Dataset):
    """Labelled comments backed by pre-computed embeddings.

    Each item is a 5-tuple:
    (position, comment embedding, stacked ancestor embeddings, ancestor mask, label).
    A missing ancestor (-1) contributes a zero vector and a mask value of 1;
    a present ancestor contributes its aggregated embedding and a mask value of 0.
    """

    def __init__(self, dataset:pd.DataFrame, embeddings:dict, father_indices:dict, num_last_layers_embeddings_agg ,labels):
        self.dataset = dataset
        self.embeddings = embeddings
        self.father_indices = father_indices
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg
        self.labels = labels

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        row_key = self.dataset.index[index]
        own_vector = self._agg_emb(self.embeddings[row_key])
        ancestor_keys = self.father_indices[row_key]

        # 1 marks a padded (missing) ancestor slot, 0 a real one.
        pad_flags = [1 if key == -1 else 0 for key in ancestor_keys]
        ancestor_vectors = torch.stack([
            torch.zeros(own_vector.shape[0]) if key == -1 else self._agg_emb(self.embeddings[key])
            for key in ancestor_keys
        ])

        position = torch.tensor(index)
        mask_tensor = torch.tensor(pad_flags, dtype = torch.float32).to(DEVICE)
        label_tensor = torch.tensor(self.labels[index], dtype = torch.long).to(DEVICE)
        return position, own_vector.to(DEVICE), ancestor_vectors.to(DEVICE), mask_tensor, label_tensor

    def _agg_emb(self, embedding):
        """Average the last `num_last_layers_embeddings_agg` entries along axis 1 of element 0."""
        layers = np.array(embedding, dtype=np.float32)
        last_layers = layers[0, -self.num_last_layers_embeddings_agg:, :]
        return torch.from_numpy(last_layers.mean(0))
    

class _AugmentedCommentDataset(Dataset):
    """Shared machinery for datasets that embed augmented text on the fly.

    Holds the frame, the ancestor map, the tokenizer and the backbone, and knows
    how to turn (comment, ancestors) into embeddings aligned with the ancestor mask.
    """

    def __init__(self, dataset:pd.DataFrame, father_indices:dict, num_last_layers_embeddings_agg, tokenizer, model):
        self.backbone_tokenizer = tokenizer
        self.backbone_model = model

        self.dataset = dataset
        self.father_indices = father_indices
        self.num_last_layers_embeddings_agg = num_last_layers_embeddings_agg

    def __len__(self):
        return len(self.dataset)

    def _collect_texts_and_masks(self, idx):
        """Return ([comment text, *present ancestor texts], mask) for position `idx`.

        The mask has one entry per ancestor slot: 1 for a missing ancestor (-1),
        0 for a present one. Ancestor texts keep the slot order.
        """
        df_index = self.dataset.index[idx]
        texts = [self.dataset.loc[df_index]['preprocessed_text']]
        masks = []
        for f_idx in self.father_indices[str(df_index)]:
            if f_idx == -1:
                masks.append(1)
            else:
                texts.append(self.dataset.loc[f_idx]['preprocessed_text'])
                masks.append(0)
        return texts, masks

    def _embed_with_augmentation(self, texts, masks, aug_fn):
        """Embed augmented texts and place ancestor embeddings in their own slots.

        Zero rows go exactly where the mask says an ancestor is missing. Simply
        prepending the padding is only correct when every -1 is at the front,
        which does not hold once the ancestor map has been restricted to a subset.
        """
        comment_embedding, present_fathers = self._generate_embeddings_using_augmentation(texts, 0, aug_fn)
        fathers_embedding = torch.zeros(
            (len(masks), comment_embedding.shape[-1]),
            dtype=comment_embedding.dtype,
            device=comment_embedding.device,
        )
        present_slots = [slot for slot, flag in enumerate(masks) if flag == 0]
        if present_slots:
            fathers_embedding[present_slots] = present_fathers
        return comment_embedding, fathers_embedding

    def _generate_embeddings_using_augmentation(self, text, pad_start, aug_fn):
        """Embed `aug_fn(t)` for every t in `text` with the backbone.

        Takes the first-token vector of every hidden state, averages the last
        `num_last_layers_embeddings_agg` of them, and returns
        (embedding of text[0], `pad_start` zero rows followed by the embeddings of text[1:]).
        """
        tokens = self.backbone_tokenizer([aug_fn(t) for t in text], return_tensors = 'pt', truncation = True, padding = 'max_length', max_length = 500).to(DEVICE)

        # The backbone is only a feature extractor here; its parameters are not
        # optimised, so building an autograd graph would only cost memory.
        with torch.no_grad():
            hidden_states = self.backbone_model(**tokens).hidden_states

        embeddings = [state[:, 0, :] for state in hidden_states]
        embeddings = torch.stack(embeddings).permute(1,0,2)[:,-self.num_last_layers_embeddings_agg:,:].mean(dim = 1).clone()

        # Derive the width from the backbone output instead of assuming 768.
        padding = torch.zeros((pad_start, embeddings.shape[-1]), dtype=embeddings.dtype, device=embeddings.device)
        return embeddings[0], torch.concat([padding, embeddings[1:]])


class BlogCommentUnlabelledDataset(_AugmentedCommentDataset):
    """Unlabelled comments, embedded twice per item: weakly and strongly augmented.

    Each item is a 6-tuple:
    (position, weak comment emb, weak ancestor embs, strong comment emb,
     strong ancestor embs, ancestor mask).
    """

    def __getitem__(self, idx):
        texts, masks = self._collect_texts_and_masks(idx)
        comment_embedding_softaug, fathers_embedding_softaug = self._embed_with_augmentation(texts, masks, weak_augmentation)
        comment_embedding_hardaug, fathers_embedding_hardaug = self._embed_with_augmentation(texts, masks, strong_augmentation)

        return torch.tensor(idx), comment_embedding_softaug, fathers_embedding_softaug, comment_embedding_hardaug, fathers_embedding_hardaug, torch.tensor(masks, dtype = torch.float32).to(DEVICE)


class BlogCommentErroneousDataset(_AugmentedCommentDataset):
    """Comments embedded with strong augmentation and labelled with the extra class.

    Each item is a 5-tuple:
    (position, comment emb, ancestor embs, ancestor mask, label), where the
    label is always the module-level `num_classes`.
    """

    def __getitem__(self, idx):
        texts, masks = self._collect_texts_and_masks(idx)
        comment_embedding_strongaug, fathers_embedding_strongaug = self._embed_with_augmentation(texts, masks, strong_augmentation)

        return torch.tensor(idx), comment_embedding_strongaug, fathers_embedding_strongaug, torch.tensor(masks, dtype = torch.float32).to(DEVICE), torch.tensor(num_classes, dtype = torch.long).to(DEVICE)

# Labelled splits: pre-computed embeddings, last 3 layers averaged, batches of 2.
train_torch_dataset = BlogCommentDataset(
    dataset=train_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=train_encoded_labels,
)
train_torch_dataloader = DataLoader(train_torch_dataset, batch_size=2, shuffle=True)

val_torch_dataset = BlogCommentDataset(
    dataset=val_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=val_encoded_labels,
)
val_torch_dataloader = DataLoader(val_torch_dataset, batch_size=2, shuffle=False)

test_torch_dataset = BlogCommentDataset(
    dataset=test_educ_dataset,
    embeddings=embeddings_educ,
    father_indices=father_indices_educ,
    num_last_layers_embeddings_agg=3,
    labels=test_encoded_labels,
)
test_torch_dataloader = DataLoader(test_torch_dataset, batch_size=2, shuffle=False)

div_dataset_torch = BlogCommentDataset(
    dataset=dataset_div,
    embeddings=embeddings_div,
    father_indices=father_indices_div,
    num_last_layers_embeddings_agg=3,
    labels=div_encoded_labels,
)
div_dataloader_torch = DataLoader(div_dataset_torch, batch_size=2, shuffle=False)

# Pools embedded on the fly through the backbone (augmented text).
unlabelled_torch_dataset = BlogCommentUnlabelledDataset(
    dataset=dataset_unlabelled,
    father_indices=father_indices_unlabelled,
    num_last_layers_embeddings_agg=3,
    tokenizer=backbone_tokenizer,
    model=backbone_model,
)
unlabelled_dataloader_torch = DataLoader(unlabelled_torch_dataset, batch_size=2, shuffle=True)

erroneous_torch_dataset = BlogCommentErroneousDataset(
    dataset=dataset_erroneous,
    father_indices=father_indices_erroneous,
    num_last_layers_embeddings_agg=3,
    tokenizer=backbone_tokenizer,
    model=backbone_model,
)
erroneous_dataloader_torch = DataLoader(erroneous_torch_dataset, batch_size=2, shuffle=True)

In [26]:
from tqdm import tqdm
import math
class CommentClassificationModel(torch.nn.Module):
    """Classifier over a comment embedding and an attention summary of its ancestors.

    The comment embedding is projected and used to score each ancestor
    embedding; masked ancestors are pushed to a very negative score so they get
    zero attention weight. The attention-weighted ancestor values are
    concatenated with a projection of the comment and passed through a hidden
    layer to the output logits.

    Args:
        nrLabels: number of output logits.
        embedding_dim: width of the incoming embeddings (default 768).
        hidden_dim: width of the projections and the hidden layer (default 128).
    """

    def __init__(self, nrLabels, embedding_dim=768, hidden_dim=128):
        super(CommentClassificationModel, self).__init__()
        # Layer creation order is kept as-is so seeded initialisation is unchanged.
        self.k = torch.nn.Linear(embedding_dim, hidden_dim)
        self.q = torch.nn.Linear(embedding_dim, hidden_dim)
        self.v = torch.nn.Linear(embedding_dim, hidden_dim)

        self.comment_proj = torch.nn.Linear(embedding_dim, hidden_dim)
        self.relu = torch.nn.ReLU()
        self.output = torch.nn.Linear(hidden_dim, nrLabels)
        self.hidden1 = torch.nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = torch.nn.Dropout()

        # Score divisor; equals the original constant sqrt(256) for hidden_dim=128.
        # NOTE(review): conventional scaled dot-product attention divides by
        # sqrt(hidden_dim); kept as sqrt(2 * hidden_dim) to preserve behaviour.
        self.attention_scale = math.sqrt(2 * hidden_dim)

    def forward(self, x, fathers_x, mask):
        """Return logits of shape (batch, nrLabels).

        Args:
            x: comment embeddings, (batch, embedding_dim).
            fathers_x: ancestor embeddings, (batch, n_fathers, embedding_dim).
            mask: (batch, n_fathers); 1 marks an ancestor slot to ignore, 0 a real one.
        """
        x = self.dropout(x)
        key = self.k(x)
        queries = self.q(fathers_x)
        values = self.v(fathers_x)

        # (batch, n_fathers, hidden) @ (batch, hidden, 1) -> one score per ancestor.
        key = torch.unsqueeze(key, -1)
        e_t = torch.bmm(queries, key) / self.attention_scale
        e_t = torch.squeeze(e_t, -1)

        # Masked slots get a hugely negative score, i.e. zero weight after softmax.
        e_t = e_t + mask * -2e9
        a_t = torch.softmax(e_t, -1)

        a_t = torch.unsqueeze(a_t, -1)
        average_att = torch.bmm(a_t.permute(0, 2, 1), values)
        average_att = average_att.squeeze(1)

        com_proj = self.comment_proj(x)

        h1 = self.dropout(self.relu(torch.cat([average_att, com_proj], -1)))
        h2 = self.dropout(self.relu(self.hidden1(h1)))

        return self.output(h2)

# One output more than the encoded label classes: BlogCommentErroneousDataset
# labels its items with index num_classes, i.e. that extra output.
comment_classification_Model = CommentClassificationModel(len(labelEncoder.classes_) + 1)
comment_classification_Model.to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
# AdamW with its default hyper-parameters, over the classifier's parameters only.
optimizer = torch.optim.AdamW(comment_classification_Model.parameters())

# Per-sample history filled in by the training loop: key -> tensor of shape
# (NrUnlabelledEpochs, num_classes + 1) holding calculate_pm outputs per epoch.
pm_t = {}

def calculate_pm(logits):
  """Return, for every row and class, the class logit minus the largest other logit.

  Args:
    logits: 2-D tensor (rows, classes).

  Returns:
    Tensor of the same shape, created with torch.zeros(logits.shape); entry
    [r, c] is logits[r, c] - max(logits[r, other] for other != c).
  """
  class_count = logits.shape[1]
  margins = torch.zeros(logits.shape)

  for target in range(class_count):
    rivals = [other for other in range(class_count) if other != target]
    margins[:, target] = logits[:, target]
    margins[:, target] -= logits[:, rivals].amax(-1)
  return margins


# Starting threshold for the averaged margins; with -inf any finite margin passes
# the `> apm_threshold` test until the training loop recomputes the threshold.
apm_threshold = -torch.inf
# Softmax probability that a weakly-augmented prediction must exceed to contribute
# to the unlabelled loss.
fix_match_threshold = 0.70
# Weight of the unlabelled and erroneous loss terms in the overall loss.
lambda_importance = 1

# Number of passes of the outer training loop; also the first dimension of every
# pm_t history tensor.
NrUnlabelledEpochs = 5

# Unlabelled batches drawn per labelled batch.
unlabelled_batches_accumulation = times_unlabelled // 2

# Semi-supervised training loop.
# Per labelled batch: one supervised step on the labelled data, one on the
# "erroneous" pool (target = extra class), and `unlabelled_batches_accumulation`
# unlabelled batches trained FixMatch-style: the weak-augmentation prediction
# provides a pseudo-label, and the strong-augmentation logits are trained towards
# it when the prediction is confident and its averaged margin beats the threshold.
# After each epoch the model is validated and apm_threshold is recomputed from the
# erroneous pool's margin history.
for u_epoch in range(NrUnlabelledEpochs):
  training_average_loss = 0
  unlabelled_average_loss = 0
  erroneous_average_loss = 0
  overall_average_loss = 0
  num_train_batches = 0

  # Public API instead of the private DataLoader._get_iterator().
  unlabelled_dataloader_torch_iterator = iter(unlabelled_dataloader_torch)

  comment_classification_Model.train()
  pbar_train = tqdm(zip(train_torch_dataloader, erroneous_dataloader_torch), total=len(train_torch_dataloader))
  for t_batch, e_batch in pbar_train:
    optimizer.zero_grad()
    t_indexes_batch, t_embeddings_batch, t_embeddings_fathers_batch, t_mask_batch, t_labels_batch = t_batch
    e_indexes_batch, e_embeddings_batch, e_embeddings_fathers_batch, e_mask_batch, e_labels_batch = e_batch

    t_yhat = comment_classification_Model(t_embeddings_batch, t_embeddings_fathers_batch, t_mask_batch)
    e_yhat = comment_classification_Model(e_embeddings_batch, e_embeddings_fathers_batch, e_mask_batch)

    loss_t = criterion(t_yhat, t_labels_batch)
    loss_e = criterion(e_yhat, e_labels_batch)

    # Record this epoch's margins for every erroneous sample.
    for idx, pm in zip(e_indexes_batch, calculate_pm(e_yhat.detach().cpu())):
      e_idx = f'{int(idx)}_e'
      if e_idx not in pm_t:
        pm_t[e_idx] = torch.zeros((NrUnlabelledEpochs, num_classes + 1), dtype=torch.float32, device='cpu')
      pm_t[e_idx][u_epoch] = pm

    loss_u = None
    for _ in range(unlabelled_batches_accumulation):
      try:
        u_batch = next(unlabelled_dataloader_torch_iterator)
      except StopIteration:
        # The unlabelled pool can be smaller than planned; start another pass
        # over it instead of aborting the epoch.
        unlabelled_dataloader_torch_iterator = iter(unlabelled_dataloader_torch)
        u_batch = next(unlabelled_dataloader_torch_iterator)
      u_indexes_batch, u_softaug_embeddings_batch, u_softaug_embedding_fathers, u_hardaug_embeddings_batch, u_hardaug_embedding_fathers, u_mask_batch = u_batch

      u_soft_yhat = comment_classification_Model(u_softaug_embeddings_batch, u_softaug_embedding_fathers, u_mask_batch)
      u_hard_yhat = comment_classification_Model(u_hardaug_embeddings_batch, u_hardaug_embedding_fathers, u_mask_batch)

      # Store this epoch's margins and average them over the epochs seen so far.
      apm_u = []
      for idx, pm in zip(u_indexes_batch, calculate_pm(u_soft_yhat.detach().cpu())):
        u_idx = f'{int(idx)}_u'
        if u_idx not in pm_t:
          pm_t[u_idx] = torch.zeros((NrUnlabelledEpochs, num_classes + 1), dtype=torch.float32, device='cpu')
        pm_t[u_idx][u_epoch] = pm
        apm_u.append(pm_t[u_idx][:u_epoch + 1, :].mean(0))
      apm_u = torch.stack(apm_u)

      # Pseudo-label = most probable class under weak augmentation.
      pred_u_soft_prob = torch.softmax(u_soft_yhat.detach(), -1).cpu()
      u_confidence, pred_u_soft_class = pred_u_soft_prob.max(-1)
      u_apm_of_prediction = torch.gather(apm_u, -1, pred_u_soft_class.unsqueeze(-1)).squeeze(-1)

      # One boolean per sample: confident prediction AND margin above threshold.
      mask_u = (u_confidence > fix_match_threshold) & (u_apm_of_prediction > apm_threshold)
      if mask_u.any():
        mask_u = mask_u.to(u_hard_yhat.device)
        pseudo_labels = pred_u_soft_class.to(u_hard_yhat.device)
        # Cross-entropy of the strong-augmentation logits against the pseudo-labels.
        # (Previously the loss was called with weak logits as input and strong
        # logits as target, both flattened by a 2-D mask.)
        batch_loss_u = criterion(u_hard_yhat[mask_u], pseudo_labels[mask_u])
        loss_u = batch_loss_u if loss_u is None else loss_u + batch_loss_u

    overall_loss = loss_t + lambda_importance * loss_e
    if loss_u is not None:
      overall_loss = overall_loss + lambda_importance * loss_u

    overall_loss.backward()
    optimizer.step()

    loss_t = loss_t.item()
    loss_e = loss_e.item()
    overall_loss = overall_loss.item()
    if loss_u is None:
      # -1 is only a display marker for "no confident unlabelled sample";
      # it is not added to the running average.
      loss_u = -1
    else:
      loss_u = loss_u.item()
      unlabelled_average_loss += loss_u

    pbar_train.set_postfix({'loss_t': loss_t, "loss_u": loss_u, "loss_e":loss_e, "overall_loss":overall_loss})

    training_average_loss += loss_t
    erroneous_average_loss += loss_e
    overall_average_loss += overall_loss
    num_train_batches += 1

  pbar_validation = tqdm(val_torch_dataloader)
  validation_average_loss = 0
  num_validation_batches = 0
  comment_classification_Model.eval()
  with torch.no_grad():
    # BlogCommentDataset yields 5-tuples; the leading sample index is not needed here.
    for _, embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
      yhat = comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch)
      loss = criterion(yhat, labels_batch).item()

      validation_average_loss += loss
      num_validation_batches += 1

      pbar_validation.set_postfix({'loss': loss})

  print(f'Epoch {u_epoch + 1} has training loss: {training_average_loss / max(num_train_batches, 1)}')
  print(f'Epoch {u_epoch + 1} has validation loss: {validation_average_loss / max(num_validation_batches, 1)}')

  # New threshold: 95th percentile, over the erroneous samples, of their averaged
  # margin for the extra class. Each entry is a scalar, so the stack is 1-D and
  # the quantile is taken over all of it.
  apm_e = [
    val[:u_epoch + 1, num_classes].mean(0)
    for key, val in pm_t.items()
    if key.endswith('_e')
  ]
  if apm_e:
    apm_threshold = torch.stack(apm_e).quantile(0.95)


  1%|          | 3/296 [01:51<3:01:24, 37.15s/it, loss_t=1.1393039, loss_u=-1, loss_e=1.0511429, overall_loss=2.1904469]


KeyboardInterrupt: 

In [None]:
# Collect the raw logits of the classifier over the validation split.
predictions = []
pbar_validation = tqdm(val_torch_dataloader)
# Make sure dropout is off even if training was interrupted mid-epoch.
comment_classification_Model.eval()
with torch.no_grad():
  # The dataset yields 5-tuples; the leading sample index is not needed here.
  for _, embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_validation:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch).cpu())

# (n_samples, n_logits) float array, rows in dataloader order.
predictions = torch.cat(predictions).numpy()

In [None]:
predictions_softmax = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions)).detach().numpy()

# The classifier has one output more than the label encoder knows about (the
# extra "erroneous" class). Choose among the real classes only; otherwise
# inverse_transform raises on an index it has never seen.
known_class_count = len(labelEncoder.classes_)
predictions_indices = np.argmax(predictions_softmax[:, :known_class_count], axis = 1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

print(classification_report(val_educ_dataset["label"], predicted_normal_labels))

In [None]:
# Collect the raw logits of the classifier over the test split.
predictions = []
pbar_test = tqdm(test_torch_dataloader)
# Make sure dropout is off even if training was interrupted mid-epoch.
comment_classification_Model.eval()
with torch.no_grad():
  # The dataset yields 5-tuples; the leading sample index is not needed here.
  for _, embeddings_batch, embedding_fathers, mask_batch, labels_batch in pbar_test:
    predictions.append(comment_classification_Model(embeddings_batch, embedding_fathers, mask_batch).cpu())

# (n_samples, n_logits) float array, rows in dataloader order.
predictions = torch.cat(predictions).numpy()

In [None]:
predictions_softmax = torch.nn.Softmax(dim=1)(torch.from_numpy(predictions)).detach().numpy()

# The classifier has one output more than the label encoder knows about (the
# extra "erroneous" class). Choose among the real classes only; otherwise
# inverse_transform raises on an index it has never seen.
known_class_count = len(labelEncoder.classes_)
predictions_indices = np.argmax(predictions_softmax[:, :known_class_count], axis = 1)
predicted_normal_labels = labelEncoder.inverse_transform(predictions_indices.ravel())

from sklearn.metrics import classification_report

print(classification_report(test_educ_dataset["label"], predicted_normal_labels))