<a href="https://colab.research.google.com/github/BecomeAllan/Bert_meta_learning_papers/blob/main/Meta_learning_EFL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Requirements

In [None]:
!pip install transformers==4.16.2
!pip install torchmetrics==0.8.0

!pip install matplotlib==3.5.1

In [None]:
%matplotlib inline
import torch.nn.functional as F
import torch.nn as nn
import math
import torch
import numpy as np
import pandas as pd
import time
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.manifold import TSNE
from copy import deepcopy, copy
import seaborn as sns
import matplotlib.pylab as plt
from pprint import pprint
import shutil
import datetime
import re
import json
from pathlib import Path

# Base seed for the notebook; used to build the torch generator below.
SEED = 2222

# Dedicated torch random generator seeded with SEED.
gen_seed = torch.Generator().manual_seed(SEED)

# Useful Functions

## Random

In [None]:
import os
import torch
import numpy as np
import random
import json, pickle

# Random seed function
def random_seed(value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and torch (CPU and current CUDA device)
    with ``value`` and switches cuDNN to deterministic mode.
    """
    # Python and NumPy generators.
    random.seed(value)
    np.random.seed(value)

    # torch generators (CPU, then CUDA).
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)

    # Ask cuDNN for deterministic kernels.
    torch.backends.cudnn.deterministic = True

# Batch creation function
def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
    """Yield lists of at most ``batch_size`` items taken from ``taskset``.

    ``taskset`` must support ``len()`` and integer indexing. When
    ``is_shuffle`` is true the visiting order is shuffled with ``random``;
    otherwise items are yielded in their original order. The last batch may
    be shorter than ``batch_size``.
    """
    order = list(range(len(taskset)))
    if is_shuffle:
        random.shuffle(order)

    total = len(taskset)
    for start in range(0, len(order), batch_size):
        stop = min(start + batch_size, total)
        yield [taskset[order[pos]] for pos in range(start, stop)]


## Diagnosis functions

In [None]:
def prepare_data(data, batch_size,tokenizer,max_seq_length,
                 input = 'text', output = 'label',
                 train_size_per_class = 5, global_datasets = False):
  """Split ``data`` into a small per-class training set and a test set, and wrap both in DataLoaders.

  Args:
    data: DataFrame holding the ``input`` and ``output`` columns.
    batch_size: mapping with the keys ``'train'`` and ``'test'``.
    tokenizer: tokenizer forwarded to ``SLR_DataSet``.
    max_seq_length: maximum sequence length forwarded to ``SLR_DataSet``.
    input: name of the text column.
    output: name of the label column; also used to stratify the training sample.
    train_size_per_class: number of rows sampled (without replacement) per label.
    global_datasets: kept for backward compatibility. It has no effect: the
      ``global`` declaration below applies to the whole function, so
      ``data_train`` / ``data_test`` are always published as module globals.

  Returns:
    ``(data_train_loader, data_test_loader, data_train, data_test)``.
  """
  # A `global` statement is function-wide no matter where it is written, so the
  # original `if global_datasets: global ...` always took effect. It is kept
  # unconditional because other cells read these two globals.
  global data_train, data_test

  data = data.reset_index(drop=True)

  # Stratified training sample: `train_size_per_class` rows for every label value.
  data_train = data.groupby(output).sample(train_size_per_class, replace=False)

  # Everything that was not sampled for training becomes the test set.
  in_train = data.index.isin(data_train.index)
  data_test = data[~in_train].reset_index()

  # Training dataset (rows shuffled once before wrapping).
  dataset_train = SLR_DataSet(
    data = data_train.sample(frac=1),
    input = input,
    output = output,
    tokenizer=tokenizer,
    max_seq_length =max_seq_length)

  # Test dataset.
  dataset_test = SLR_DataSet(
    data = data_test,
    input = input,
    output = output,
    tokenizer=tokenizer,
    max_seq_length =max_seq_length)

  data_train_loader = DataLoader(dataset_train,
                                 shuffle=True,
                                 batch_size=batch_size['train'])

  # A trailing batch with a single sample is dropped: downstream code squeezes
  # the label tensor, which would collapse a batch of one to a 0-d tensor.
  drop_last = len(dataset_test) % batch_size['test'] == 1
  data_test_loader = DataLoader(dataset_test,
                                batch_size=batch_size['test'],
                                drop_last=drop_last)

  return data_train_loader, data_test_loader, data_train, data_test


In [None]:
from tqdm import tqdm

def meta_train(data, model, device, Info, print_epoch =True, size_layer=0, Test_resource =None):
  """Run the meta-training loop over ``Info['meta_epoch']`` epochs.

  A ``Learner`` is built from ``model``, ``device`` and the entries of ``Info``.
  Every meta epoch a new ``MetaTask`` set is drawn from ``data``, split into
  batches of ``Info["outer_batch_size"]`` tasks and fed to the learner.
  When ``Test_resource`` is a DataFrame, a fixed test ``MetaTask`` set is also
  built and may be evaluated during training (only in the ``print_epoch`` branch).

  ``size_layer`` is not used by this function. Nothing is returned.
  """

  learner = Learner(model = model, device = device, **Info)
  
  # Testing tasks
  if isinstance(Test_resource, pd.DataFrame):
    test = MetaTask(Test_resource, num_task = 0, k_support=10, k_query=10,
                  training=False, **Info)


  # Release cached memory before training starts
  torch.clear_autocast_cache()
  gc.collect()
  torch.cuda.empty_cache()

  # Meta epoch
  for epoch in tqdm(range(Info['meta_epoch']), desc= "Meta epoch ", ncols=80):
      
      # Training tasks
      # NOTE(review): k_support receives Info['k_qry'] and k_query receives
      # Info['k_spt'] -- this looks swapped; confirm against MetaTask.
      train = MetaTask(data,
                      num_task = Info['num_task_train'],
                      k_support=Info['k_qry'],
                      k_query=Info['k_spt'], **Info)

      # Batches of tasks
      db = create_batch_of_tasks(train, is_shuffle = True, batch_size = Info["outer_batch_size"])

      if print_epoch:
      # Outer loop batch training
        for step, task_batch in enumerate(db):          
            print("\n-----------------Training Mode","Meta_epoch:", epoch ,"-----------------\n")
            # meta-feedforward
            acc = learner(task_batch, valid_train= print_epoch)
            print('Step:', step, '\ttraining Acc:', acc)
        if isinstance(Test_resource, pd.DataFrame):
          # Validating Model 
          # NOTE(review): this check runs after the step loop, so `step` holds the
          # index of the last batch. The condition is only true when
          # (epoch+1) % 4 == 0 and the epoch had exactly one batch; `step` is
          # undefined if `db` yielded nothing. Confirm the intended schedule.
          if ((epoch+1) % 4) + step == 0:
              # Fixed seed so every validation round is comparable
              random_seed(123)
              print("\n-----------------Testing Mode-----------------\n")
              db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
              acc_all_test = []

              # Looping testing tasks
              for test_batch in db_test:
                  acc = learner(test_batch, training = False)
                  acc_all_test.append(acc)

              print('Test acc:', np.mean(acc_all_test))
              del acc_all_test, db_test

              # Restarting training randomly (seed is derived from the clock, range 0-9)
              random_seed(int(time.time() % 10))
          
        
      else:
        for step, task_batch in enumerate(db):
            # NOTE(review): the second positional argument of Learner.forward is
            # `training`, so this call passes training=print_epoch (False in this
            # branch). Confirm that this is intended.
            acc = learner(task_batch, print_epoch, valid_train= print_epoch)

  # Release cached memory once training is done
  torch.clear_autocast_cache()
  gc.collect()
  torch.cuda.empty_cache()

In [None]:
def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr = 1, print_info = True, name = 'name'):
  """Fine-tune a copy of ``model`` on the training loader, then score the test loader.

  The model is expected to return ``(loss, [feature, logit], prediction)`` when
  called with ``(input_ids, attention_mask, token_type_ids, labels=...)``; each
  batch must unpack into those three inputs plus the label tensor.

  Args:
    data_train_loader: loader used for the ``epoch`` training passes.
    data_test_loader: loader scored after training.
    model: model to copy; the original is left untouched.
    device: device the copy is trained and evaluated on.
    epoch: number of passes over ``data_train_loader``.
    lr: learning rate of the Adam optimizer.
    print_info: print the mean loss on even epochs and the mean test accuracy.
    name: suffix for the evaluation progress-bar description.

  Returns:
    ``(logits, X_embedded, labels, features)``: float32 logits tensor, the 2-D
    t-SNE embedding of the features (NumPy array), integer label tensor and the
    float32 feature tensor, all for the test loader.
  """
  # Train a private copy so the caller's model keeps its weights.
  model_meta = deepcopy(model)
  optimizer = Adam(model_meta.parameters(), lr=lr)

  model_meta.to(device)
  model_meta.train()

  # Task training loop
  for i in range(epoch):
      all_loss = []

      # Inner training batches (support set)
      for batch in data_train_loader:
          input_ids, attention_mask, q_token_type_ids, label_id = (t.to(device) for t in batch)

          # Forward pass
          loss, _, _ = model_meta(input_ids, attention_mask, q_token_type_ids, labels = label_id.squeeze())

          # Backward pass and parameter update
          loss.backward()
          optimizer.step()
          optimizer.zero_grad()

          all_loss.append(loss.item())

      if i % 2 == 0 and print_info:
          print("Loss: ", np.mean(all_loss))

  # Prediction on the test set
  model_meta.eval()
  all_acc = []
  features = []
  labels = []
  predi_logit = []

  with torch.no_grad():
      for batch in tqdm(data_test_loader,
                        desc="Test validation | " + name,
                        ncols=80):
        input_ids, attention_mask, q_token_type_ids, label_id = (t.to(device) for t in batch)

        _, feature, prediction = model_meta(input_ids, attention_mask, q_token_type_ids, labels = label_id.squeeze())

        prediction = prediction.detach().cpu().squeeze()
        label_id = label_id.detach().cpu()

        # reshape(-1) keeps a 1-D array even for a batch of one, so the
        # concatenation below never receives a 0-d array.
        labels.append(label_id.numpy().reshape(-1))
        features.append(feature[0].detach().cpu().numpy())
        predi_logit.append(feature[1].detach().cpu().numpy())

        all_acc.append(fn.accuracy(prediction, label_id).item())

  if print_info:
    print("acc:", np.mean(all_acc))

  # Move the copy off the device and free it before the t-SNE step.
  model_meta.to('cpu')
  del model_meta, optimizer
  gc.collect()
  torch.cuda.empty_cache()

  # Concatenate the per-batch arrays directly (no object-array round trip).
  features_np = np.concatenate(features).astype(np.float32)
  labels_np = np.concatenate(labels).astype(int)
  logits_np = np.concatenate(predi_logit).astype(np.float32)

  # Dimensionality reduction
  X_embedded = TSNE(n_components=2, learning_rate='auto',
                    init='random').fit_transform(features_np)

  return torch.tensor(logits_np), X_embedded, torch.tensor(labels_np), torch.tensor(features_np)

In [None]:
def wss_calc(logit, labels, trsh = 0.5):
  """Compute Work Saved over Sampling (WSS) statistics at a probability threshold.

  Args:
    logit: tensor of raw scores; a sigmoid is applied before thresholding.
    labels: binary ground-truth labels (0 / 1).
    trsh: probability threshold; scores ``>= trsh`` are predicted positive.

  Returns:
    dict with ``"wss"`` ((TN + FN) / total - (1 - recall)), ``"awss"``
    (TN / N - FN / P), ``"R"`` (recall) -- each rounded to 4 decimals -- and
    ``"CM"`` (the 2x2 confusion matrix).
  """
  # Threshold-based prediction
  predict_trash = torch.sigmoid(logit).squeeze() >= trsh

  # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
  # otherwise the 4-way unpacking below raises ValueError.
  CM = confusion_matrix(labels, predict_trash.to(int), labels=[0, 1])
  tn, fp, fne, tp = CM.ravel()

  P = tp + fne   # number of real positives
  N = tn + fp    # number of real negatives
  recall = tp / P

  # Original WSS
  wss_old = (tn + fne)/len(labels) - (1 - recall)

  # Adjusted WSS
  wss_new = tn/N - fne/P

  return {
      "wss": round(wss_old,4),
      "awss": round(wss_new,4),
      "R": round(recall,4),
      "CM": CM
      }

In [None]:
from sklearn.metrics import confusion_matrix
from torchmetrics import functional as fn
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import ipywidgets as widgets
from IPython.display import HTML, display, clear_output
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

def plot(logits, X_embedded, labels, tresh, show = True,
         namefig = "plot", make_plot = True, print_stats = True, save = True):
  """Compute, optionally print and optionally plot evaluation metrics.

  Metrics are computed twice: at the ROC threshold where recall first reaches
  0.95 (suffix ``@95``) and at the caller-supplied threshold ``tresh``
  (suffix ``@R``).

  Args:
    logits: tensor of raw scores; a sigmoid is applied to get probabilities.
    X_embedded: 2-D embedding of the samples, indexed as ``X_embedded[:, 0/1]``.
    labels: binary ground-truth labels.
    tresh: probability threshold for the ``@R`` metrics.
    show: call ``plt.show()`` on the figure.
    namefig: path passed to ``fig.savefig`` when ``save`` is true.
    make_plot: build the 4-panel figure (three scatter plots and the ROC curve).
    print_stats: print the metrics and both confusion matrices.
    save: save the figure (only relevant when ``make_plot`` is true).

  Returns:
    dict with WSS, AWSS, recall, accuracy and F1 at both thresholds plus the
    recall-0.95 threshold itself (``"treshould@95"``).
  """
  col = pd.MultiIndex.from_tuples([("Predict", "0"), ("Predict", "1")])
  index = pd.MultiIndex.from_tuples([("Real", "0"), ("Real", "1")])

  predict = torch.sigmoid(logits).detach().clone()

  fpr, tpr, thresholds = roc_curve(labels, predict.squeeze())

  # --- Metrics at recall 0.95 ---
  # tpr is non-decreasing, so the number of points below 0.95 is the index of
  # the first ROC point whose recall reaches 0.95.
  idx_wss95 = sum(tpr < 0.95)
  thresholds95 = thresholds[idx_wss95]

  wss95_info = wss_calc(logits,labels, thresholds95 )
  acc_wss95 = fn.accuracy(predict, labels, threshold=thresholds95)
  f1_wss95 = fn.f1_score(predict, labels, threshold=thresholds95)

  # --- Metrics at the requested threshold ---
  wss_info = wss_calc(logits,labels, tresh )
  acc_wssR = fn.accuracy(predict, labels, threshold=tresh)
  f1_wssR = fn.f1_score(predict, labels, threshold=tresh)

  metrics= {
      # WSS
      "WSS@95": wss95_info['wss'],
      "AWSS@95": wss95_info['awss'],
      "WSS@R": wss_info['wss'],
      "AWSS@R": wss_info['awss'],
      # Recall
      "Recall_WSS@95": wss95_info['R'],
      "Recall_WSS@R": wss_info['R'],
      # Accuracy
      "acc@95": acc_wss95.item(),
      "acc@R": acc_wssR.item(),
      # F1
      "f1@95": f1_wss95.item(),
      "f1@R": f1_wssR.item(),
      # Threshold at recall 0.95
      "treshould@95": thresholds95
  }

  if print_stats:
    print(f"WSS@95:{wss95_info['wss']}, R: {wss95_info['R']}")
    # Label matches the "AWSS@95" metric key (the previous text was a typo).
    print(f"AWSS@95:{wss95_info['awss']}")
    print('Acc.:', round(acc_wss95.item(), 4))
    print('F1-score:', round(f1_wss95.item(), 4))
    print(f"Treshold to wss95: {round(thresholds95, 4)}")
    cm = pd.DataFrame(wss95_info['CM'],
              index=index,
              columns=col)

    print("\nConfusion matrix:")
    print(cm)
    print("\n---Metrics with threshold:", tresh, "----\n")
    print(f"WSS@R:{wss_info['wss']}, R: {wss_info['R']}")
    print(f"AWSS@R:{wss_info['awss']}")
    print('Acc.:', round(acc_wssR.item(), 4))
    print('F1-score:', round(f1_wssR.item(), 4))
    cm = pd.DataFrame(wss_info['CM'],
                index=index,
                columns=col)

    print("\nConfusion matrix:")
    print(cm)

  if make_plot:

    fig, axes = plt.subplots(1, 4, figsize=(25,10))
    # Point transparency encodes the predicted probability.
    alpha = torch.squeeze(predict).numpy()

    # Panel 1: every sample
    sns.scatterplot(x=X_embedded[:, 0],
                  y=X_embedded[:, 1],
                  hue=labels,
                  alpha=alpha, ax = axes[0]).set_title('Predictions-TSNE')

    # Panel 2: samples predicted positive at the recall-0.95 threshold
    t_wss = (predict >= thresholds95).squeeze().numpy()

    sns.scatterplot(x=X_embedded[t_wss, 0],
                  y=X_embedded[t_wss, 1],
                  hue=labels[t_wss],
                  alpha=alpha[t_wss], ax = axes[1]).set_title('WSS@95')

    # Panel 3: samples predicted positive at the requested threshold
    t = (predict >= tresh).squeeze().numpy()

    sns.scatterplot(x=X_embedded[t, 0],
                  y=X_embedded[t, 1],
                  hue=labels[t],
                  alpha=alpha[t], ax = axes[2]).set_title(f'Predictions-Treshold {tresh}')

    # Panel 4: ROC curve with the 0.95 recall line
    roc_auc = auc(fpr, tpr)
    lw = 2

    axes[3].plot(
      fpr,
      tpr,
      color="darkorange",
      lw=lw,
      label="ROC curve (area = %0.2f)" % roc_auc)

    axes[3].plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    axes[3].axhline(y=0.95, color='r', linestyle='-')
    axes[3].set(xlabel="False Positive Rate", ylabel="True Positive Rate", title= "ROC")
    axes[3].legend(loc="lower right")

    if show:
      plt.show()

    if save:
      fig.savefig(namefig, dpi=fig.dpi)

    # Figures that are not shown would otherwise stay open and pile up when
    # this function is called in a loop.
    if not show:
      plt.close(fig)

  return metrics

def auc_plot(logits,labels, color = "darkorange", label = "test"):
    """Draw one ROC curve on the current matplotlib axes.

    The sigmoid of ``logits`` is scored against ``labels``; the curve legend is
    ``label`` followed by the AUC rounded to two decimals. The diagonal chance
    line and a horizontal line at recall 0.95 are drawn as well.
    """
    scores = torch.sigmoid(logits).detach().clone()
    fpr, tpr, _ = roc_curve(labels, scores.squeeze())
    area = auc(fpr, tpr)

    legend_text = label + str(round(area,2))

    # ROC curve of this model
    plt.plot(fpr, tpr, color=color, lw=2, label=legend_text)
    # Chance diagonal and target-recall reference line
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.axhline(y=0.95, color='r', linestyle='-')



In [None]:
from sklearn.metrics import confusion_matrix
from torchmetrics import functional as fn
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import ipywidgets as widgets
from IPython.display import HTML, display, clear_output
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


class diagnosis():
  """Interactive notebook panel to fine-tune and evaluate the model one domain at a time.

  Calling the instance shows a "Next" button. Each click on "Next" selects the
  following entry of ``names`` and the matching rows of ``Valid_resource``;
  "Train" fine-tunes a copy of the model on a sample of that domain and
  "Evaluation" prints and plots the metrics at the threshold typed in the
  ``tresh`` box.

  Args:
    names: domain names, matched against ``Valid_resource['domain']``.
    Valid_resource: DataFrame with at least ``domain`` and ``label`` columns.
    batch_size_test: batch size of the test DataLoader.
    model: model handed to ``train_loop``.
    Info: configuration dict (``inner_batch_size``, ``max_seq_length``,
      ``tokenizer``, ``inner_update_step``, ``inner_update_lr``, ``tresh``).
    start: index in ``names`` of the first domain to show.
    device: device used for training; when ``None`` the module-level
      ``device`` variable is used.
  """
  def __init__(self, names, Valid_resource, batch_size_test, model,Info,start = 0, device = None):
    self.names=names
    self.Valid_resource=Valid_resource
    self.batch_size_test=batch_size_test
    self.model=model
    self.Info=Info
    self.start=start
    self.device=device

    # Threshold box; starts at the configured threshold so the first
    # evaluation matches Info['tresh'].
    self.value_trash = widgets.FloatText(
        value=Info.get('tresh', 0.95),
        description='tresh',
        disabled=False
    )

    # Number of training samples drawn per class.
    self.valueb = widgets.IntText(
        value=10,
        description='size',
        disabled=False
    )

    self.train_b = widgets.Button(description="Train")
    self.next_b = widgets.Button(description="Next")
    self.eval_b = widgets.Button(description="Evaluation")

    self.hbox = widgets.HBox([self.train_b, self.valueb])

    self.next_b.on_click(self.Next_button)
    self.train_b.on_click(self.Train_button)
    self.eval_b.on_click(self.Evaluation_button)


  # Next button
  def Next_button(self,p):
    """Advance to the next domain and show its label counts."""
    clear_output()

    # Stay on the last domain instead of raising IndexError past the end.
    if self.i + 1 >= len(self.names):
      print("End")
      display(self.next_b)
      return

    self.i=self.i+1

    self.domain = self.names[self.i]
    print("Name:", self.domain)

    self.data = self.Valid_resource[self.Valid_resource['domain'] == self.domain]
    print(self.data['label'].value_counts())

    display(self.hbox)
    display(self.next_b)

  # Train button
  def Train_button(self, y):
    """Sample the current domain, fine-tune the model and keep the test outputs."""
    clear_output()
    print(self.domain)

    info = self.Info

    # Preparing data for training
    self.data_train_loader, self.data_test_loader, self.data_train, self.data_test = prepare_data(self.data,
              train_size_per_class = self.valueb.value,
              batch_size = {'train': info['inner_batch_size'],
                            'test': self.batch_size_test},
              max_seq_length = info['max_seq_length'],
              tokenizer = info['tokenizer'],
              input = "text",
              output = "label")

    # Fall back to the module-level `device` when none was given.
    run_device = self.device if self.device is not None else device

    self.logits, self.X_embedded, self.labels, self.features = train_loop(self.data_train_loader, self.data_test_loader,
                                                        self.model, run_device,
                                                        epoch = info['inner_update_step'],
                                                        lr=info['inner_update_lr'],
                                                        print_info=True,
                                                        name = self.domain)

    tresh_box = widgets.HBox([self.eval_b, self.value_trash])
    display(self.hbox)
    display(tresh_box)
    display(self.next_b)

  # Evaluation button
  def Evaluation_button(self, te):
    """Print the train/test label counts and plot metrics at the chosen threshold."""
    clear_output()
    tresh_box = widgets.HBox([self.eval_b, self.value_trash])

    print(self.domain)
    print("-------Train data-------")
    print(self.data_train['label'].value_counts())
    print("-------Test data-------")
    print(self.data_test['label'].value_counts())

    display(self.next_b)
    display(tresh_box)
    display(self.hbox)

    # The threshold comes from the widget so edits in the box take effect.
    self.metrics = plot(self.logits, self.X_embedded, self.labels,
                    tresh=self.value_trash.value, show = True,
                    namefig= 'test',
                  make_plot = True,
                  print_stats = True,
                  save=False)

  def __call__(self):
    """Reset the position to ``start`` and show the "Next" button."""
    # One before `start`, because Next_button increments before reading.
    self.i= self.start-1

    clear_output()
    display(self.next_b)

## Simulation attempts

In [None]:
from collections import defaultdict

def pipeline_simpulation(Valid_resource, names_to_valid, path_save, model, Info, n_attempt = 5):
  """Run repeated fine-tune/evaluate attempts for every domain and save the results.

  For each name in ``names_to_valid`` the rows of ``Valid_resource`` with that
  ``domain`` are sampled, trained on and evaluated ``n_attempt`` times. After
  every attempt the accumulated metrics are written to ``metrics.csv`` and the
  ROC points to ``roc_stats.json``; plots go to ``<domain>/img/``. At the end
  the run configuration is written to ``info.json``.

  Args:
    Valid_resource: DataFrame with ``domain``, ``text`` and ``label`` columns.
    names_to_valid: domain names (file names; ``.csv`` is stripped for paths).
    path_save: output folder prefix; concatenated as a string, so it should end
      with a path separator.
    model: model handed to ``train_loop``.
    Info: configuration dict; must contain ``tokenizer`` and ``bert_layers``
      among the training settings.
    n_attempt: number of attempts per domain.

  Relies on the module-level names ``device`` and ``initializer_model``.
  """
  def _domain_name(file_name):
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    return re.sub(r"\.csv", "", file_name)

  # Output folders, one per domain
  for name in names_to_valid:
    Path(path_save + _domain_name(name) + "/img").mkdir(parents=True, exist_ok=True)

  # roc_stats[domain][bert_layers]["fpr" | "tpr"] -> one list per attempt
  roc_stats = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
  layer_key = str(Info['bert_layers'])

  all_metrics = []
  for name in names_to_valid:
    data = Valid_resource[Valid_resource['domain'] == name].reset_index().drop("index", axis=1)
    name_domain = _domain_name(name)

    for attempt in range(n_attempt):

      data_train_loader, data_test_loader,  _ , _ = prepare_data(data,
                train_size_per_class = Info['k_spt'],
                batch_size = {'train': Info['inner_batch_size'],
                              'test': 100},
                max_seq_length = Info['max_seq_length'],
                tokenizer = Info['tokenizer'],
                input = "text",
                output = "label")

      print("---"*4,"attempt", attempt, "---"*4)
      logits, X_embedded, labels, features = train_loop(data_train_loader, data_test_loader,
                                                        model, device,
                                                        epoch = Info['inner_update_step'],
                                                        lr=Info['inner_update_lr'],
                                                        print_info=False,
                                                        name = name)

      metrics = plot(logits, X_embedded, labels,
                    tresh=Info['tresh'], show = False,
                    namefig= path_save  + name_domain + "/img/" + str(attempt) + 'plots',
        make_plot = True, print_stats = False, save =  True)

      fpr, tpr, _ = roc_curve(labels, torch.sigmoid(logits).squeeze())

      metrics['name'] = name_domain
      metrics['layer_size'] = Info['bert_layers']
      metrics['attempt'] = attempt
      roc_stats[name_domain][layer_key]['fpr'].append(fpr.tolist())
      roc_stats[name_domain][layer_key]['tpr'].append(tpr.tolist())
      all_metrics.append(metrics)

      # Rewritten after every attempt so partial results survive an interruption.
      pd.DataFrame(all_metrics).to_csv(path_save+ "metrics.csv")

      roc_path =  path_save + "roc_stats.json"
      with open(roc_path, 'w') as fp:
          json.dump(roc_stats, fp)

      # Drop the large per-attempt outputs before the next attempt allocates its own.
      del logits, X_embedded, labels, features

  # Save the run configuration without its non-serializable entries.
  save_info = Info.copy()
  save_info['model'] = initializer_model.tokenizer.name_or_path
  save_info.pop("tokenizer")
  save_info.pop("bert_layers")

  info_path =  path_save+"info.json"
  with open(info_path, 'w') as fp:
      json.dump(save_info, fp)

## Statistics

In [None]:
# Loading dataset statistics
def load_data_statistics(paths):
  """Summarize the CSV datasets listed in ``paths``.

  Each file is read with pandas and rows containing any missing value are
  dropped before counting.

  Args:
    paths: iterable of CSV file paths; every file needs a ``labels`` column
      holding 0 / 1.

  Returns:
    DataFrame with one row per file and the columns ``size`` (rows kept),
    ``pos`` (labels equal to 1), ``neg`` (labels equal to 0), ``names``
    (file name) and ``paths``.
  """
  paths = list(paths)
  size = []
  pos = []
  neg = []
  for p in paths:
    data = pd.read_csv(p).dropna()
    counts = data['labels'].value_counts()
    # Dataset size
    size.append(len(data))
    # .get(..., 0): a file without one of the classes counts as zero
    # instead of raising KeyError.
    pos.append(int(counts.get(1, 0)))
    neg.append(int(counts.get(0, 0)))

  return pd.DataFrame({
      "size":size,
      "pos":pos,
      "neg":neg,
      # Derived from `paths` (not from a module-level list) so the columns
      # always have matching lengths.
      "names":[os.path.basename(p) for p in paths],
      "paths": paths })

# Loading the datasets
def load_data(train_info_load):
  """Load and stack the CSV datasets listed in ``train_info_load['paths']``.

  Each file must provide the columns ``labels``, ``title`` and ``abstract``.

  Returns:
    DataFrame with the columns ``text`` (title followed by abstract, a missing
    abstract counting as empty), ``domain`` (file name) and ``label``
    (``"negative"`` for 0, ``"positive"`` for 1). The per-file row index is
    kept, so index values repeat across files.
  """
  col = ['abstract','title', 'labels', 'domain']

  frames = []
  for p in train_info_load['paths']:
    data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']]
    data_temp['domain'] = os.path.basename(p)
    frames.append(data_temp)

  # Concatenate once instead of growing a frame inside the loop.
  data_train = pd.concat(frames) if frames else pd.DataFrame(columns=col)

  data_train['text'] = data_train['title'] + data_train['abstract'].fillna('')

  return (data_train
            .replace({"labels":{0:"negative", 1:'positive'}})
            .rename({"labels":"label"} , axis=1)
            .loc[:, ["text","domain","label"]]
        )

# Importing data

The 64 proposed topic-agnostic labeled datasets can be downloaded and unpacked below:

In [None]:
!wget -O data.zip https://www.dropbox.com/sh/or0eyfo8znyu2kp/AABxXJVII48U0vY8TT3Bbp6Ea?dl=0
!unzip data.zip

In [None]:
import glob
import os

# Folder searched for the datasets.
path = 'SLR_data'
# Every CSV file below that folder, searched recursively.
paths = glob.glob(f"{path}/**/*.csv", recursive=True)
pprint(paths)

# File name (with extension) of each dataset, in the same order as `paths`.
names = [os.path.basename(p) for p in paths]

## Evaluating text data

In [None]:
#@title Untreated text data { display-mode: "form" }

import pandas as pd
import ipywidgets as widgets
from IPython.display import HTML, display, clear_output
from pprint import pprint
import unicodedata

# Loading data
def handle_data(path):
  """Print the label, length and NFKD-normalized abstract of one random row of the CSV at ``path``."""
  frame = pd.read_csv(path)
  print()

  # One randomly chosen row
  row = frame.sample(1)
  raw_abstract = row['abstract'].values[0]
  label = row['labels'].values[0]

  # Only Unicode normalization is applied here; no further cleaning
  new_text = unicodedata.normalize("NFKD", str(raw_abstract))

  print(f"Label = {label}")
  print(f"len(text) = {len(new_text)}")
  pprint(new_text)
  print()
# Position in `paths` / `names` of the file being shown; updated by the button callbacks below.
i=0

# Next button
def next_button(p):
  """Click handler: move to the next file and print a random sample from it.

  On any error (for example when the position runs past the last file) the
  error is printed, the position is set to ``len(paths)`` and 'End' is shown.
  """
  global i, ref
  i += 1
  try:
    clear_output()
    display(hbox)
    print(f"File: {names[i]}")
    handle_data(paths[i])
    # Remember the path of the file currently on screen
    ref = paths[i]
    print(f"path: {paths[i]}")
  except Exception as err:
    print(err)
    i = len(paths)
    print('End')

# Previous Button
def prev_button(p):
  """Click handler: move to the previous file and print a random sample from it.

  On any error the error is printed, the position is reset to 0 and 'End' is
  shown.
  """
  global i, ref
  i -= 1
  try:
    clear_output()
    display(hbox)
    print(f"File: {names[i]}")
    handle_data(paths[i])
    # Remember the path of the file currently on screen
    ref = paths[i]
    print(f"path: {paths[i]}")
  except Exception as err:
    print(err)
    i = 0
    print('End')

# Navigation buttons for browsing the dataset files.
next_b = widgets.Button(description="Next")
previous_b = widgets.Button(description="Previous")

# Show the two buttons side by side.
hbox = widgets.HBox([previous_b, next_b])
display(hbox)

# Attach the click handlers defined above.
next_b.on_click(next_button)
previous_b.on_click(prev_button)

### Treated text data


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import unicodedata
import re

# Regex multiple replace function
def multiple_replace(dict, text):
  """Replace every occurrence of each key of ``dict`` in ``text`` with its value.

  All keys are matched literally in a single pass; when several keys could
  match at the same position, the one listed first in ``dict`` wins.
  An empty ``dict`` returns ``text`` unchanged.
  """
  # An empty mapping would compile to "()", which matches the empty string
  # everywhere and then fails the lookup with KeyError.
  if not dict:
    return text

  # Building regex from dict keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # Substitution: look up the matched text in the mapping
  return regex.sub(lambda mo: dict[mo.group(0)], text)

# Undesirable patterns within texts
# Section headings, markup fragments and symbols to strip from the texts.
# The duplicated 'CLINICAL RELEVANCE:' entry of the previous dict literal is
# listed once; the resulting mapping is unchanged.
_UNDESIRABLE_PATTERNS = (
    'CONCLUSIONS AND IMPLICATIONS',
    'BACKGROUND AND PURPOSE',
    'EXPERIMENTAL APPROACH',
    'KEY RESULTS AEA',
    '©',
    '®',
    'μ',
    '(C)',
    'OBJECTIVE:',
    'MATERIALS AND METHODS:',
    'SIGNIFICANCE:',
    'BACKGROUND:',
    'RESULTS:',
    'METHODS:',
    'CONCLUSIONS:',
    'AIM:',
    'STUDY DESIGN:',
    'CLINICAL RELEVANCE:',
    'CONCLUSION:',
    'HYPOTHESIS:',
    'Questions/Purposes:',
    'Introduction:',
    'PURPOSE:',
    'PATIENTS AND METHODS:',
    'FINDINGS:',
    'INTERPRETATIONS:',
    'FUNDING:',
    'PROGRESS:',
    'CONTEXT:',
    'MEASURES:',
    'DESIGN:',
    'BACKGROUND AND OBJECTIVES:',
    '<p>',
    '</p>',
    '<<ETX>>',
    '+/-',
    )

# Keys are lowercased; every pattern maps to the empty string (i.e. removal).
patterns = {phrase.lower(): '' for phrase in _UNDESIRABLE_PATTERNS}

In [None]:
#@title Treated text data { display-mode: "form" }

import pandas as pd
import ipywidgets as widgets
from IPython.display import HTML, display, clear_output
import unicodedata
import re

# undesirable patterns within texts
# Section headings, markup fragments and symbols to strip from the texts.
# The duplicated 'CLINICAL RELEVANCE:' entry of the previous dict literal is
# listed once; the resulting mapping is unchanged.
_UNDESIRABLE_PATTERNS = (
    'CONCLUSIONS AND IMPLICATIONS',
    'BACKGROUND AND PURPOSE',
    'EXPERIMENTAL APPROACH',
    'KEY RESULTS AEA',
    '©',
    '®',
    'μ',
    '(C)',
    'OBJECTIVE:',
    'MATERIALS AND METHODS:',
    'SIGNIFICANCE:',
    'BACKGROUND:',
    'RESULTS:',
    'METHODS:',
    'CONCLUSIONS:',
    'AIM:',
    'STUDY DESIGN:',
    'CLINICAL RELEVANCE:',
    'CONCLUSION:',
    'HYPOTHESIS:',
    'Questions/Purposes:',
    'Introduction:',
    'PURPOSE:',
    'PATIENTS AND METHODS:',
    'FINDINGS:',
    'INTERPRETATIONS:',
    'FUNDING:',
    'PROGRESS:',
    'CONTEXT:',
    'MEASURES:',
    'DESIGN:',
    'BACKGROUND AND OBJECTIVES:',
    '<p>',
    '</p>',
    '<<ETX>>',
    '+/-',
    )

# Keys are lowercased because the text is lowercased before matching;
# every pattern maps to the empty string (i.e. removal).
patterns = {phrase.lower(): '' for phrase in _UNDESIRABLE_PATTERNS}

def multiple_replace(dict, text):
  """Replace every occurrence of each key of ``dict`` in ``text`` with its value.

  All keys are matched literally in a single pass; when several keys could
  match at the same position, the one listed first in ``dict`` wins.
  An empty ``dict`` returns ``text`` unchanged.
  """
  # An empty mapping would compile to "()", which matches the empty string
  # everywhere and then fails the lookup with KeyError.
  if not dict:
    return text

  # Create a regular expression from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look up the corresponding value in the dictionary
  return regex.sub(lambda mo: dict[mo.group(0)], text)

# Text processing
def treat_text(text):
  """Normalize and clean one text before tokenization.

  Steps, in order: NFKD Unicode normalization, lowercasing and removal of the
  entries of ``patterns``, removal of parenthesized / bracketed spans, isolated
  digits, angle brackets and hyphen-space pairs, collapsing of repeated spaces
  and doubled commas, and spelling out percentages as " percent".

  ``text`` is converted with ``str()`` first, so non-string values are accepted.
  """
  text = unicodedata.normalize("NFKD",str(text))
  text = multiple_replace(patterns,text.lower())
  # Raw strings: "\(", "\[" and "\d" are invalid escapes in normal literals.
  # NOTE: ".+" is greedy, so everything between the first "(" and the last ")"
  # on a line is removed (same for square brackets).
  text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
  text = re.sub(r'( +)',' ', text)
  text = re.sub(r'(, ,)|(,,)',',', text)
  text = re.sub(r'(%)|(per cent)',' percent', text)
  return text

def handle_data(path):
  """Print the label, length and cleaned abstract of one random row of the CSV at ``path``."""
  frame = pd.read_csv(path)
  print()

  # One randomly chosen row
  row = frame.sample(1)
  raw_abstract = row['abstract'].values[0]
  label = row['labels'].values[0]

  # Full cleaning pipeline
  new_text = treat_text(raw_abstract)

  print(f"Label = {label}")
  print(f"len(text) = {len(new_text)}")
  pprint(new_text)
  print()

# Position in `paths` / `names` of the file being shown; updated by the button callbacks below.
i=0

# Next button
def next_button(p):
  """Click handler: move to the next file and print a random cleaned sample from it.

  On any error (for example when the position runs past the last file) the
  error is printed, the position is set to ``len(paths)`` and 'End' is shown.
  """
  global i, ref
  i += 1
  try:
    clear_output()
    display(hbox)
    print(f"File: {names[i]}")
    handle_data(paths[i])
    # Remember the path of the file currently on screen
    ref = paths[i]
    print(f"path: {paths[i]}")
  except Exception as err:
    print(err)
    i = len(paths)
    print('End')

# Previous Button
def prev_button(p):
  """Click handler: move to the previous file and print a random cleaned sample from it.

  On any error the error is printed, the position is reset to 0 and 'End' is
  shown.
  """
  global i, ref
  i -= 1
  try:
    clear_output()
    display(hbox)
    print(f"File: {names[i]}")
    handle_data(paths[i])
    # Remember the path of the file currently on screen
    ref = paths[i]
    print(f"path: {paths[i]}")
  except Exception as err:
    print(err)
    i = 0
    print('End')

# Navigation buttons for browsing the dataset files.
next_b = widgets.Button(description="Next")
previous_b = widgets.Button(description="Previous")

# Show the two buttons side by side.
hbox = widgets.HBox([previous_b, next_b])
display(hbox)

# Attach the click handlers defined above.
next_b.on_click(next_button)
previous_b.on_click(prev_button)

# Downloading pre-trained model


In [None]:
# Fetching pre-trained model and tokenizer

class initializer:
  """Load a pre-trained model and its tokenizer by name.

  ``config`` must provide, under the keys ``"model"`` and ``"tokenizer"``,
  two classes exposing ``from_pretrained``. The loaded objects are stored as
  ``self.model`` and ``self.tokenizer``; the name is kept in ``self.MODEL_NAME``.
  """
  def __init__(self, MODEL_NAME, **config):
    self.MODEL_NAME = MODEL_NAME

    model_cls = config.get("model")
    tokenizer_cls = config.get("tokenizer")

    # Model: dict-style outputs, attention weights not returned
    self.model = model_cls.from_pretrained(
        MODEL_NAME,
        return_dict=True,
        output_attentions=False,
    )

    # Tokenizer: lowercases its input
    self.tokenizer = tokenizer_cls.from_pretrained(
        MODEL_NAME,
        do_lower_case=True,
    )

In [None]:
# Model and tokenizer of choice
# Classes used by `initializer` to load the model and the tokenizer.
config = {
    "model": AutoModelForSequenceClassification,
    "tokenizer": AutoTokenizer
     }

# Loads the pre-trained uncased SciBERT model and tokenizer.
initializer_model = initializer('allenai/scibert_scivocab_uncased', **config)

# Model

Functions based on the repository: https://github.com/mailong25/meta-learning-bert

## Domain Learner

In [None]:
# Pre-trained model
class Encoder(nn.Module):
  """Wrap a copy of a pre-trained model, keeping only selected encoder layers.

  Args:
    layers: iterable of indices into ``model.encoder.layer``; the kept layers
      are applied in the order given.
    freeze_bert: when true, the parameters of the copied model are excluded
      from gradient updates.
    model: pre-trained model exposing ``encoder.layer`` and returning a mapping
      with a ``'pooler_output'`` entry. It is deep-copied, never modified.
  """
  def __init__(self, layers, freeze_bert, model):
    super(Encoder, self).__init__()

    # Dummy parameter (empty tensor registered on the module).
    self.dummy_param = nn.Parameter(torch.empty(0))

    # Private copy of the pre-trained model
    self.model = deepcopy(model)

    # Freezing means turning gradients OFF. The previous code assigned
    # `freeze_bert` itself (True), which left every parameter trainable.
    if freeze_bert:
      for param in self.model.parameters():
        param.requires_grad = False

    # Keep only the requested hidden layers of the pre-trained model
    all_layers = self.model.encoder.layer
    self.model.encoder.layer = nn.ModuleList([all_layers[i] for i in layers])

  # Feed forward
  def forward(self, **x):
    """Forward the keyword inputs to the wrapped model and return its pooled output."""
    return self.model(**x)['pooler_output']

# Complete model
class SLR_Classifier(nn.Module):
  """Binary classifier: pre-trained encoder -> 200-d feature map -> single logit.

  Keyword arguments (all optional except ``model``):
    model: pre-trained model handed to ``Encoder``.
    bert_layers: encoder layer indices to keep (default ``range(12)``).
    freeze_bert: forwarded to ``Encoder`` (default ``False``).
    pos_weight: weight of the positive class in the loss (default ``2.5``).
    drop: dropout probability after the feature projection (default ``0.5``).
  """
  def __init__(self, **data):
    super(SLR_Classifier, self).__init__()

    # Dummy parameter (empty tensor registered on the module).
    self.dummy_param = nn.Parameter(torch.empty(0))

    # Loss function: binary cross entropy on logits, averaged over the batch
    self.loss_fn = nn.BCEWithLogitsLoss(reduction = 'mean',
                                        pos_weight=torch.FloatTensor([data.get("pos_weight",  2.5)]))

    # Pre-trained model
    self.Encoder = Encoder(layers = data.get("bert_layers",  range(12)),
                           freeze_bert = data.get("freeze_bert",  False),
                           model = data.get("model"),
                           )

    hidden_size = self.Encoder.model.config.hidden_size

    # Feature map layer: normalization, projection to 200 dimensions, dropout
    self.feature_map = nn.Sequential(
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, 200),
            nn.Dropout(data.get("drop", 0.5)),
        )

    # Classifier layer: non-linearity followed by a single output logit
    self.classifier = nn.Sequential(
            nn.Tanh(),
            nn.Linear(200, 1)
        )

    # The projection starts close to zero (tiny normal weights, zero bias)
    nn.init.normal_(self.feature_map[1].weight, mean=0, std=0.00001)
    nn.init.zeros_(self.feature_map[1].bias)

  # Feed forward
  def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
    """Run the classifier.

    Returns ``[loss, [feature, logit], predict]`` where ``predict`` is the
    sigmoid of ``logit``. ``loss`` is ``None`` when ``labels`` is not given.
    """
    predict = self.Encoder(**{"input_ids":input_ids,
                              "attention_mask":attention_mask,
                              "token_type_ids":token_type_ids})
    feature = self.feature_map(predict)
    logit = self.classifier(feature)

    predict = torch.sigmoid(logit)

    loss = None
    if labels is not None:
      # reshape(-1, 1) lines the labels up with the (batch, 1) logits whether
      # they arrive as a 0-d tensor (batch of one, squeezed), (batch,) or
      # (batch, 1); unsqueeze(1) only worked for the (batch,) case.
      target = labels.to(torch.float).reshape(-1, 1)
      loss = self.loss_fn(logit.to(torch.float), target)

    return [loss, [feature, logit], predict]

## Meta Learner

In [None]:
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from transformers import BertForSequenceClassification
from copy import deepcopy
import gc
from sklearn.metrics import accuracy_score
import torch
import numpy as np
import torchmetrics
from torchmetrics import functional as fn

class Learner(nn.Module):
    """Meta-learner that adapts a copy of the wrapped model to each task.

    Every task gets a deep copy of ``model`` that is fine-tuned on the task's
    support set and scored on its query set.  When training, the difference
    between the meta weights and the adapted weights is averaged over the
    tasks and handed to the outer optimizer in place of a gradient.
    """

    def __init__(self, **args):
        """
        :param args: keyword configuration. Keys read here: ``inner_print``
            (print the inner loss every N inner epochs), ``inner_batch_size``,
            ``outer_update_lr``, ``inner_update_lr``, ``inner_update_step``
            (inner epochs when training), ``inner_update_step_eval`` (inner
            epochs when evaluating), ``model`` and ``device``.
        """
        super().__init__()

        self.inner_print = args.get('inner_print')
        self.inner_batch_size = args.get('inner_batch_size')
        self.outer_update_lr = args.get('outer_update_lr')
        self.inner_update_lr = args.get('inner_update_lr')
        self.inner_update_step = args.get('inner_update_step')
        self.inner_update_step_eval = args.get('inner_update_step_eval')
        self.model = args.get('model')
        self.device = args.get('device')

        # Outer optimizer
        self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
        self.model.train()

    def forward(self, batch_tasks, training = True, valid_train = True):
        """Run one meta step over a batch of tasks.

        :param batch_tasks: sequence of ``(support, query, name)`` triples;
            ``support`` and ``query`` are datasets yielding
            ``(input_ids, attention_mask, token_type_ids, label_id)``.
        :param training: when True, use ``inner_update_step`` inner epochs and
            update the meta model; otherwise use ``inner_update_step_eval``
            and leave the meta model untouched.
        :param valid_train: when True, print per-task progress and return the
            mean query accuracy; otherwise return ``np.array(0)``.
        :return: mean query accuracy over the tasks, or ``np.array(0)``.
        """
        task_accs = []
        task_f1 = []
        task_recall = []
        sum_gradients = []
        num_task = len(batch_tasks)
        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval

        # Outer loop tasks
        for task_id, task in enumerate(batch_tasks):
            support, query, name = task[0], task[1], task[2]

            # Each task adapts its own copy, so the meta weights stay untouched
            fast_model = deepcopy(self.model)
            fast_model.to(self.device)

            # Inner trainer optimizer
            inner_optimizer = Adam(fast_model.parameters(), lr=self.inner_update_lr)

            # A trailing batch holding exactly one example is dropped
            # (presumably because batch-norm layers cannot train on a single
            # sample; confirm against the model in use).
            drop_last = len(support) % self.inner_batch_size == 1
            support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                            batch_size=self.inner_batch_size,
                                            drop_last=drop_last)

            fast_model.train()

            if valid_train:
                print('----Task',task_id,":", name, '----')

            # Inner loop training epochs (support set)
            for i in range(num_inner_update_step):
                all_loss = []

                for batch in support_dataloader:
                    batch = tuple(t.to(self.device) for t in batch)
                    input_ids, attention_mask, token_type_ids, label_id = batch

                    loss, _, _ = fast_model(input_ids, attention_mask,
                                            token_type_ids=token_type_ids, labels = label_id)

                    loss.backward()
                    inner_optimizer.step()
                    inner_optimizer.zero_grad()

                    all_loss.append(loss.item())

                    # Release the batch eagerly to keep accelerator memory low
                    del batch, input_ids, attention_mask, token_type_ids, label_id
                    torch.cuda.empty_cache()

                # inner_print may be unset (None); skip reporting in that case
                if valid_train and self.inner_print and (i + 1) % self.inner_print == 0:
                    print("Inner Loss: ", np.mean(all_loss))

            # The weight delta is taken on the CPU
            # (assumes the meta model itself lives on the CPU; TODO confirm).
            fast_model.to(torch.device('cpu'))

            if training:
                # The delta is a plain tensor: no autograd graph is needed for it
                with torch.no_grad():
                    pairs = zip(self.model.parameters(), fast_model.parameters())
                    for i, (meta_params, fast_params) in enumerate(pairs):
                        gradient = meta_params - fast_params
                        if task_id == 0:
                            sum_gradients.append(gradient)
                        else:
                            sum_gradients[i] += gradient

            # Inner test (query set)
            fast_model.to(self.device)
            fast_model.eval()

            with torch.no_grad():
                # The whole query set is scored as a single batch
                query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
                query_batch = next(iter(query_dataloader))
                query_batch = tuple(t.to(self.device) for t in query_batch)
                q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch

                _, _, pre_label_id = fast_model(q_input_ids, q_attention_mask,
                                                q_token_type_ids, labels = q_label_id)

                # Predictions and labels go to the CPU for the metric functions
                pre_label_id = pre_label_id.detach().cpu().squeeze()
                q_label_id = q_label_id.detach().cpu()

                acc = fn.accuracy(pre_label_id, q_label_id).item()
                recall = fn.recall(pre_label_id, q_label_id).item()
                f1 = fn.f1_score(pre_label_id, q_label_id).item()

                task_accs.append(acc)
                task_f1.append(f1)
                task_recall.append(recall)

                fast_model.to(torch.device('cpu'))

            del fast_model, inner_optimizer
            torch.cuda.empty_cache()

        print("\n")
        print("f1:",np.mean(task_f1))
        print("recall:",np.mean(task_recall))

        # Updating outer training parameters
        if training:
            # The task-averaged weight delta stands in for the gradient
            for params, summed in zip(self.model.parameters(), sum_gradients):
                params.grad = summed / float(num_task)

            self.outer_optimizer.step()
            self.outer_optimizer.zero_grad()

            del sum_gradients
            gc.collect()
            torch.cuda.empty_cache()

        if valid_train:
            return np.mean(task_accs)
        return np.array(0)

## Task Loader


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

LABEL_MAP = {'negative': 0, 'positive': 1}

class SLR_DataSet(Dataset):
  """Dataset that tokenizes one row of a DataFrame per item.

  Keyword arguments read by the constructor:
    tokenizer: object exposing ``encode_plus``.
    data: DataFrame holding the text and label columns.
    max_seq_length: padded/truncated token length (default 512).
    input: name of the text column (default ``'x'``).
    output: name of the label column (default ``'y'``); its values must be
      keys of ``LABEL_MAP``.
  """

  def __init__(self, **args):
    self.tokenizer = args.get('tokenizer')
    self.data = args.get('data')
    self.max_seq_length = args.get("max_seq_length", 512)
    self.INPUT_NAME = args.get("input", 'x')
    self.LABEL_NAME = args.get("output", 'y')

  # Tokenizing and processing text
  def encode_text(self, example):
    """Return ``(input_ids, attention_mask, token_type_ids, label)`` tensors for one row."""
    comment_text = self.treat_text(example[self.INPUT_NAME])
    labels = LABEL_MAP[example[self.LABEL_NAME]]

    # NOTE(review): the (text, prompt) tuple is passed as a single argument;
    # presumably the tokenizer in use treats it as a sentence pair. Confirm.
    encoding = self.tokenizer.encode_plus(
      (comment_text, "It is great text"),
      add_special_tokens=True,
      max_length=self.max_seq_length,
      return_token_type_ids=True,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return tuple((
      encoding["input_ids"].flatten(),
      encoding["attention_mask"].flatten(),
      encoding["token_type_ids"].flatten(),
      torch.tensor([labels], dtype=torch.long)
    ))

  # Text processing function
  def treat_text(self, text):
    """Normalise, lower-case and strip bracketed spans / stray symbols from ``text``."""
    text = unicodedata.normalize("NFKD",str(text))
    text = multiple_replace(patterns,text.lower())
    # Raw strings keep the regex escapes from being read as string escapes
    text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
    text = re.sub(r'( +)',' ', text)
    text = re.sub(r'(, ,)|(,,)',',', text)
    text = re.sub(r'(%)|(per cent)',' percent', text)
    return text

  def __len__(self):
    return len(self.data)

  # Returning data
  def __getitem__(self, index: int):
    # iloc is positional, so no index reset (a full frame copy) is needed per item
    data_row = self.data.iloc[index]
    return self.encode_text(data_row)

## Tasks maker

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split

LABEL_MAP  = {'positive':1, 'negative':0}

# Creating Meta Tasks
class MetaTask(Dataset):
    """Collection of tasks, each a ``(support, query, name)`` triple drawn from one domain."""

    def __init__(self, examples, num_task, k_support, k_query, tokenizer, training=True, max_seq_length=512, **args):
        """
        :param examples: DataFrame of samples; the ``domain``, ``label`` and
            ``text`` columns are read.
        :param num_task: number of training tasks (replaced by the number of
            distinct domains when ``training`` is False).
        :param k_support: support samples per class in each task.
        :param k_query: query samples per class in each task.
        :param tokenizer: object exposing ``encode_plus``.
        :param training: True draws a random domain per task; False builds
            exactly one task per domain.
        :param max_seq_length: padded/truncated token length.
        :param args: accepted for call-site compatibility and ignored.
        """
        self.examples = examples

        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

        # Randomly generating tasks
        self.create_batch(self.num_task, training)

    # Creating batch
    def create_batch(self, num_task, training):
        """Sample the support/query rows of every task and store them on the instance."""
        self.supports = []  # support set
        self.queries = []  # query set
        self.task_names = [] # Name of task
        self.supports_indexs = [] # index of supports
        self.queries_indexs = [] # index of queries
        self.num_task = num_task

        # Available tasks
        domains = self.examples['domain'].unique()

        # If not training, create one task per domain. task_names stays the
        # array returned by unique(), which callers compare element-wise.
        if not training:
            self.task_names = domains
            num_task = len(self.task_names)
            self.num_task = num_task

        for b in range(num_task):  # For each task,
            total_per_class = self.k_support + self.k_query
            task_size = 2*self.k_support + 2*self.k_query

            # Select a task at random
            if training:
                domain = random.choice(domains)
                self.task_names.append(domain)
            else:
                domain = self.task_names[b]

            # Task data
            domainExamples = self.examples[self.examples['domain'] == domain]

            # Never ask for more rows per class than the rarest label provides
            min_per_class = min(domainExamples['label'].value_counts())
            total_per_class = min(total_per_class, min_per_class)

            # Sample the same number of rows from each label (class)
            selected_examples = domainExamples.groupby("label").sample(total_per_class, replace = False)

            # Split data into support (training) and query (testing) sets,
            # keeping the requested query share even when the sample shrank
            s, q = train_test_split(selected_examples,
                                    stratify= selected_examples["label"],
                                    test_size= 2*self.k_query/task_size,
                                    shuffle=True)

            # Permutating data
            s = s.sample(frac=1)
            q = q.sample(frac=1)

            # Row indexes are only recorded for evaluation tasks
            if not training:
                self.supports_indexs.append(s.index)
                self.queries_indexs.append(q.index)

            # Creating list of support (training) and query (testing) tasks
            self.supports.append(s.to_dict('records'))
            self.queries.append(q.to_dict('records'))

    # Creating task tensors
    def create_feature_set(self, examples):
        """Tokenize ``examples`` (a list of row dicts) into a ``TensorDataset``."""
        all_input_ids      = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_attention_mask = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_token_type_ids = torch.empty(len(examples), self.max_seq_length, dtype = torch.long)
        all_label_ids      = torch.empty(len(examples), dtype = torch.long)

        for _id, e in enumerate(examples):
            all_input_ids[_id], all_attention_mask[_id], all_token_type_ids[_id], all_label_ids[_id] = self.encode_text(e)

        return TensorDataset(
            all_input_ids,
            all_attention_mask,
            all_token_type_ids,
            all_label_ids
        )

    # Data encoding
    def encode_text(self, example):
        """Return ``(input_ids, attention_mask, token_type_ids, label)`` tensors for one row dict."""
        comment_text = self.treat_text(example["text"])
        labels = LABEL_MAP[example["label"]]

        # NOTE(review): the (text, prompt) tuple is passed as a single argument;
        # presumably the tokenizer in use treats it as a sentence pair. Confirm.
        encoding = self.tokenizer.encode_plus(
            (comment_text, "It is great text"),
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return tuple((
            encoding["input_ids"].flatten(),
            encoding["attention_mask"].flatten(),
            encoding["token_type_ids"].flatten(),
            torch.tensor([labels], dtype=torch.long)
        ))

    # Regex text processing
    def treat_text(self, text):
        """Normalise, lower-case and strip bracketed spans / stray symbols from ``text``."""
        text = unicodedata.normalize("NFKD",str(text))
        text = multiple_replace(patterns,text.lower())
        # Raw strings keep the regex escapes from being read as string escapes
        text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
        text = re.sub(r'( +)',' ', text)
        text = re.sub(r'(, ,)|(,,)',',', text)
        text = re.sub(r'(%)|(per cent)',' percent', text)
        return text

    # Returns data upon calling
    def __getitem__(self, index):
        # Tokenization happens here, on every access
        support_set = self.create_feature_set(self.supports[index])
        query_set   = self.create_feature_set(self.queries[index])
        name        = self.task_names[index]
        return support_set, query_set, name

    def __len__(self):
        return self.num_task

# 50-50 split simulation

### Data

#### Data Statistics

In [None]:
# Build the descriptive-statistics table from `paths`.
# The cells below read its 'pos', 'neg' and 'names' columns.
info_load = load_data_statistics(paths)

# Preview the first rows
info_load.head()

#### Data split

In [None]:
# Training candidates: datasets holding at least 50 positive and at least 50 negative labels
select_criteria = (
    (info_load['pos'] >= 50)
    & (info_load['neg'] >= 50)
)
print(info_load.loc[select_criteria, 'names'])

In [None]:
# Load every dataset that meets the selection criteria and drop incomplete rows
Train_resource = (
    load_data(info_load[select_criteria])
    .dropna(axis=0)
    .reset_index()
    .drop(columns="index")
)

# Hold out the PFOS-PFOA domain; all other rows stay in the training pool
conditional = Train_resource['domain'] == "PFOS-PFOA.csv"
valid_tranf = Train_resource.loc[conditional]
Train_resource = (
    Train_resource.loc[~conditional]
    .reset_index()
    .drop(columns="index")
)

In [None]:
# Keep only documents made of at least 40 space-separated tokens
crit = Train_resource["text"].map(lambda doc: len(doc.split(" "))) >= 40
Train_resource = (
    Train_resource.loc[crit]
    .reset_index()
    .drop(columns="index")
)

In [None]:
# Datasets that miss the 50-positive/50-negative criterion ...
rest_resource = load_data(info_load.loc[~select_criteria])

# ... preceded by the held-out validation domain
rest_resource = (
    pd.concat([valid_tranf, rest_resource])
    .reset_index()
    .drop(columns="index")
)

In [None]:
# Testing pool: the rows of rest_resource that belong to these domains
Test_resource = rest_resource.loc[
    rest_resource['domain'].isin([
        'SR11_Li.csv',
        'SR14_Funakoshi.csv',
        'SR2_Meng.csv',
        'SR6_Wang.csv',
        'SR7_Zhou.csv',
        'SR8_Liu.csv',
        'SR9_Douxfils.csv',
        'Distal_radius_fractures_approach.csv',
        'Hallux_valgus_prognostic.csv',
        'Head_and_neck_cancer_imaging.csv',
        'Obstetric_emergency_training.csv',
        'Pregnancy_medication.csv',
        'Shoulderdystocia_positioning.csv',
        'Shoulderdystocia_recurrence.csv',
        'SR12_Cavender.csv',
        'SR13_Chatterjee.csv',
        'SR1_Yang.csv',
        'SR3_Segelov.csv',
        'SR4_Li.csv',
        'SR5_Lv.csv',
    ])
]

# Drop incomplete rows and renumber
Test_resource = (
    Test_resource.dropna(axis=0)
    .reset_index()
    .drop(columns="index")
)

# Keep only documents made of at least 40 space-separated tokens
crit = Test_resource["text"].map(lambda doc: len(doc.split(" "))) >= 40
Test_resource = (
    Test_resource.loc[crit]
    .reset_index()
    .drop(columns="index")
)

In [None]:
# Validation pool: the rows of rest_resource that belong to these domains
Valid_resource = rest_resource.loc[
    rest_resource['domain'].isin([
        'PFOS-PFOA.csv',
        'Bos_2018.csv',
        'Wolters_2018.csv',
        'SkeletalMuscleRelaxants.csv',
        'Fluoride.csv',
        'Kitchenham_2010.csv',
        'Radjenovic_2013.csv',
        'Opiods.csv',
        'Leafy_Greens_Future_set.csv',
        'Distal_radius_fractures_closed_reduction.csv',
        'Head_and_neck_cancer_bone.csv',
        'Shoulder_replacement_diagnostic.csv',
        'Shoulder_replacement_surgery.csv',
        'Total_knee_replacement.csv',
        'Vascular_access.csv',
    ])
]

# Drop incomplete rows and renumber
Valid_resource = (
    Valid_resource.dropna(axis=0)
    .reset_index()
    .drop(columns="index")
)

# Keep only documents made of at least 40 space-separated tokens
crit = Valid_resource["text"].map(lambda doc: len(doc.split(" "))) >= 40
Valid_resource = (
    Valid_resource.loc[crit]
    .reset_index()
    .drop(columns="index")
)

### Exploratory Data Analysis

#### Training dataset

In [None]:
# Distribution of space-separated token counts per training document
(
    Train_resource["text"]
    .map(lambda doc: len(doc.split(" ")))
    .hist()
)
plt.show()

In [None]:
# Count of each label value within every training domain
Train_resource.groupby('domain')['label']\
              .value_counts()

#### Testing Dataset

In [None]:
# Distribution of space-separated token counts per testing document
(
    Test_resource["text"]
    .map(lambda doc: len(doc.split(" ")))
    .hist()
)
plt.show()

In [None]:
# Count of each label value within every testing domain
Test_resource.groupby('domain')['label']\
              .value_counts()

#### Validation Dataset

In [None]:
# Distribution of space-separated token counts per validation document
(
    Valid_resource["text"]
    .map(lambda doc: len(doc.split(" ")))
    .hist()
)
plt.show()

In [None]:
# Count of each label value within every validation domain
Valid_resource.groupby('domain')['label']\
              .value_counts()

## Meta learning phase

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Meta-training configuration
Info = {
    "inner_print": 2, # Interval (in inner epochs) between inner-loss prints
    "bert_layers": 2, # Layer count; expanded to range(...) for the classifier below
    "tokenizer": initializer_model.tokenizer,
    "max_seq_length": 512,
    "meta_epoch":5, # Outer loop epochs
    "k_spt":8, # Support (training) examples per class, binary case
    "k_qry":8, # Query (testing) examples per class, binary case
    "outer_batch_size": 5, # Size of batch of tasks
    "inner_batch_size": 4, # Size of batch of classifications
    "outer_update_lr" : 5e-5, # Learning rate of task optimizer
    "inner_update_lr" : 5e-5, # Learning rate of classification optimizer
    "inner_update_step" : 4, # Inner loop epochs
    "inner_update_step_eval": 4, # Validation inner loop epochs
    "num_task_train" : 2, # Number of training tasks
    # "num_task_test" : 5 # Number of testing tasks
    "pos_weight" : 3 # p > 1 increases recall, p < 1 increases precision, applied in loss function
}

# Classifier built on the BERT encoder of the initializer model
model = SLR_Classifier(bert_layers = range(Info["bert_layers"]),
                       model = initializer_model.model.bert,
                       drop=0.2)


# Meta-train on the training pool; the testing pool is handed over as Test_resource
meta_train(data = Train_resource,
          model = model,
          device = device,
          Info = Info,
          print_epoch =True,
          size_layer=Info["bert_layers"],
          Test_resource=Test_resource)

## Testing trained meta model

In [None]:
# Count of each label value within every validation domain
Valid_resource.groupby('domain')['label'].value_counts()

### Task diagnosis

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns

# Work on a copy so the meta-trained model itself stays untouched
model_to_finetunning = deepcopy(model)

# Creating validation tasks: with training=False one task is built per
# domain, so num_task is overridden by the number of domains
valid = MetaTask(Valid_resource,
                 num_task = 500,
                 k_support=10,
                 k_query=20,
                 tokenizer = Info['tokenizer'],
                 training=False)

# Diagnostic dataset (boolean mask over the task names)
i = valid.task_names =='Opiods.csv'

# Task index; .item() requires exactly one matching task
idx = np.array(range(len(i)))[i].item()

# Support (train) and query (test) data.
# Indexing the task set tokenizes both sets, so it is done only once.
support, query, name = valid[idx]

print(name)
print(Valid_resource[Valid_resource['domain']== name]['label'].value_counts())
print("k_suport_examples:",len(support))
print("k_query_examples:",len(query))

# Support data loader
support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                batch_size=5)


#### Untrained model

In [None]:
model_to_finetunning.to(device)

# Inner Optimizer
inner_optimizer = Adam(model_to_finetunning.parameters(), lr=5e-5)


# Baseline: score the query set before any fine-tuning on the support set
model_to_finetunning.eval()
with torch.no_grad():
    # The whole query set is scored as a single batch
    query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
    # Built-in next(): DataLoader iterators are not guaranteed to expose .next()
    query_batch = next(iter(query_dataloader))
    query_batch = tuple(t.to(device) for t in query_batch)
    q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch

    # Predictions; `features` is the [feature, logit] pair returned by the model
    _, features, predictions = model_to_finetunning(q_input_ids, q_attention_mask, q_token_type_ids, labels = q_label_id)

    predictions = predictions.detach().cpu().squeeze()
    q_label_id = q_label_id.detach().cpu()

    acc = fn.accuracy(predictions, q_label_id).item()
    print("acc:",acc)

# TSNE Dimensionality reduction of the latent features
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random').fit_transform(features[0].to('cpu'))

# Plot
sns.scatterplot(x=X_embedded[:, 0],
                y=X_embedded[:, 1],
                hue=q_label_id)
plt.show()

#### Trained model

In [None]:
# Fresh copy of the meta-trained model, fine-tuned on the support set below
model_to_finetunning = deepcopy(model)
inner_optimizer = Adam(model_to_finetunning.parameters(), lr=5e-5)
model_to_finetunning.train()
model_to_finetunning.to(device)

# Inner loop training
for i in range(0,Info['inner_update_step']):
    all_loss = []

    # Inner training batch (support set)
    for inner_step, batch in enumerate(support_dataloader):

        # Batches follow the model onto `device`, so this also runs without a GPU
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, token_type_ids, label_id = batch

        # Feed forward
        loss, _, _ = model_to_finetunning(input_ids, attention_mask, token_type_ids=token_type_ids, labels = label_id)

        loss.backward()
        inner_optimizer.step()
        inner_optimizer.zero_grad()

        all_loss.append(loss.item())

    if i % Info["inner_print"] == 0:
        print("Inner Loss: ", np.mean(all_loss))

# Predicting
model_to_finetunning.eval()
with torch.no_grad():
    # The whole query set is scored as a single batch
    query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
    # Built-in next(): DataLoader iterators are not guaranteed to expose .next()
    query_batch = next(iter(query_dataloader))
    query_batch = tuple(t.to(device) for t in query_batch)
    q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch

    # Predictions; `features` is the [feature, logit] pair returned by the model
    _, features, predictions = model_to_finetunning(q_input_ids, q_attention_mask, q_token_type_ids, labels = q_label_id)

    predictions = predictions.detach().cpu().squeeze()
    q_label_id = q_label_id.detach().cpu()

    acc = fn.accuracy(predictions, q_label_id).item()
    print("acc:",acc)



# Release the fine-tuned copy before plotting
model_to_finetunning.to(torch.device('cpu'))
del  inner_optimizer, model_to_finetunning
torch.cuda.empty_cache()


# TSNE Dimensionality reduction of the latent features
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random').fit_transform(features[0].to('cpu'))

# Plot; point opacity follows the predicted probability
sns.scatterplot(x=X_embedded[:, 0],
                y=X_embedded[:, 1],
                hue=q_label_id,
                alpha=torch.sigmoid(features[1]).to('cpu').view(-1))
plt.show()

## Domain learning phase with validation resource

In [None]:
# Count of each label value within every validation domain
Valid_resource.groupby('domain')['label'].value_counts()

### Diagnosis

In [None]:
# model

# Reference configuration, kept commented out:
# Info = {
#     "inner_print": 1,
#     "bert_layers": 3,
#     "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
#     # "model": model,
#     "tokenizer": initializer_bert.tokenizer,
#     "max_seq_length": 512,
#     "meta_epoch":30, # Number of outer-loop epochs
#     "k_spt":20, # Training examples per class (k_spt), binary case
#     "k_qry":20, # Validation examples per class (k_qry), binary case
#     "outer_batch_size": 5, # Splits the tasks into batches of tasks
#     "inner_batch_size": 5, # Splits the classification examples into batches
#     "outer_update_lr" : 5e-5, # Learning rate of the task optimizer
#     "inner_update_lr" : 5e-5, # Learning rate of the classification optimizer
#     "inner_update_step" : 3, # Number of epochs inside the inner loop
#     "inner_update_step_eval": 3, # Number of epochs inside the validation inner loop
#     "num_task_train" : 100, # Number of training tasks
#     "num_task_test" : 5 # Number of testing tasks
# }

# Overrides applied on top of the Info dict defined earlier.
# NOTE(review): 'tresh' is presumably a decision threshold read by the
# diagnosis helper; confirm against its implementation.
Info['tresh'] = 0.9
Info["inner_update_step_eval"] = 3
Info["inner_print"] = 1

Useful information:

+ Variables should be specified within `Info`

+ Model should be in the `model` variable

+ Training should be done with specification in the `device` variable

It's possible to access some of the data from the testing dataset after pressing "Train", such as:

+ `logits`: logits from classifier stage, no activation

+ `X_embedded`: 2-dimensional values from dimensionality reduction of the latent space

+ `features`: Latent space values  (feature_map layer output)

+ `labels`: True values

+ `data_train`: Training data (Not the same order given to the model on training)

+ `data_test`: Testing data

+ `batch_size_test`: Batch size from the testing dataset, so that prediction time can be reduced

In [None]:
#@title Evaluating validation dataset
batch_size_test =   20 #@param {type:"number"}
# Task names: one entry per distinct validation domain
names = Valid_resource['domain'].unique()

# Build the diagnosis object over the validation domains, then invoke it
diagnosis5050 = diagnosis(names, Valid_resource, batch_size_test, model,Info, start=0)
diagnosis5050()

### Text Evaluation of Positive Examples

In [None]:
def treat_text(text):
    """Normalise, lower-case and strip bracketed spans / stray symbols from ``text``.

    The input is coerced with ``str`` and NFKD-normalised, passed through
    ``multiple_replace`` with the module-level ``patterns``, and then cleaned
    with the regex substitutions below.
    """
    text = unicodedata.normalize("NFKD",str(text))
    text = multiple_replace(patterns,text.lower())
    # Raw strings keep the regex escapes from being read as string escapes.
    # Remove parenthesised/bracketed spans, lone digits, '<', '>' and '- '
    text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )','', text)
    # Collapse runs of spaces
    text = re.sub(r'( +)',' ', text)
    # Collapse doubled commas
    text = re.sub(r'(, ,)|(,,)',',', text)
    # Spell out percent signs
    text = re.sub(r'(%)|(per cent)',' percent', text)
    return text

# Test rows and their logits, as exposed by the diagnosis object
data_test = diagnosis5050.data_test
logits = diagnosis5050.logits

# Pick one positive example at random
indx= data_test[data_test['label'] == 'positive'].index
indx= np.random.choice(indx)


# Print the example
pprint(data_test.iloc[indx])

# Prediction by the model
print("Predicted:",torch.sigmoid(logits[indx]).item() )

# Text of the example after preprocessing
print('Treated Text:')
pprint(treat_text(data_test['text'].iloc[indx]))

# Raw text of the example, for comparison with the treated version
print('Text:')
pprint(data_test['text'].iloc[indx])

## Save model

In [None]:
import shutil
import datetime
import re
import json
from pathlib import Path

# Checkpoints of this experiment live under ./Models/<base_path>/
base_path = '5050split'
path_save = "./"+"Models/"+base_path+"/"
Path(path_save).mkdir(parents=True, exist_ok=True)

# Timestamp tag shaped like YYYY_MM_DD_hr_HH_MM_SS (sub-second part cut off)
check = (
    str(datetime.datetime.now())[:19]
    .replace(":", "_")
    .replace("-", "_")
    .replace(" ", "_hr_")
)
print("Data e hora do salvamento:", check)

# One sub-directory per save
Path(f"{path_save}/{check}").mkdir(parents=True, exist_ok=True)
model_path = f'{path_save}/{check}/model.pt'
meta_info_path = f'{path_save}/{check}/Info.json'

# Meta info: the tokenizer object is swapped for its name so the dict can be written as JSON
save_info = Info.copy()
save_info['model'] = initializer_model.tokenizer.name_or_path
del save_info["tokenizer"]

with open(meta_info_path, 'w') as fp:
    fp.write(json.dumps(save_info))

# The whole model object is pickled, not just its parameters.
# To save only parameters:
# torch.save(model.state_dict(), model_path)
torch.save(model, model_path)

## Load model

In [None]:
# Reload the whole model object from the path written by the save cell.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you
# trust; the model's class definitions must also be available in this session.
model = torch.load(model_path)
# Display the loaded module
model

## Computing simulation attempts

In [None]:
# Distinct validation domains handed to the simulation pipeline
names_to_valid = Valid_resource['domain'].unique()

In [None]:
# Free cached memory before the long-running simulation
torch.clear_autocast_cache()
gc.collect()
torch.cuda.empty_cache()

# Results of this experiment are written under ./Results/<base_path>/
base_path = '5050split'
path_save = "./"+"Results/"+base_path+"/"

# Run the simulation pipeline over every validation domain
pipeline_simpulation(Valid_resource, names_to_valid, path_save, model, Info)

# Comparison simulation

### Data

In [None]:
import glob
import os

# Root folder searched for the comparison datasets
path = 'SLR_data'

# CSV files of the two benchmark collections, located at any depth below `path`
cohen_paths = glob.glob(f"{path}/**/cohen/*.csv", recursive=True)
SWIFT_paths = glob.glob(f"{path}/**/SWIFT systematic review data/*.csv", recursive=True)

# File names only (directories stripped)
cohen_names = list(map(os.path.basename, cohen_paths))
SWIFT_names = list(map(os.path.basename, SWIFT_paths))

base_path = 'Comparison'

#### Data Statistics

#### Data split

In [None]:
from itertools import chain

# Load every dataset described in info_load and drop incomplete rows
All_data = (
    load_data(info_load)
    .dropna()
    .reset_index()
    .drop(columns="index")
)

# Domains reserved for the comparison benchmarks
comparison_names = [*cohen_names, *SWIFT_names]

# Benchmark pools, one per collection
cohen_data = (
    All_data.loc[All_data['domain'].isin(cohen_names)]
    .reset_index()
    .drop(columns="index")
)

SWIFT_data = (
    All_data.loc[All_data['domain'].isin(SWIFT_names)]
    .reset_index()
    .drop(columns="index")
)

# Everything outside the benchmarks is available for meta-training
Train_resource = (
    All_data.loc[~All_data['domain'].isin(comparison_names)]
    .reset_index()
    .drop(columns="index")
)

In [None]:
# Keep only documents made of at least 40 space-separated tokens
crit = Train_resource["text"].map(lambda doc: len(doc.split(" "))) >= 40
Train_resource = (
    Train_resource.loc[crit]
    .reset_index()
    .drop(columns="index")
)

### Exploratory Data Analysis

#### Training dataset

In [None]:
# Distribution of space-separated token counts per training document
(
    Train_resource["text"]
    .map(lambda doc: len(doc.split(" ")))
    .hist()
)
plt.show()

In [None]:
# Count of each label value within every training domain
Train_resource.groupby('domain')['label']\
              .value_counts()

#### cohen Dataset

In [None]:
# Distribution of space-separated token counts per cohen document
(
    cohen_data["text"]
    .map(lambda doc: len(doc.split(" ")))
    .hist()
)
plt.show()

In [None]:
# Count of each label value within every cohen domain
cohen_data.groupby('domain')['label']\
              .value_counts()

#### SWIFT Dataset

In [None]:
# Distribution of space-separated token counts per SWIFT document
(
    SWIFT_data["text"]
    .map(lambda doc: len(doc.split(" ")))
    .hist()
)
plt.show()

In [None]:
# Count of each label value within every SWIFT domain
SWIFT_data.groupby('domain')['label']\
              .value_counts()

## Meta learning phase

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Meta-training configuration
Info = {
    "inner_print": 2, # Interval (in inner epochs) between inner-loss prints
    "bert_layers": 2, # Layer count; expanded to range(...) for the classifier below
    "tokenizer": initializer_model.tokenizer,
    "max_seq_length": 512,
    "meta_epoch":5, # Outer loop epochs
    "k_spt":8, # Support (training) examples per class, binary case
    "k_qry":8, # Query (testing) examples per class, binary case
    "outer_batch_size": 5, # Size of batch of tasks
    "inner_batch_size": 4, # Size of batch of classifications
    "outer_update_lr" : 5e-5, # Learning rate of task optimizer
    "inner_update_lr" : 5e-5, # Learning rate of classification optimizer
    "inner_update_step" : 4, # Inner loop epochs
    "inner_update_step_eval": 4, # Validation inner loop epochs
    "num_task_train" : 2, # Number of training tasks
    # "num_task_test" : 5 # Number of testing tasks
    "pos_weight" : 3 # p > 1 increases recall, p < 1 increases precision, applied in loss function
}

# Classifier built on the BERT encoder of the initializer model
model = SLR_Classifier(bert_layers = range(Info["bert_layers"]),
                       model = initializer_model.model.bert,
                       drop=0.2)


# Meta-train on the training pool; no testing pool is supplied in this run
meta_train(data = Train_resource,
          model = model,
          device = device,
          Info = Info,
          print_epoch =True,
          size_layer=Info["bert_layers"],
          Test_resource=None)

## Testing trained meta model in SWIFT

In [None]:
# Count of each label value within every SWIFT domain
SWIFT_data.groupby('domain')['label'].value_counts()

### Task diagnosis

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns

# Work on a copy of `model` for the fine-tuning experiments below
model_to_finetunning = deepcopy(model)

# Build the validation tasks from the SWIFT data
valid = MetaTask(
    SWIFT_data,
    num_task=500,
    k_support=10,
    k_query=20,
    tokenizer=Info['tokenizer'],
    training=False,
)

# Boolean mask selecting the dataset under diagnosis
i = valid.task_names == 'Neuropain.csv'

# Position of the selected task; .item() requires exactly one match
idx = np.array(range(len(i)))[i].item()

# Support (train) and query (test) data, plus the task name
support = valid[idx][0]
query = valid[idx][1]
name = valid[idx][2]

print(name)
print(SWIFT_data.loc[SWIFT_data['domain'] == name, 'label'].value_counts())
print("k_suport_examples:", len(support))
print("k_query_examples:", len(query))

# Support data loader (shuffled batches of 5)
support_dataloader = DataLoader(
    support,
    sampler=RandomSampler(support),
    batch_size=5,
)


#### Untrained model

In [None]:
# Evaluate the copied model on the query set without any support-set
# fine-tuning in this section, then plot its latent space.
model_to_finetunning.to(device)

# Predicting
model_to_finetunning.eval()
with torch.no_grad():
    # One batch holding the whole query set
    query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
    # Use the built-in next(): Python 3 iterators expose __next__, and the
    # `.next()` alias is not available on current PyTorch DataLoader iterators.
    query_batch = next(iter(query_dataloader))
    query_batch = tuple(t.to(device) for t in query_batch)
    q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch

    # Predictions
    _, features, predictions = model_to_finetunning(
        q_input_ids, q_attention_mask, q_token_type_ids, labels=q_label_id
    )

    predictions = predictions.detach().cpu().squeeze()
    q_label_id = q_label_id.detach().cpu()

    acc = fn.accuracy(predictions, q_label_id).item()
    print("acc:", acc)

# TSNE dimensionality reduction of the latent features.
# NumPy arrays are handed to scikit-learn/seaborn instead of torch tensors.
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random').fit_transform(features[0].detach().cpu().numpy())

# Plot, colored by the true label
sns.scatterplot(x=X_embedded[:, 0],
                y=X_embedded[:, 1],
                hue=q_label_id.numpy())
plt.show()

#### Trained model

In [None]:
# Fine-tune a fresh copy of `model` on the support set, evaluate it on the
# query set, then plot its latent space.
model_to_finetunning = deepcopy(model)
# Move the model to the target device before building the optimizer
model_to_finetunning.to(device)
inner_optimizer = Adam(model_to_finetunning.parameters(), lr=5e-5)
model_to_finetunning.train()

# Inner loop training
for epoch in range(Info['inner_update_step']):
    all_loss = []

    # Inner training batch (support set)
    for batch in support_dataloader:
        # Send the batch to the same device as the model; a hard-coded "cuda"
        # fails on CPU-only hosts.
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, token_type_ids, label_id = batch

        # Feed forward
        loss, _, _ = model_to_finetunning(
            input_ids, attention_mask,
            token_type_ids=token_type_ids, labels=label_id
        )

        loss.backward()
        inner_optimizer.step()
        inner_optimizer.zero_grad()

        all_loss.append(loss.item())

    # Report the mean support loss every `inner_print` epochs
    if epoch % Info["inner_print"] == 0:
        print("Inner Loss: ", np.mean(all_loss))

# Predicting
model_to_finetunning.eval()
with torch.no_grad():
    # One batch holding the whole query set
    query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
    # Use the built-in next(): Python 3 iterators expose __next__, and the
    # `.next()` alias is not available on current PyTorch DataLoader iterators.
    query_batch = next(iter(query_dataloader))
    query_batch = tuple(t.to(device) for t in query_batch)
    q_input_ids, q_attention_mask, q_token_type_ids, q_label_id = query_batch

    # Predictions
    _, features, predictions = model_to_finetunning(
        q_input_ids, q_attention_mask, q_token_type_ids, labels=q_label_id
    )

    predictions = predictions.detach().cpu().squeeze()
    q_label_id = q_label_id.detach().cpu()

    acc = fn.accuracy(predictions, q_label_id).item()
    print("acc:", acc)


# Release the fine-tuned copy and its optimizer, then free cached GPU memory
model_to_finetunning.to(torch.device('cpu'))
del inner_optimizer, model_to_finetunning
torch.cuda.empty_cache()


# TSNE dimensionality reduction of the latent features.
# NumPy arrays are handed to scikit-learn/seaborn instead of torch tensors.
X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random').fit_transform(features[0].detach().cpu().numpy())

# Plot, colored by the true label; point opacity is sigmoid(features[1])
sns.scatterplot(x=X_embedded[:, 0],
                y=X_embedded[:, 1],
                hue=q_label_id.numpy(),
                alpha=torch.sigmoid(features[1]).detach().cpu().view(-1).numpy())
plt.show()

## Domain learning phase in SWIFT datasets

In [None]:
# Label counts within each SWIFT domain
(
    SWIFT_data
    .groupby('domain')['label']
    .value_counts()
)

### Diagnosis

In [None]:
# model

# Reference configuration (commented out, not executed):
# Info = {
#     "inner_print": 1, 
#     "bert_layers": 3,
#     "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
#     # "model": model,
#     "tokenizer": initializer_bert.tokenizer,
#     "max_seq_length": 512,
#     "meta_epoch":30, # Number of outer-loop epochs
#     "k_spt":20, # Training examples per class (k_spt), binary case
#     "k_qry":20, # Validation examples per class (k_qry), binary case
#     "outer_batch_size": 5, # Splits the tasks into batches of tasks
#     "inner_batch_size": 5, # Splits the classification examples into batches
#     "outer_update_lr" : 5e-5, # Learning rate of the task optimizer
#     "inner_update_lr" : 5e-5, # Learning rate of the classification optimizer
#     "inner_update_step" : 3, # Number of epochs inside the inner loop
#     "inner_update_step_eval": 3, # Number of epochs inside the validation inner loop
#     "num_task_train" : 100, # Number of training tasks
#     "num_task_test" : 5 # Number of testing tasks
# }

# Settings overridden for the diagnosis run
Info.update({
    'tresh': 0.9,  # presumably a decision threshold -- confirm against `diagnosis`
    "inner_update_step_eval": 3,
    "inner_print": 1,
})

Useful information:

+ Variables should be specified within `Info`

+ Model should be in the `model` variable

+ Training should be done with specification in the `device` variable

It's possible to access some of the data from the testing dataset after pressing "Train", such as:

+ `logits`: logits from classifier stage, no activation

+ `X_embedded`: 2-dimensional values from dimensionality reduction of the latent space

+ `features`: Latent space values  (feature_map layer output)

+ `labels`: True values

+ `data_train`: Training data (Not the same order given to the model on training)

+ `data_test`: Testing data

+ `batch_size_test`: Batch size from the testing dataset, so that prediction time can be reduced

In [None]:
#@title Evaluating validation dataset
# Colab form field: batch size used when predicting on the testing dataset
batch_size_test =   20 #@param {type:"number"}
# Task names
names = SWIFT_data['domain'].unique()

# Build the diagnosis object for the SWIFT domains, then run it
diagnosis_comp = diagnosis(names, SWIFT_data, batch_size_test, model,Info, start=0)
diagnosis_comp()

### Text evaluation of positive examples

In [None]:
def treat_text(text):
    """Return a cleaned, lower-cased version of ``text``.

    The value is converted with ``str``, NFKD-normalized, lower-cased and
    passed through ``multiple_replace`` with the module-level ``patterns``.
    Bracketed spans, single digits surrounded by spaces, ``<``, ``>`` and
    ``"- "`` are then removed, runs of spaces and doubled commas are
    collapsed, and ``%`` / ``per cent`` are replaced by `` percent``.

    Args:
        text: Value to clean; anything accepted by ``str``.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize("NFKD", str(text))
    text = multiple_replace(patterns, text.lower())
    # Raw strings keep regex escapes such as \( and \d away from Python's own
    # string-escape handling (they are invalid escape sequences otherwise).
    # NOTE(review): `.+` is greedy, so everything between the first opening and
    # the last closing bracket on a line is removed -- confirm this is intended.
    text = re.sub(r'(\(.+\))|(\[.+\])|( \d )|(<)|(>)|(- )', '', text)
    text = re.sub(r'( +)', ' ', text)
    text = re.sub(r'(, ,)|(,,)', ',', text)
    text = re.sub(r'(%)|(per cent)', ' percent', text)
    return text

# Test split and classifier logits collected by the diagnosis run
data_test = diagnosis_comp.data_test
logits = diagnosis_comp.logits

# Randomly choose one positive example.
# NOTE(review): index labels are used positionally below (`.iloc`, `logits[...]`);
# this assumes data_test has a default 0..n-1 index -- confirm.
indx = data_test[data_test['label'] == 'positive'].index
indx = np.random.choice(indx)


# Print the example
pprint(data_test.iloc[indx])

# Prediction by the model
print("Predicted:", torch.sigmoid(logits[indx]).item())

# Text of the example after cleaning
print('Treated Text:')
pprint(treat_text(data_test['text'].iloc[indx]))

# Original text of the example, untreated, for comparison with the cleaned
# version printed above
print('Text:')
pprint(data_test['text'].iloc[indx])

## Save model

In [None]:
import shutil
import datetime
import re
import json
from pathlib import Path

base_path = 'Comparison'
path_save = f"./Models/{base_path}/"

# Make sure the base output directory exists
Path(path_save).mkdir(parents=True, exist_ok=True)

# Timestamp naming the run directory, formatted like 2022_03_01_hr_12_34_56
check = datetime.datetime.now().strftime("%Y_%m_%d_hr_%H_%M_%S")

print("Data e hora do salvamento:", check)

# Run directory for this save
Path(f"{path_save}/{check}").mkdir(parents=True, exist_ok=True)

# Output files
model_path = f'{path_save}/{check}/model.pt'
meta_info_path = f'{path_save}/{check}/Info.json'

# Meta info: a copy of Info with the tokenizer object replaced by its
# name/path string under the 'model' key
save_info = {**Info, 'model': initializer_model.tokenizer.name_or_path}
del save_info["tokenizer"]

# Write the meta info as JSON
with open(meta_info_path, 'w') as fp:
    json.dump(save_info, fp)

# Save the entire model object
torch.save(model, model_path)

# To save only parameters:
# torch.save(model.state_dict(), model_path)

## Load model

In [None]:
# Downloading entire model
model = torch.load(model_path)
model

## Computing simulation attempts on SWIFT datasets

In [None]:
# Clear the autocast cache, run garbage collection and release cached CUDA
# memory before starting the simulation
torch.clear_autocast_cache()
gc.collect()
torch.cuda.empty_cache()

# Results directory for the SWIFT datasets
path_save = f"./Results/{base_path}/SWIFT/"

pipeline_simpulation(SWIFT_data, SWIFT_names, path_save, model, Info)

## Computing simulation attempts on cohen datasets

In [None]:
# Clear the autocast cache, run garbage collection and release cached CUDA
# memory before starting the simulation
torch.clear_autocast_cache()
gc.collect()
torch.cuda.empty_cache()

# Results directory for the cohen (Drugs) datasets
path_save = f"./Results/{base_path}/Drugs/"

pipeline_simpulation(cohen_data, cohen_names, path_save, model, Info)