# Experimento finetuning 1ª iteración
* Objetivo: Determinar si se consigue aprendizaje con el fine-tuning supervisado
* Método: Fine-tuning del tipo (etiqueta=fenotipo, valor=abstract) con una capa softmax al final del BERT
* Datos: abstracts.csv, phenotypes-22-12-15.csv

## 1. Cargar datos

In [32]:
import sentence_transformers
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Cargar el BERT de partida

BERTBASE =  'sentence-transformers/stsb-bert-base'
PRITAMDEKAMODEL = 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb'
bertmodel = SentenceTransformer(PRITAMDEKAMODEL)
# Se puede aumentar max_seq_length?

# Función clean abstract

# Download the stopwords from NLTK
nltk.download('punkt')
nltk.download('stopwords')

def clean_abstract(abstract):
    # Convert the text to lowercase
    abstract = abstract.lower()

    # Remove punctuation
    abstract = abstract.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    tokens = word_tokenize(abstract)

    # Remove stopwords
    tokens = [word for word in tokens if not word in stopwords.words()]

    # Join the tokens back into a single string
    abstract = ' '.join(tokens)

    return abstract

# Obtener los datos de entrenamiento

PATH_DATA = '../pubmed-queries/abstracts'
PATH_DATA_CSV = PATH_DATA + '/abstracts.csv'
PATH_DATA_FENOTIPOS = '../pubmed-queries/results/phenotypes-22-12-15.csv'
PATH_INDEX_FENOTIPOS = PATH_DATA + '/index-phenotypes.csv'
SEED = 42

dfPapers = pd.read_csv(PATH_DATA_CSV, sep='\t', low_memory=False, na_values=[''])
dfPhenotypes = pd.read_csv(PATH_DATA_FENOTIPOS, sep=';', low_memory=False, na_values=[''])
dfIndex = pd.read_csv(PATH_INDEX_FENOTIPOS, sep='\t', low_memory=False, na_values=[''])

# Cargar la ontología

from pyhpo import Ontology

onto = Ontology('../pubmed-queries/hpo-22-12-15-data')

[nltk_data] Downloading package punkt to /home/domingo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/domingo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Obtener dataset de entrenamiento

In [33]:
# phenotypeId	phenotypeName	numberPapers	paperList

# Tomar la lista de fenotipos = tags
tags = dfIndex['phenotypeName']
numlabels = len(tags)
print(numlabels, 'tags')
print(tags[:5])
# Separar abstracts en train, validation y test

train = dfPapers.sample(frac=0.8, random_state=SEED)
dTest = dfPapers.drop(train.index)
dVal = train.sample(frac=0.2, random_state=SEED)
dTrain = train.drop(dVal.index)

# Considerar train_test_split

# paperId	phenotypeId	phenotypeName	title	abstract
list = [dTrain, dVal, dTest]
names = ['Train', 'Validation', 'Test']
for j in range(0, 3):
    l = list[j]
    print(names[j],': ', len(l), '\n')
    for i in range(0, 2):
        print(l.iloc[i])
    print('')


100 tags
0                    Temporomandibular joint ankylosis
1                                             Dyslexia
2    Stippling of the epiphysis of the proximal pha...
3                                 Ankle joint effusion
4                              Reduced C-peptide level
Name: phenotypeName, dtype: object
Train :  14865 

paperId                                                   22103752
phenotypeId                                             HP:0003557
phenotypeName       Increased variability in muscle fiber diameter
title            Myosin light chain 3f attenuates age-induced d...
abstract         Aging is characterized by a progressive loss o...
Name: 13987, dtype: object
paperId                                                   16498179
phenotypeId                                             HP:0025317
phenotypeName                                        Cubitus varus
title            Kinematics and kinetics of the racket-arm duri...
abstract         The purposes of t

## 3. ¿Cómo se hace el fine-tuning?
Para nuestro caso particular necesitamos pasarle los tags, añadir la red neuronal a la salida y la capa softmax y la forma de evaluación.


In [34]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
torch.manual_seed(SEED)

num_epochs = 5

model = bertmodel

# TODO: Documentarse cómo se prepara el DataLoader con los pares abstract-fenotipo

train_dataloader = DataLoader(dTrain, shuffle=True, batch_size=16)
val_dataloader = DataLoader(dVal, shuffle=False, batch_size=16)
test_dataloader = DataLoader(dTest, shuffle=False, batch_size=16)

# TODO: Documentarse sobre loss y evaluator

train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=numlabels)
evaluator = sentence_transformers.evaluation.LabelAccuracyEvaluator(val_dataloader, '', softmax_model=None, write_csv=True)

# TODO: Documentarse sobre los hiperparámetros y preparar el grid

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=50,
    warmup_steps=100,
    output_path='./output/fine-tuned-bio-bert',
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/930 [00:00<?, ?it/s]

KeyError: 1171