In [1]:
from tqdm import tqdm
from loadDataset import load_dataset_as_dict

In [3]:
# Training split: report how many instances it has and show the first
# input/label pair as a quick sanity check of what was loaded.
train_dataset = load_dataset_as_dict("../data/train.json")
print(
    f"Número de instancias en train: {len(train_dataset['inputs'])}",
    f"Instancia 0 input: {train_dataset['inputs'][0]}",
    f"Instancia 0 label: {train_dataset['labels'][0]}",
    sep="\n",
)

# Development split: only its size is reported here.
dev_dataset = load_dataset_as_dict("../data/dev.json")
print(f"Número de instancias en dev: {len(dev_dataset['inputs'])}")

Número de instancias en train: 182822
Instancia 0 input: <s>Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950</s>Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma</s>Hyperplasia</s>Hyperophy</s>Atrophy</s>Dyplasia</s>
Instancia 0 label: 3
Número de instancias en dev: 4183


# Tokenización

In [6]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


Preparamos el tokenizador:

In [7]:
# TODO: use the checkpoint this project is actually supposed to use, not an arbitrary one.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased",
    use_fast=True,
)

# Declare the dataset's delimiter markers as special tokens so the tokenizer
# does not split them into pieces.
special_tokens_dict = dict(
    bos_token='<s>',
    sep_token='</s>',
    eos_token='</s>',
)
# Kept as the cell's last expression so its return value is displayed.
tokenizer.add_special_tokens(special_tokens_dict)

2

In [None]:
# Encode every training input in one batched call. No padding or truncation is
# applied, so each 'input_ids' entry keeps its own length.
tokenized_and_encoded_train_dataset = tokenizer(train_dataset['inputs'])  # TODO: add padding and truncation

# Token strings for each training input, kept for manual inspection.
# A comprehension is used so the iteration variable neither shadows the
# built-in `input` nor stays bound in the kernel namespace after this cell runs.
tokenized_train_dataset = [
    tokenizer.tokenize(text) for text in tqdm(train_dataset['inputs'])
]

# Same batched encoding for the development split.
tokenized_and_encoded_dev_dataset = tokenizer(dev_dataset['inputs'])  # TODO: add padding and truncation

Comprobación de una instancia de `train`:

In [6]:
# Show the first training instance in both representations (token strings,
# then encoded ids), each followed by its length.
first_tokens = tokenized_train_dataset[0]
first_input_ids = tokenized_and_encoded_train_dataset['input_ids'][0]
for view in (first_tokens, first_input_ids):
    print(view)
    print(len(view))

['<s>', 'chronic', 'ur', '##eth', '##ral', 'obstruction', 'because', 'of', 'ur', '##ina', '##ry', 'cal', '##cu', '##li', ',', 'pro', '##static', 'hyper', '##op', '##hy', ',', 'tumors', ',', 'normal', 'pregnancy', ',', 'tumors', ',', 'ut', '##erine', 'pro', '##la', '##pse', 'or', 'functional', 'disorders', 'cause', 'hydro', '##ne', '##ph', '##rosis', 'which', 'by', 'definition', 'is', 'used', 'to', 'describe', 'dil', '##ata', '##tion', 'of', 'renal', 'pe', '##lvis', 'and', 'calculus', 'associated', 'with', 'progressive', 'at', '##rop', '##hy', 'of', 'the', 'kidney', 'due', 'to', 'obstruction', 'to', 'the', 'out', '##flow', 'of', 'urine', 'refer', 'robbins', '7', '##y', '##h', '/', '9', ',', '101', '##2', ',', '9', '/', 'e', '.', 'p', '##9', '##50', '</s>', 'chronic', 'ur', '##eth', '##ral', 'obstruction', 'due', 'to', 'benign', 'prism', '##atic', 'hyper', '##pl', '##asia', 'can', 'lead', 'to', 'the', 'following', 'change', 'in', 'kidney', 'par', '##en', '##chy', '##ma', '</s>', 'hyper',

Distribución de número de tokens por split (train y dev):

In [10]:
import pandas as pd
import plotly.graph_objects as go

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Length of each encoded instance (number of input ids), one frame per split.
df_train = pd.DataFrame(
    {'number_of_tokens': list(map(len, tokenized_and_encoded_train_dataset['input_ids']))}
)
df_dev = pd.DataFrame(
    {'number_of_tokens': list(map(len, tokenized_and_encoded_dev_dataset['input_ids']))}
)

# One box trace per split.
trace1 = go.Box(x=df_train['number_of_tokens'], name='Train', boxpoints='all')
trace2 = go.Box(x=df_dev['number_of_tokens'], name='Dev', boxpoints='all')

# Put both traces in a single figure with a logarithmic x axis.
data = [trace1, trace2]
layout = go.Layout(
    title="Distribution of the number of tokens in the input (<s>Context</s>...)",
    xaxis_title="Number of tokens",
    yaxis_title="Split",
    xaxis={'type': 'log', 'autorange': True},
)

fig = go.Figure(data=data, layout=layout)
fig.show()

Comprobamos si hay alguna instancia con más de 512 tokens e imprimimos ejemplos:

In [19]:
def more_than_512_tokens(tokenized_dataset, max_tokens=512):
    """Count the instances whose encoded sequence is longer than ``max_tokens``.

    Args:
        tokenized_dataset: Mapping whose ``'input_ids'`` entry is an iterable
            of sized sequences, one per instance.
        max_tokens: Length threshold. An instance is counted only when its
            length is strictly greater than this value. Defaults to 512, which
            keeps the behaviour of existing one-argument calls.

    Returns:
        int: Number of instances with more than ``max_tokens`` ids.
    """
    # Each comparison yields a bool; summing them counts the True results.
    return sum(len(ids) > max_tokens for ids in tokenized_dataset['input_ids'])


### TRAIN ###

# Share of training instances whose encoding exceeds 512 ids.
train_input_ids = tokenized_and_encoded_train_dataset['input_ids']
train_total = len(train_input_ids)
more_than_512_tokens_train = more_than_512_tokens(tokenized_and_encoded_train_dataset)
print(f"Number of instances with more than 512 tokens (train): {more_than_512_tokens_train}/{train_total} ({more_than_512_tokens_train/train_total*100:.2f}%)")

# Print the first few over-length training instances, split back into their
# '</s>'-delimited fields; the counter is the number of examples still to show.
number_of_examples = 10
for i, ids in enumerate(train_input_ids):
    if len(ids) <= 512:
        continue
    parts = train_dataset['inputs'][i].split('</s>')
    print(
        f"Instance {i}:"
        f"\n\t Context: {parts[0].replace('<s>', '')} "
        f"\n\t Question: {parts[1]} "
        f"\n\t Option A: {parts[2]} "
        f"\n\t Option B: {parts[3]} "
        f"\n\t Option C: {parts[4]} "
        f"\n\t Option D: {parts[5]} "
        f"\n\t Correct option: {train_dataset['labels'][i]}"
    )
    number_of_examples -= 1
    if number_of_examples == 0:
        break


### DEV ###

# Same share for the development split (no examples are printed).
dev_total = len(tokenized_and_encoded_dev_dataset['input_ids'])
more_than_512_tokens_dev = more_than_512_tokens(tokenized_and_encoded_dev_dataset)
print(f"Number of instances with more than 512 tokens (dev): {more_than_512_tokens_dev}/{dev_total} ({more_than_512_tokens_dev/dev_total*100:.2f}%)")


Number of instances with more than 512 tokens (train): 5611/182822 (3.07%)
Instance 14:
	 Context: Urine microscopy for malignant cytology "Painless haematuria is by far the most common symptom of bladder cancer and should be regarded as indicative of a bladder carcinoma until proven otherwise." Bailey & Love 25/e p1336 (24/e p1363) Cigarette smoking is the main etiological factor and accounts ,for about 50% of bladder cancers Among the given options 'urine microscopy' is the best choice for investigating a pt. with high suspicion for bladder ca. This is not a good screening test but a positive result is highly specific. The best investigation would be cystoscopy & biopsy. Investigations for bladder cancer 1. Urinary cytology - Exfoliated cells from both normal and neoplastic urothelium can be readily identified in voided urine. Examination of cytological specimens can detect the malignant cells either at the time of initial presentation or during follow-up. Cytological examination may

Boxplot de diferentes partes de los inputs:

In [8]:
# One list of encodings per field of the training inputs.
encoded_train_context = []
encoded_train_question = []
encoded_train_optiona = []
encoded_train_optionb = []
encoded_train_optionc = []
encoded_train_optiond = []
encoded_train_options = []

# The loop variable is deliberately not called `input`, so the built-in is not
# shadowed in the kernel namespace once this cell has run.
for raw_input in tqdm(train_dataset['inputs']):
    # Split once per instance instead of once per field.
    parts = raw_input.split('</s>')
    context = parts[0].replace('<s>', '')  # drop the leading '<s>' marker
    question = parts[1]
    optiona = parts[2]
    optionb = parts[3]
    optionc = parts[4]
    optiond = parts[5]
    # The four options joined back together with the '</s>' separator.
    options = '</s>'.join((optiona, optionb, optionc, optiond))

    encoded_train_context.append(tokenizer(context))
    encoded_train_question.append(tokenizer(question))
    encoded_train_optiona.append(tokenizer(optiona))
    encoded_train_optionb.append(tokenizer(optionb))
    encoded_train_optionc.append(tokenizer(optionc))
    encoded_train_optiond.append(tokenizer(optiond))
    encoded_train_options.append(tokenizer(options))

print(len(encoded_train_context))

100%|██████████| 182822/182822 [01:18<00:00, 2336.20it/s]

182822





In [None]:
def _token_count_frame(encodings):
    """Return a one-column DataFrame holding the length of each encoding's 'input_ids'."""
    return pd.DataFrame(
        {'number_of_tokens': [len(instance['input_ids']) for instance in encodings]}
    )


def _token_count_box(frame, name):
    """Return a box trace over the frame's 'number_of_tokens' column, labelled ``name``."""
    return go.Box(x=frame['number_of_tokens'], name=name, boxpoints='all')


# Token counts for each field of the training inputs.
df_train_context = _token_count_frame(encoded_train_context)
df_train_question = _token_count_frame(encoded_train_question)
df_train_optiona = _token_count_frame(encoded_train_optiona)
df_train_optionb = _token_count_frame(encoded_train_optionb)
df_train_optionc = _token_count_frame(encoded_train_optionc)
df_train_optiond = _token_count_frame(encoded_train_optiond)
# Counts for the four options encoded together. This frame was previously
# missing, so the 'Train options' trace silently reused the option D counts.
df_train_options = _token_count_frame(encoded_train_options)

# One box trace per field.
trace1 = _token_count_box(df_train_context, 'Train context')
trace2 = _token_count_box(df_train_question, 'Train question')
trace3 = _token_count_box(df_train_optiona, 'Train option A')
trace4 = _token_count_box(df_train_optionb, 'Train option B')
trace5 = _token_count_box(df_train_optionc, 'Train option C')
trace6 = _token_count_box(df_train_optiond, 'Train option D')
trace7 = _token_count_box(df_train_options, 'Train options')

# Combine all traces in a single figure. trace7 was previously built but never
# plotted; it is now included alongside the others.
data = [trace1, trace2, trace3, trace4, trace5, trace6, trace7]
layout = go.Layout(
    title="Distribution of the number of tokens in the input",
    xaxis_title="Number of tokens",
    yaxis_title="Split",
    xaxis=dict(type='log', autorange=True),
)

fig = go.Figure(data=data, layout=layout)
fig.show()
fig.write_image("fig1.png")