In [14]:
import pandas as pd
import torch

from transformers import LayoutLMTokenizerFast, LayoutLMForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# Load the dataset
df = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')

In [15]:
df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16412 entries, 0 to 16411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    16412 non-null  object
 1   answer      16407 non-null  object
 2   source      16412 non-null  object
 3   focus_area  16398 non-null  object
dtypes: object(4)
memory usage: 513.0+ KB


In [19]:
df.isna().sum()

question       0
answer         5
source         0
focus_area    14
dtype: int64

In [20]:
# Drop rows with NaN values in the 'answer' column
df = df.dropna(subset=['answer'])
df.isna().sum()

question      0
answer        0
source        0
focus_area    0
dtype: int64

## Tokenization and Formatting
Use a tokenizer from the Hugging Face Transformers library to tokenize the text.

In [21]:
# Initialize the tokenizer
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")

# Create dummy labels for the example
df['labels'] = df['answer'].apply(lambda x: [0] * len(tokenizer.tokenize(x)))

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

def tokenize_and_format(examples):
    tokenized_inputs = tokenizer(examples['answer'], padding='max_length', truncation=True, 
                                 max_length=512, return_offsets_mapping=True)
    labels = []
    for label in examples['labels']:
        word_ids = tokenized_inputs.word_ids(batch_index=0)
        # Adjust the length of the labels to match the word_ids
        label_ids = [-100 if word_id is None else label[min(word_id, len(label) - 1)] for word_id in word_ids]
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    tokenized_inputs.pop("offset_mapping")  # Remove offset_mapping as it's not needed
    return tokenized_inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_format, batched=True, remove_columns=dataset.column_names)

tokenized_dataset.head()

0    [input_ids, token_type_ids, attention_mask, of...
1    [input_ids, token_type_ids, attention_mask, of...
2    [input_ids, token_type_ids, attention_mask, of...
3    [input_ids, token_type_ids, attention_mask, of...
4    [input_ids, token_type_ids, attention_mask, of...
dtype: object


## Model Training
Prepare the model and set up the training loop.

In [25]:
# Initialize the model
model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased", num_labels=2)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (1191 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
1,0.0,2e-06
2,0.0,1e-06
3,0.0,1e-06




TrainOutput(global_step=6153, training_loss=0.0002219991392513361, metrics={'train_runtime': 4498.2599, 'train_samples_per_second': 10.942, 'train_steps_per_second': 1.368, 'total_flos': 1.295058925587456e+16, 'train_loss': 0.0002219991392513361, 'epoch': 3.0})

## Evaluation
Evaluate the model on the test set.

In [26]:
# Evaluate the model
results = trainer.evaluate()
print(results)



{'eval_loss': 1.080354650184745e-06, 'eval_runtime': 377.3254, 'eval_samples_per_second': 43.482, 'eval_steps_per_second': 5.436, 'epoch': 3.0}


## Make predictions on new data.

In [28]:
# Predict on new data
new_text = ["This is a new sentence for token classification."]
tokenized_inputs = tokenizer(new_text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")

# Ensure tensors are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenized_inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model(**tokenized_inputs)
    predictions = outputs.logits.argmax(dim=-1)
    print(predictions)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0