In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Location of the spreadsheet; the following cell treats each of its columns
# as one class of field prompts.
file_path = "/content/Field data.xlsx"
df = pd.read_excel(io=file_path)

In [None]:
# Flatten the sheet into parallel lists: every non-empty cell becomes one
# training text, labelled with the position of the column it came from.
texts = []
labels = []
# Label id -> column name, so a predicted index can be mapped back to a field.
label_names = list(df.columns)
for label_number, column in enumerate(label_names):
  print(column)
  # dropna() skips empty cells (columns may have different lengths).
  for value in df[column].dropna():
    # Cells can hold numbers or dates; the tokenizer only accepts strings.
    texts.append(str(value))
    labels.append(label_number)
# Keep the original post-loop meaning of label_number: the number of classes.
label_number = len(label_names)

In [None]:
# Sanity check: both lists must have the same number of entries.
tuple(len(seq) for seq in (texts, labels))

(874, 874)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (text, label) pairs for validation; the fixed
# random_state keeps the split identical between runs.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

print("Training set length:", len(texts_train))
print("Validation set length:", len(texts_val))

Training set length: 699
Validation set length: 175


In [6]:
# Define your dataset class
class FieldNameDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of class ids, parallel to ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors whose
            leading dimension is the batch axis of size 1.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{'input_ids', 'attention_mask', 'label'}`` for example ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1, producing 0-d
        # tensors that no longer batch to (batch, seq_len).
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

# Example dataset
# texts = ["Please enter your name:", "What is your age?", "Please provide your address:", "Enter your email address:"]
# labels = [1, 0, 2, 3]  # 0-based class ids, one per text above: 1 = Name, 0 = Age, 2 = Address, 3 = Email

# Seed torch so the freshly initialised classification head and the shuffled
# batch order are repeatable across runs.
torch.manual_seed(42)

# Initialize RoBERTa tokenizer and model
num_labels = 10  # Change according to your number of classes
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# Train on the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define hyperparameters
batch_size = 4
max_length = 32
learning_rate = 2e-5
num_epochs = 5

# Split the dataset into train and validation sets
train_texts = texts_train
train_labels = labels_train
val_texts = texts_val
val_labels = labels_val

# Fail fast with a readable message if a label id does not fit the
# classification head, instead of an opaque indexing error inside the loss.
all_label_ids = list(train_labels) + list(val_labels)
if all_label_ids and not (0 <= min(all_label_ids) and max(all_label_ids) < num_labels):
    raise ValueError(
        f'label ids must lie in [0, {num_labels - 1}], '
        f'got range {min(all_label_ids)}..{max(all_label_ids)}'
    )

# Create datasets and dataloaders for training and validation
train_dataset = FieldNameDataset(train_texts, train_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = FieldNameDataset(val_texts, val_labels, tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE: loss_fn is not used by the loop below, which optimises outputs.loss
# returned by the model when labels are passed; kept so the name still exists.
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    # Training
    model.train()
    total_loss = 0
    predictions_train = []
    true_labels_train = []
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Use a batch-local name: rebinding `labels` here would overwrite the
        # notebook-level label list built in the data-preparation cell.
        batch_labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        predictions_train.extend(torch.argmax(outputs.logits, dim=1).tolist())
        true_labels_train.extend(batch_labels.tolist())

    avg_loss = total_loss / len(train_dataloader)
    train_accuracy = accuracy_score(true_labels_train, predictions_train)

    # Validation
    model.eval()
    predictions_val = []
    true_labels_val = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['label']

            # Only the logits are needed for accuracy, so no labels are passed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions_val.extend(torch.argmax(outputs.logits, dim=1).tolist())
            true_labels_val.extend(batch_labels.tolist())

    val_accuracy = accuracy_score(true_labels_val, predictions_val)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}')

# Return the model to the CPU so the following cells (saving the weights and
# running inference on CPU tensors) behave the same with or without a GPU.
model = model.cpu()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Train Loss: 1.0731, Train Accuracy: 0.7210, Val Accuracy: 0.9600
Epoch 2/5, Train Loss: 0.1732, Train Accuracy: 0.9742, Val Accuracy: 1.0000
Epoch 3/5, Train Loss: 0.0736, Train Accuracy: 0.9914, Val Accuracy: 0.9886
Epoch 4/5, Train Loss: 0.0514, Train Accuracy: 0.9928, Val Accuracy: 0.9543
Epoch 5/5, Train Loss: 0.0570, Train Accuracy: 0.9914, Val Accuracy: 0.9886


In [None]:
# Write the model's state_dict (not the model object itself) to disk.
torch.save(obj=model.state_dict(), f='roberta_model_weights.pth')

In [None]:
# Initialize a new model instance (same checkpoint and head size as training)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=10)

# Load the saved weights.
# map_location='cpu' makes the file loadable regardless of the device the
# tensors were saved from; weights_only=True restricts unpickling to tensors
# and plain containers instead of executing arbitrary pickled objects.
state_dict = torch.load('roberta_model_weights.pth', map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
# Tokenize one prompt with the same padding/truncation settings used for the
# training examples.
input_text = "Please enter your name:"
input_encoding = tokenizer(
    input_text,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
input_ids, attention_mask = (
    input_encoding[key] for key in ('input_ids', 'attention_mask')
)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(attention_mask=attention_mask, input_ids=input_ids)

# Index of the highest-scoring logit for this single input.
predicted_label_index = outputs.logits.argmax().item()

In [None]:
# Show the predicted class id. Ids were assigned in the data-preparation cell
# by column order, so this is the position of the matching column in df.columns.
predicted_label_index

4