In [1]:
!pip install torch scikit-learn
!pip install --upgrade pytorch_lightning
!pip install --upgrade transformers
!pip install --upgrade datasets
!pip install --upgrade torchmetrics

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.0-py3-none-any.whl (774 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: pytorch_lightning
  Attempting uninstall: pytorch_lightning
    Found existing installation: pytorch-lightning 2.0.8
    Uninstalling pytorch-lightning-2.0.8:
      Successfully uninstalled pytorch-lightning-2.0.8
Successfully installed pytorch_lightning-2.1.0
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.3 MB/s[0m eta [36m0:0

In [14]:
import pytorch_lightning as pl
import torchmetrics
import torch
import datasets
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AdamW
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

BATCH_SIZE_TRAIN = 16
BATCH_SIZE_EVAL = 16
NUM_EPOCH = 5
LEARNING_RATE = 2e-5
WARM_UP_STEPS = 400

# Load the dataset
dataset = datasets.load_dataset('google_wellformed_query')

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = tokenizer(item['content'], truncation=True, padding=False, return_tensors='pt')
        return {'input_ids': inputs['input_ids'].squeeze(), 'attention_mask': inputs['attention_mask'].squeeze(), 'labels': item['rating']}

class CustomModel(pl.LightningModule):
    def __init__(self, data_module):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base')
        self.regression_head = torch.nn.Linear(self.model.config.hidden_size, 1)
        self.data_module = data_module
        
        # Instantiate metrics
        self.train_mse = torchmetrics.MeanSquaredError()
        self.val_mse = torchmetrics.MeanSquaredError()
        self.test_mse = torchmetrics.MeanSquaredError()
        
        self.train_r2 = torchmetrics.R2Score()
        self.val_r2 = torchmetrics.R2Score()
        self.test_r2 = torchmetrics.R2Score()
        
        self.train_mae = torchmetrics.MeanAbsoluteError()
        self.val_mae = torchmetrics.MeanAbsoluteError()
        self.test_mae = torchmetrics.MeanAbsoluteError()

    def forward(self, input_ids, attention_mask, **kwargs):
        outputs = self.model.base_model(input_ids=input_ids, attention_mask=attention_mask)
        rating = self.regression_head(outputs.last_hidden_state[:, 0, :])
        rating = F.sigmoid(rating)
        return rating.squeeze()
    
    def training_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = torch.nn.functional.mse_loss(outputs, batch['labels'])
        self.log('train_loss', loss)
        self.train_mse(outputs, batch['labels'])
        self.train_r2(outputs, batch['labels'])
        self.train_mae(outputs, batch['labels'])
        return loss
    
    def validation_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = torch.nn.functional.mse_loss(outputs, batch['labels'])
        self.log('val_loss', loss)
        self.val_mse(outputs, batch['labels'])
        self.val_r2(outputs, batch['labels'])
        self.val_mae(outputs, batch['labels'])

    def test_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = torch.nn.functional.mse_loss(outputs, batch['labels'])
        self.log('test_loss', loss)
        self.test_mse(outputs, batch['labels'])
        self.test_r2(outputs, batch['labels'])
        self.test_mae(outputs, batch['labels'])
        return {'test_loss': loss} 
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=LEARNING_RATE)
        steps_per_epoch = len(self.data_module.train_dataset) // BATCH_SIZE_TRAIN
        total_steps = steps_per_epoch * NUM_EPOCH
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARM_UP_STEPS, num_training_steps=total_steps)
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def on_validation_epoch_end(self):
        self.log('val_mse', self.val_mse.compute(), prog_bar=True)
        self.log('val_r2', self.val_r2.compute(), prog_bar=True)
        self.log('val_mae', self.val_mae.compute(), prog_bar=True)
        
        # don't forget to reset at the end of epoch
        self.val_mse.reset()
        self.val_r2.reset()
        self.val_mae.reset()

    def on_test_epoch_end(self):
        self.log('test_mse', self.test_mse.compute(), prog_bar=True)
        self.log('test_r2', self.test_r2.compute(), prog_bar=True)
        self.log('test_mae', self.test_mae.compute(), prog_bar=True)
        
        # don't forget to reset at the end of epoch
        self.test_mse.reset()
        self.test_r2.reset()
        self.test_mae.reset()
    
class CustomDataModule(pl.LightningDataModule):
    def __init__(self, dataset, tokenizer):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
    
    def setup(self, stage=None):
        self.train_dataset = CustomDataset(self.dataset['train'])
        self.val_dataset = CustomDataset(self.dataset['validation'])
        self.test_dataset = CustomDataset(self.dataset['test'])
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True, collate_fn=self.data_collator)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=BATCH_SIZE_EVAL, collate_fn=self.data_collator)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=BATCH_SIZE_EVAL, collate_fn=self.data_collator)

# Initialize data module, model, and trainer
data_module = CustomDataModule(dataset, tokenizer)
model = CustomModel(data_module=data_module)
trainer = pl.Trainer(max_epochs=NUM_EPOCH,accelerator="auto")

# Train the model
trainer.fit(model, data_module)

# Evaluate the model on the test dataset
trainer.test(datamodule=data_module)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.061837393790483475,
  'test_mse': 0.061837393790483475,
  'test_r2': 0.5726782083511353,
  'test_mae': 0.183049738407135}]

In [16]:
print("Saving the model...")
model.model.save_pretrained("/kaggle/working/new")

# Save the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
tokenizer_save_path = "/kaggle/working/new"
tokenizer.save_pretrained(tokenizer_save_path)

# Save the regression head
torch.save(model.regression_head.state_dict(), f"/kaggle/working/new/regression_head.pth")

Saving the model...


('/kaggle/working/new/tokenizer_config.json',
 '/kaggle/working/new/special_tokens_map.json',
 '/kaggle/working/new/vocab.json',
 '/kaggle/working/new/merges.txt',
 '/kaggle/working/new/added_tokens.json',
 '/kaggle/working/new/tokenizer.json')

In [15]:
# Sentences
sentences = [
    "The cat and dog in the yard.",  # Incorrect - It should be "The cat and dog are in the yard."
    "she don't like apples.",  # Incorrect - It should be "She doesn't like apples."
    "Is rain sunny days sometimes?",  # Incorrect - It should be "Do sunny days sometimes have rain?"
    "She enjoys reading books and playing chess.",  # Correct
    "How many planets are there in our solar system?"  # Correct
]

# Tokenizing the sentences
inputs = tokenizer(sentences, truncation=True, padding=True, return_tensors='pt')

# Getting the model's predictions
with torch.no_grad():  # Disabling gradient calculation as we are only doing inference
    model.eval()  # Setting the model to evaluation mode
    predicted_ratings = model(
        input_ids=inputs['input_ids'], 
        attention_mask=inputs['attention_mask']
    )

# The predicted_ratings is a tensor, so we'll convert it to a list of standard Python numbers
predicted_ratings = predicted_ratings.squeeze().tolist()

# Printing the predicted ratings
for i, rating in enumerate(predicted_ratings):
    print(f'Sentence: {sentences[i]}')
    print(f'Predicted Rating: {rating}\n')

#Reference
'''Sentence: The cat and dog in the yard.
Predicted Rating: 0.20430190861225128

Sentence: she don't like apples.
Predicted Rating: 0.08289700001478195

Sentence: Is rain sunny days sometimes?
Predicted Rating: 0.20011138916015625

Sentence: She enjoys reading books and playing chess.
Predicted Rating: 0.8915354013442993

Sentence: How many planets are there in our solar system?
Predicted Rating: 0.974799394607544
'''

Sentence: The cat and dog in the yard.
Predicted Rating: 0.2043018490076065

Sentence: she don't like apples.
Predicted Rating: 0.08289707452058792

Sentence: Is rain sunny days sometimes?
Predicted Rating: 0.20011107623577118

Sentence: She enjoys reading books and playing chess.
Predicted Rating: 0.8915352821350098

Sentence: How many planets are there in our solar system?
Predicted Rating: 0.974799394607544



"Sentence: The cat and dog in the yard.\nPredicted Rating: 0.3482873737812042\n\nSentence: she don't like apples.\nPredicted Rating: 0.07787154614925385\n\nSentence: Is rain sunny days sometimes?\nPredicted Rating: 0.19854165613651276\n\nSentence: She enjoys reading books and playing chess.\nPredicted Rating: 0.9327691793441772\n\nSentence: How many planets are there in our solar system?\nPredicted Rating: 0.9746372103691101\n"