In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load your dataset (assuming a CSV file)
reviews_df = pd.read_csv("/kaggle/input/letterboxd-reviews-2024/cleaned_reviews.csv")

# Split dataset into training and test sets
train_df, test_df = train_test_split(reviews_df, test_size=0.2, random_state=42)

In [2]:
reviews_df.head()

Unnamed: 0,review_text,rating
0,monkey mondays #33,0.8
1,I mean...it's no Pride and Prejudice (2005) bu...,0.6
2,Addressed my inert fear of pink and pretty dre...,0.6
3,"it was good for the most part, couldn’t really...",0.7
4,"Well, I'm late to the bespoke party, but this ...",0.8


## 0. Dataset Preparation

In [5]:
from datasets import Dataset
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Convert DataFrame to Hugging Face Dataset with 'review_text' and 'rating'
train_dataset = Dataset.from_pandas(train_df[['review_text', 'rating']])
test_dataset = Dataset.from_pandas(test_df[['review_text', 'rating']])

# Rename 'rating' to 'labels'
train_dataset = train_dataset.rename_column('rating', 'labels')
test_dataset = test_dataset.rename_column('rating', 'labels')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['review_text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

## 1. Load and Modify the Pretrained Model

You will need to modify a pretrained BERT model to fit a regression task. Here’s an example of how you can adapt BERT for regression using the Hugging Face Transformers library:

In [6]:
from transformers import BertModel
import torch
from torch import nn

class BertForRegression(nn.Module):
    def __init__(self, model_name):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # 1 for regression output

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # [1] corresponds to the pooled output
        regression_output = self.regressor(pooled_output)

        # For training, return loss
        if labels is not None:
            loss_fct = torch.nn.MSELoss()
            loss = loss_fct(regression_output.squeeze(), labels)
            return (loss, regression_output)
        
        return regression_output



In [31]:
from transformers import BertModel, BertPreTrainedModel
import torch.nn as nn
import torch.nn.functional as F


class BertForRegression(BertPreTrainedModel):  # Inherit from BertPreTrainedModel
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.init_weights()  # Initialize the weights


    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        pooled_output = outputs["pooler_output"]
        logits = self.regression_head(pooled_output)
        logits = torch.sigmoid(logits)  # Apply sigmoid to constrain output

        loss = None
        if labels is not None:
            loss_fct = torch.nn.MSELoss()
            loss = loss_fct(logits.squeeze(), labels)

        return (loss, logits) if loss is not None else logits



In [32]:
from transformers import BertModel, BertPreTrainedModel
import torch.nn as nn
import torch.nn.functional as F

class BertForRegression(nn.Module):
    def __init__(self, model_name):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # Output size 1 for regression

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        pooled_output = outputs["pooler_output"]
        logits = self.regression_head(pooled_output)
        logits = torch.sigmoid(logits)  # Apply sigmoid to constrain output

        loss = None
        if labels is not None:
            loss_fct = torch.nn.MSELoss()
            loss = loss_fct(logits.squeeze(), labels)

        return (loss, logits) if loss is not None else logits


In [37]:
from transformers import BertModel
import torch.nn as nn
import torch

class BertForRegression(nn.Module):
    def __init__(self, model_name):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # Output size 1 for regression

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        pooled_output = outputs["pooler_output"]
        logits = self.regressor(pooled_output)  # Use self.regressor
        logits = torch.sigmoid(logits)  # Apply sigmoid to constrain output to [0, 1]

        loss = None
        if labels is not None:
            loss_fct = torch.nn.MSELoss()
            loss = loss_fct(logits.squeeze(), labels)

        return (loss, logits) if loss is not None else logits


## 2. Use the Correct Loss Function

Use Mean Squared Error (MSE) for regression:

In [38]:
from transformers import Trainer
import torch

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs[0]  # for regression model, outputs[0] is the predictions
        
        # Ensure labels and logits are of same shape
        if labels is None:
            raise ValueError("Labels are not provided in inputs.")
        
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits.squeeze(), labels)
        
        return (loss, outputs) if return_outputs else loss



In [39]:
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs[0]  # Model should return logits in outputs[0]
        
        # Ensure labels and logits are of the same shape
        if labels is None:
            raise ValueError("Labels are not provided in inputs.")
        
        labels = labels.squeeze(-1)  # Match the dimension of logits
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits, labels)
        
        return (loss, outputs) if return_outputs else loss


## 4. Set Up and Train the Model

Initialize the Trainer with your regression-specific trainer and dataset:

In [42]:
from transformers import TrainingArguments

model_name = 'bert-base-uncased'  # Or your preferred BERT variant
model = BertForRegression(model_name)

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    report_to="none"  
)





In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,  # Use smaller batch sizes
    per_device_eval_batch_size=2,
    num_train_epochs=100,  # More epochs to compensate for smaller batch size
    logging_dir='./logs',
    logging_steps=1,  # Log more frequently
    evaluation_strategy="steps",  # Evaluate more often
    eval_steps=1,  # Evaluate every step
    save_steps=5,  # Save checkpoints frequently
    save_total_limit=2,  # Keep only the last 2 checkpoints
    report_to="none",
    learning_rate=2e-5,  # Lower learning rate for fine-tuning on small data
    weight_decay=0.01,
)

In [None]:
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Ensure this is a properly formatted dataset
    eval_dataset=test_dataset
)

In [41]:
trainer.train()

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch,Training Loss,Validation Loss
1,No log,0.122576
2,0.196600,0.059552
3,0.196600,0.042784


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


TrainOutput(global_step=15, training_loss=0.1582450787226359, metrics={'train_runtime': 428.7124, 'train_samples_per_second': 0.28, 'train_steps_per_second': 0.035, 'total_flos': 0.0, 'train_loss': 0.1582450787226359, 'epoch': 3.0})

In [12]:
# Evaluate the model on the test dataset
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Print evaluation results
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.012339148670434952, 'eval_runtime': 9.1825, 'eval_samples_per_second': 1.198, 'eval_steps_per_second': 0.218, 'epoch': 3.0}


In [52]:
# Set the model to evaluation mode
model.eval()

# Your own review text
review_text = "so good, best film"
review_text = "so bad, worst film"


# Tokenize the input review
inputs = tokenizer(review_text, padding="max_length", truncation=True, return_tensors="pt")

# Make the prediction
with torch.no_grad():
    outputs = model(**inputs)
    predicted_rating = outputs.squeeze().item()  # Directly access the tensor output

# Output the predicted rating
print(f"Predicted Rating: {predicted_rating:.2f}")


Predicted Rating: 0.39


In [23]:
# Set the model to evaluation mode
model.eval()

# Your own review text
review_text = "I absolutely hated it"

# Tokenize the input review
inputs = tokenizer(review_text, padding="max_length", truncation=True, return_tensors="pt")

# Make the prediction
with torch.no_grad():
    outputs = model(**inputs)
    predicted_rating = outputs.squeeze().item()
    predicted_rating = max(0.0, min(1.0, predicted_rating))  # Clamp to [0, 1] range

# Output the predicted rating
print(f"Predicted Rating: {predicted_rating:.2f}")


Predicted Rating: 0.00


In [14]:

# Set the model to evaluation mode
model.eval()

# Your own review text
review_text = "I absolutely loved this movie! The story was gripping and the acting was top-notch."

# Tokenize the input review
inputs = tokenizer(review_text, padding="max_length", truncation=True, return_tensors="pt")

# Make the prediction
with torch.no_grad():
    outputs = model(**inputs)
    predicted_rating = outputs.logits.squeeze().item()

# Output the predicted rating
print(f"Predicted Rating: {predicted_rating:.2f}")


AttributeError: 'Tensor' object has no attribute 'logits'

In [12]:
from transformers import AutoTokenizer

# Load pre-trained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['review_text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [16]:
from transformers import AutoModelForSequenceClassification

# Load pre-trained BERT model with a regression head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_dir="./logs",
    report_to="none"  
)


In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [24]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
import torch

# Assuming you're using a classification model:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Adjust num_labels to your case

# Define custom Trainer class with compute_loss method
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Compute the loss (CrossEntropyLoss for classification)
        labels = inputs.get('labels')
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Initialize the Trainer with the custom loss function
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your training dataset
    eval_dataset=test_dataset  # Your evaluation dataset
)

In [1]:
trainer.train()

NameError: name 'trainer' is not defined