## 1) Read in data

In [1]:
import pandas as pd

In [2]:
reviews_df = pd.read_csv("/kaggle/input/process-scraped-reviews/processed_reviews.csv")
reviews_df.head()

Unnamed: 0,review_text,rating
0,feminism displayed at its worst.,0.4
1,Ryan Gosling... Get in my bed RN please.\nThis...,1.0
2,im sorry to all the barbies hair that i’ve cut...,1.0
3,Micheal cera.,0.7
4,"Amazing, ending part made me cry 😢",0.8


# 2) Prepare Data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
from torch import nn

## 2A) Split data into train/validation 

In [4]:
from sklearn.model_selection import train_test_split

# Load and split the DataFrame
train_df, val_df = train_test_split(reviews_df, test_size=0.2, random_state=42)

## 2B) Tokenize data

In [5]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

# Tokenize the reviews
def tokenize_reviews(reviews, tokenizer, max_length=512):
    encodings = tokenizer(reviews, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    return encodings['input_ids'], encodings['attention_mask']

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]



## 2c) Use custom Dataset object

In [6]:
def preprocess_data(df, tokenizer):
    input_ids, attention_mask = tokenize_reviews(df['review_text'].tolist(), tokenizer)
    labels = torch.tensor(df['rating'].values, dtype=torch.float).unsqueeze(-1)  # Ensure labels are of shape [batch_size, 1]
    return input_ids, attention_mask, labels

class ReviewsDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Create datasets for training and validation
input_ids_train, attention_mask_train, labels_train = preprocess_data(train_df, tokenizer)
input_ids_val, attention_mask_val, labels_val = preprocess_data(val_df, tokenizer)

train_dataset = ReviewsDataset(input_ids_train, attention_mask_train, labels_train)
val_dataset = ReviewsDataset(input_ids_val, attention_mask_val, labels_val)

# 3) Setting up the model

In [7]:
from transformers import DistilBertForSequenceClassification
import torch
from torch import nn

class CustomDistilBertForRegression(DistilBertForSequenceClassification):
    def __init__(self, original_model):
        super().__init__(config=original_model.config)
        self.distilbert = original_model.distilbert
        self.regressor = nn.Linear(original_model.config.hidden_size, 1)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.distilbert(input_ids, attention_mask=attention_mask, **kwargs)
        hidden_state = outputs.last_hidden_state
        logits = self.regressor(hidden_state[:, 0])  # Use the hidden state of the first token

        # Compute loss if labels are provided
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.squeeze(-1), labels)
            return (loss, logits)  # Return loss and logits
        return logits


In [8]:
# Load the pre-trained model and modify it
original_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

model = CustomDistilBertForRegression(original_model)
model.config.problem_type = "regression"

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# 4) Training

In [9]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy='epoch',
    remove_unused_columns=False,  # Ensure that the Trainer does not ignore columns
    report_to="none"
)

# Define a custom metric computation function if needed
def compute_metrics(p):
    predictions = p.predictions.squeeze(-1)  # Remove the last dimension
    labels = p.label_ids.squeeze(-1)
    mse = ((predictions - labels) ** 2).mean().item()
    return {'mse': mse}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



In [10]:
# Train the model
trainer.train()

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch,Training Loss,Validation Loss,Mse
1,0.0439,0.043448,0.043097
2,0.0431,0.042954,0.042339
3,0.0446,0.042833,0.041851


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


TrainOutput(global_step=11595, training_loss=0.04420998163293178, metrics={'train_runtime': 2692.6952, 'train_samples_per_second': 68.887, 'train_steps_per_second': 4.306, 'total_flos': 1.271804012916129e+16, 'train_loss': 0.04420998163293178, 'epoch': 3.0})

In [11]:
# Save the model and tokenizer
model.save_pretrained('./bert-sentiment-letterboxd-regression1')
tokenizer.save_pretrained('./bert-sentiment-letterboxd-regression1')

('./bert-sentiment-letterboxd-regression1/tokenizer_config.json',
 './bert-sentiment-letterboxd-regression1/special_tokens_map.json',
 './bert-sentiment-letterboxd-regression1/vocab.txt',
 './bert-sentiment-letterboxd-regression1/added_tokens.json')