# 1) Read in data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the processed reviews (columns seen in the preview: review_text, rating)
reviews_df = pd.read_csv("/kaggle/input/process-scraped-reviews/processed_reviews.csv")
# Convert ratings to integer labels (0 to 9)
# presumably ratings are stored on a 0.1-1.0 scale in steps of 0.1 -- TODO confirm.
# Round before casting: astype(int) truncates toward zero, so a float product
# such as 2.9999999999999996 would otherwise land one class too low.
reviews_df['rating'] = (reviews_df['rating'] * 10 - 1).round().astype(int)
print(reviews_df.shape)
reviews_df.head()

(77289, 2)


Unnamed: 0,review_text,rating
0,feminism displayed at its worst.,3
1,Ryan Gosling... Get in my bed RN please.\nThis...,9
2,im sorry to all the barbies hair that i’ve cut...,9
3,Micheal cera.,6
4,"Amazing, ending part made me cry 😢",7


In [3]:
# doing a test run with smaller dataset
#reviews_df = reviews_df.iloc[:3000]
#reviews_df.shape

# 2) Prepare Data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch
from torch import nn

## 2A) Split data into train/validation/test

In [5]:
from sklearn.model_selection import train_test_split

# Carve off 40% of the reviews as a holdout pool, stratified by rating;
# the remaining 60% becomes the training set.
train_df, temp_df = train_test_split(
    reviews_df,
    stratify=reviews_df['rating'],
    test_size=0.4,
    random_state=42,
)

# Halve the holdout pool (again stratified by rating) into validation and test sets
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df['rating'],
    test_size=0.5,
    random_state=42,
)

## 2B) Tokenize data

In [6]:
# Tokenizer belonging to the same SST-2 fine-tuned DistilBERT checkpoint
# that the classification model is later loaded from
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english'
)

# Batch-tokenize review texts
def tokenize_reviews(reviews, tokenizer, max_length=512):
    """
    Tokenize `reviews` in a single call and return the ids and attention mask.

    Parameters:
    - reviews: text input, forwarded unchanged to `tokenizer`
    - tokenizer: callable invoked with truncation, padding and 'pt' tensor output enabled
    - max_length (int): forwarded as the tokenizer's `max_length` (default 512)

    Returns:
    - tuple: the 'input_ids' and 'attention_mask' entries of the tokenizer output
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    batch = tokenizer(reviews, **tokenizer_options)
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    return input_ids, attention_mask

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]



## 2C) Use custom Dataset object

In [7]:
from torch.utils.data import Dataset, DataLoader

class ReviewsDataset(Dataset):
    """
    Map-style dataset that tokenizes one review per lookup.

    Each item is a dict with 'input_ids', 'attention_mask' and 'labels',
    built from the 'review_text' and 'rating' columns of the wrapped dataframe.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        """
        Parameters:
        - dataframe: table with 'review_text' and 'rating' columns; rows are read by position
        - tokenizer: callable applied to a single review text in __getitem__
        - max_length (int): forwarded to the tokenizer; texts are truncated/padded to this length
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Tokenize the review at position `idx`.

        Returns:
        - dict: 'input_ids' and 'attention_mask' with the leading batch axis removed,
          and 'labels' as a scalar long tensor holding the row's rating.
        """
        # Positional lookup, so the dataframe's index labels do not matter
        review = self.data.iloc[idx]
        encoding = self.tokenizer(
            review['review_text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # assumes the tokenizer returns tensors with a leading batch axis of size 1
        # for a single text -- squeeze(0) drops only that axis. A bare squeeze()
        # would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(review['rating'], dtype=torch.long)
        }

# Token budget per review: passed to ReviewsDataset, which truncates/pads to this length
max_length = 128

# Wrap each stratified split in a ReviewsDataset (train, validation, test -- in that order)
train_dataset, val_dataset, test_dataset = (
    ReviewsDataset(split_df, tokenizer, max_length)
    for split_df in (train_df, val_df, test_df)
)

# Data loaders (if not using Trainer); only the training loader shuffles
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=16,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
)

# 3) Setting up the model

In [8]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch import nn

In [9]:
# Start from the SST-2 sentiment checkpoint and request a 10-way classifier
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=10,
    # the checkpoint's classifier is 2-way, so its shape will not match the 10-way head
    ignore_mismatched_sizes=True,
)

print(model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# 4) Training

## 4a) Calculate weights and custom loss function 

In [10]:
import torch
from sklearn.utils.class_weight import compute_class_weight

# Balanced class weights over the rating labels that occur in the data
rating_classes = reviews_df['rating'].unique()
class_weights = compute_class_weight('balanced', classes=rating_classes, y=reviews_df['rating'])
class_weights_dict = dict(zip(rating_classes, class_weights))

# Lay the weights out in ascending label order so the tensor can be indexed by class id
# (assumes every label 0..9 occurs in the data -- TODO confirm)
ordered_weights = [class_weights_dict[label] for label in sorted(rating_classes)]
class_weights_tensor = torch.tensor(ordered_weights, dtype=torch.float)

print("Class Weights:", *sorted(class_weights_dict.items()), sep='\n')

Class Weights:
(0, 10.734583333333333)
(1, 4.973552123552124)
(2, 5.2937671232876715)
(3, 2.2279907754396078)
(4, 1.814721765672693)
(5, 0.833035136882949)
(6, 0.7050629447181171)
(7, 0.4290972684876749)
(8, 0.7172993039443155)
(9, 0.46005357142857145)


## 4b) Modify the Loss Function to Include Class Weights

In [11]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn

# Module-level weighted loss. NOTE: CustomTrainer.compute_loss builds its own
# criterion on the logits' device and does not use this object.
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Trainer subclass that swaps in class-weighted cross-entropy
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by `class_weights_tensor`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the class-weighted cross-entropy loss for one batch.

        Parameters:
        - model: model called as `model(**inputs)`; its output must expose "logits"
        - inputs: batch mapping that includes "labels"
        - return_outputs (bool): when True, return `(loss, outputs)` instead of the loss alone
        - num_items_in_batch: accepted for compatibility with newer Trainer versions,
          which pass it to compute_loss; not used here

        Returns:
        - the loss tensor, or `(loss, outputs)` when `return_outputs` is True
        """
        labels = inputs.get("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Move the class weights tensor to the same device as the model's output
        class_weights_device = class_weights_tensor.to(logits.device)

        # Compute weighted loss (local name chosen so it does not shadow the module-level loss_fn)
        weighted_ce = nn.CrossEntropyLoss(weight=class_weights_device)
        loss = weighted_ce(logits, labels)

        return (loss, outputs) if return_outputs else loss

## 4c) Prepare training args and initialize custom trainer

In [12]:
from transformers import Trainer, TrainingArguments

# Training configuration
training_args = TrainingArguments(
    # output locations
    output_dir="./results",
    logging_dir="./logs",
    # schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimisation
    warmup_steps=500,
    weight_decay=0.01,
    # logging / evaluation cadence; no external reporting integration
    logging_steps=10,
    evaluation_strategy="epoch",
    report_to="none",
)

# Wire the model, arguments and train/validation datasets into the class-weighted trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



## 4d) Run training

In [13]:
# Train the model: runs the trainer configured above (train_dataset for training,
# val_dataset for the per-epoch evaluation requested in training_args)
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.0481,1.936311
2,1.8608,1.885244
3,1.2713,2.141125


TrainOutput(global_step=8697, training_loss=1.722584720643153, metrics={'train_runtime': 1348.4278, 'train_samples_per_second': 103.171, 'train_steps_per_second': 6.45, 'total_flos': 4607840308953600.0, 'train_loss': 1.722584720643153, 'epoch': 3.0})

## 4e) Evaluate on test set

In [14]:
# Score the held-out test split with the trained model and show the returned metrics
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)

{'eval_loss': 2.189265251159668, 'eval_runtime': 56.3622, 'eval_samples_per_second': 274.262, 'eval_steps_per_second': 17.157, 'epoch': 3.0}


# 5) Save model

In [15]:
# Persist the model weights and the tokenizer files side by side in one directory
model_path = './bert-sentiment-letterboxd-classification'
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)

('./bert-sentiment-letterboxd-classification/tokenizer_config.json',
 './bert-sentiment-letterboxd-classification/special_tokens_map.json',
 './bert-sentiment-letterboxd-classification/vocab.txt',
 './bert-sentiment-letterboxd-classification/added_tokens.json')

# 6) Do some quick predictions

In [16]:
from transformers import pipeline

# Pipeline device index: 0 (first CUDA GPU) when available, otherwise -1 (CPU)
device = 0 if torch.cuda.is_available() else -1

# Load the fine-tuned model
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer from the same directory it was saved to alongside the model,
# rather than re-fetching the base checkpoint from the hub: keeps the saved
# artifacts self-contained and makes this cell work without network access.
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# Create a prediction pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)



In [17]:
def convert_label_to_stars(label):
    """
    Translate a model label into its star-rating text.

    Class index n stands for (n + 1) half-stars, so 'LABEL_0' is '½' and
    'LABEL_9' is '★★★★★'.

    Parameters:
    - label (str): Model label (e.g., 'LABEL_0' to 'LABEL_9')

    Returns:
    - str: Star rating in text format (e.g., '★', '★★★½'), or 'Unknown'
      for any label outside 'LABEL_0'..'LABEL_9'
    """
    stars_by_label = {}
    for class_idx in range(10):
        half_stars = class_idx + 1
        whole, has_half = divmod(half_stars, 2)
        stars_by_label[f'LABEL_{class_idx}'] = '★' * whole + '½' * has_half
    return stars_by_label.get(label, 'Unknown')


def normalize_rating(star_rating):
    """
    Turn star-rating text into a numeric score.

    Each '★' counts 1 and each '½' counts 0.5; every other character is ignored.

    Parameters:
    - star_rating (str): Star rating in text format (e.g., '★★★★½')

    Returns:
    - int or float: Sum of the symbol values (an int such as 5 when no '½'
      is present, 0 when no rating symbol is found).
    """
    symbol_values = {'★': 1, '½': 0.5}
    score = 0
    for symbol in star_rating:
        if symbol not in symbol_values:
            continue
        score += symbol_values[symbol]
    return score

def predict_rating_from_review(review_text, max_length=128):
    """
    Predict a star rating for one review, print it, and return the details.

    Parameters:
    - review_text (str): Review text passed to the `classifier` pipeline
    - max_length (int): Token limit applied when truncating the review;
      defaults to 128, the length the training datasets were built with

    Returns:
    - dict: 'star_rating' (numeric score), 'star_rating_text' (star symbols)
      and 'confidence_score' (score of the top prediction)
    """
    # Truncate explicitly: without it, a review longer than the model's
    # position limit makes the pipeline call fail instead of being shortened.
    prediction = classifier(review_text, truncation=True, max_length=max_length)

    # Convert the prediction to star rating and get confidence score
    predicted_label = prediction[0]['label']
    confidence_score = prediction[0]['score']
    star_rating_text = convert_label_to_stars(predicted_label)
    star_rating = normalize_rating(star_rating_text)

    # Output the star rating and confidence score
    print(f"Predicted Rating: {star_rating_text} ({star_rating} stars)")
    print(f"Confidence Score: {confidence_score:.2f}")

    return {'star_rating': star_rating, 'star_rating_text': star_rating_text,  'confidence_score': confidence_score }
    

In [18]:
# Example prediction: strongly positive review
predict_rating_from_review("Best movie ever!")

Predicted Rating: ★★★★★ (5 stars)
Confidence Score: 0.91


{'star_rating': 5,
 'star_rating_text': '★★★★★',
 'confidence_score': 0.9135494232177734}

In [19]:
# Example prediction: lukewarm review
predict_rating_from_review("Okay movie, not my favourite!")

Predicted Rating: ★★½ (2.5 stars)
Confidence Score: 0.57


{'star_rating': 2.5,
 'star_rating_text': '★★½',
 'confidence_score': 0.5710189342498779}

In [20]:
# Example prediction: strongly negative review
predict_rating_from_review("Worst movie ever!")

Predicted Rating: ½ (0.5 stars)
Confidence Score: 0.99


{'star_rating': 0.5,
 'star_rating_text': '½',
 'confidence_score': 0.9918176531791687}