## Description of the Code


### Key Components:

#### - Data Loading and Preprocessing

Loads a Yelp reviews dataset from a CSV file.
Selects the first 1,000 reviews to reduce resource usage.
Adjusts the review ratings to be zero-indexed and renames the 'stars' column to 'label'.

#### - Dataset Splitting

#### - Model Fine-Tuning

#### - Defines functions to fine-tune BERT and RoBERTa models

#### - Comparison with SOTA


In [4]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import accuracy_score

# Read CSV file using Pandas library
df = pd.read_csv('yelp.csv')

# Let's select the first 1,000 reviews for this example to reduce resource usage
df = df.head(1000)

# Ensure that the DataFrame has the expected columns
assert 'text' in df.columns and 'stars' in df.columns, "CSV file must contain 'text' and 'stars' columns"

# Let's Adjust labels to be in the range 0 to 4
df['stars'] = df['stars'] - 1

# Rename 'stars' column to 'label' to match the expected column name
df = df.rename(columns={"stars": "label"})

# Let's split the data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Converting the DataFrame to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Function to preprocess the dataset
def preprocess_function(examples, tokenizer):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

## Fine-Tuning step:

# Fine-Tune BERT
def fine_tune_bert(train_dataset, test_dataset):
    model_name = "prajjwal1/bert-mini"  # Use a smaller BERT model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)  # Yelp reviews are rated from 1 to 5 stars

    train_dataset = train_dataset.map(lambda e: preprocess_function(e, tokenizer), batched=True)
    test_dataset = test_dataset.map(lambda e: preprocess_function(e, tokenizer), batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    training_args = TrainingArguments(
        output_dir="./results/bert",
        evaluation_strategy="epoch",
        save_strategy="epoch",  # Set save_strategy to 'epoch'
        logging_dir="./logs/bert",
        learning_rate=2e-5,
        per_device_train_batch_size=4,  # Reduce the batch size
        per_device_eval_batch_size=4,  # Reduce the batch size
        num_train_epochs=2,  # Reduce the number of epochs
        weight_decay=0.01,
        save_total_limit=1,
        load_best_model_at_end=True,
        gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = torch.argmax(torch.tensor(logits), dim=-1)
        return {'accuracy': accuracy_score(labels, predictions)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    results = trainer.evaluate()
    return results['eval_accuracy']

# Fine-Tune RoBERTa (Custom Model)
def fine_tune_roberta(train_dataset, test_dataset):
    model_name = "distilroberta-base"  # Use a smaller RoBERTa model
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=5)  # Yelp reviews are rated from 1 to 5 stars

    train_dataset = train_dataset.map(lambda e: preprocess_function(e, tokenizer), batched=True)
    test_dataset = test_dataset.map(lambda e: preprocess_function(e, tokenizer), batched=True)

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    training_args = TrainingArguments(
        output_dir="./results/roberta",
        evaluation_strategy="epoch",
        save_strategy="epoch",  # Set save_strategy to 'epoch'
        logging_dir="./logs/roberta",
        learning_rate=2e-5,
        per_device_train_batch_size=4,  # Reduce the batch size
        per_device_eval_batch_size=4,  # Reduce the batch size
        num_train_epochs=2,  # Reduce the number of epochs
        weight_decay=0.01,
        save_total_limit=1,
        load_best_model_at_end=True,
        gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = torch.argmax(torch.tensor(logits), dim=-1)
        return {'accuracy': accuracy_score(labels, predictions)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    results = trainer.evaluate()
    return results['eval_accuracy']

# Let's compare with SOTA Results
def compare_with_sota():
    sota_accuracy = 0.85  # Example SOTA accuracy value for sentiment analysis

    bert_accuracy = fine_tune_bert(train_dataset, test_dataset)
    roberta_accuracy = fine_tune_roberta(train_dataset, test_dataset)

    print(f"Fine-tuned BERT Accuracy: {bert_accuracy:.4f}")
    print(f"Fine-tuned RoBERTa Accuracy: {roberta_accuracy:.4f}")
    print(f"State-of-the-Art Accuracy: {sota_accuracy:.4f}")

    if roberta_accuracy > sota_accuracy:
        print("Our RoBERTa model outperforms SOTA methods!")
    else:
        print("Our models are below SOTA performance. Further improvements are needed.")

if __name__ == "__main__":
    compare_with_sota()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-mini and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.455758,0.35
2,No log,1.446912,0.42




vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.155777,0.525
2,No log,1.053766,0.54


Fine-tuned BERT Accuracy: 0.4200
Fine-tuned RoBERTa Accuracy: 0.5400
State-of-the-Art Accuracy: 0.8500
Our models are below SOTA performance. Further improvements are needed.
