In [13]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
# Load data
# Column names are supplied explicitly, so the first CSV line has to be parsed
# as data: with the default header=0 it is consumed as a header and then
# overwritten by the rename, silently dropping one review.
reviews_raw = pd.read_csv("/content/yelp_reviews.csv", header=None,
                          names=['label', 'text'])

# If the file does start with a real header line, its "label" cell is not
# numeric; drop that single row so both file layouts load the same way.
label_codes = pd.to_numeric(reviews_raw['label'], errors='coerce')
if len(label_codes) and pd.isna(label_codes.iloc[0]):
  reviews_raw, label_codes = reviews_raw.iloc[1:], label_codes.iloc[1:]

# Map labels to 0/1
# 0 = negative, 1 = positive. A fresh frame is built (instead of assigning into
# a slice of another frame) so pandas does not raise SettingWithCopyWarning.
df = (
    reviews_raw[['text']]
    .assign(label=label_codes.map({1: 0, 2: 1}))
    .head(10000)
    .reset_index(drop=True)
)

# Series.map leaves NaN for any value missing from the dict; fail here rather
# than passing NaN labels on to training.
assert df['label'].notna().all(), "unexpected label values (expected only 1 or 2)"
df['label'] = df['label'].astype('int64')

print(df.shape)
df.sample(10)


(10000, 2)


Unnamed: 0,text,label
9903,Do not waste your time at this hotel. Don't be...,0
9213,Great selection of beers get this place it's 2...,0
9168,"Of course we tried it, for the name associated...",0
503,Had my first visit to the Saucer a few weeks a...,1
810,Stumbled on this place while sight seeing on a...,1
131,So fun to have a place like this in the Strip....,1
7759,Ive stayed here before and thought at the time...,0
5828,I agree with the other review about the decor....,0
6208,"I really like this place, but the prices are a...",1
4,"Picture Billy Joel's \""Piano Man\"" DOUBLED mix...",1


In [5]:
class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one text per lookup.

  Args:
    texts: indexable collection of raw texts; each entry is passed through str().
    labels: indexable collection of labels, aligned with `texts` by position.
    tokenizer: callable invoked as tokenizer(text, truncation=True,
      padding="max_length", max_length=max_len); its result must support
      lookup of 'input_ids' and 'attention_mask'.
    max_len: value handed to the tokenizer as `max_length` (default 512).
  """

  def __init__(self, texts, labels, tokenizer, max_len=512):
    self.texts, self.labels = texts, labels
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of samples, taken from the number of texts."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Encode the sample at `idx`.

    Returns:
      dict with 'input_ids' and 'attention_mask' exactly as produced by the
      tokenizer, plus 'labels' holding the label wrapped in a torch tensor.
    """
    sample_text = str(self.texts[idx])
    target = torch.tensor(self.labels[idx])

    # Every sample is truncated/padded to the same fixed length.
    tokenizer_kwargs = dict(truncation=True, padding="max_length",
                            max_length=self.max_len)
    encoded = self.tokenizer(sample_text, **tokenizer_kwargs)

    sample = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
    sample['labels'] = target
    return sample

In [7]:
checkpoint = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Store readable class names in the model config so the saved model reports
# "negative"/"positive" rather than generic label names.
# Ids follow the notebook's label mapping: 0 = negative, 1 = positive.
id2label = {0: "negative", 1: "positive"}
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Pull the review texts and their labels out of the frame as plain Python
# lists, then wrap them in the dataset that tokenizes on item access.
X, y = (df[column].tolist() for column in ('text', 'label'))

dataset = CustomDataset(texts=X, labels=y, tokenizer=tokenizer)

In [9]:
# Sanity check: a single sample should expose the three fields built in
# CustomDataset.__getitem__.
dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [10]:
# Split the raw texts/labels and wrap each half in its own CustomDataset.
# Passing the Dataset object itself to train_test_split makes scikit-learn
# index it element by element, which tokenizes and pads every review up front
# and leaves plain Python lists of dicts instead of Dataset objects.
# The shuffle depends only on the sample count and random_state, so the same
# rows end up in train/test as with the previous call.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
train_dataset = CustomDataset(X_train, y_train, tokenizer)
test_dataset = CustomDataset(X_test, y_test, tokenizer)

In [12]:
def compute_metrics(example):
  """Score one evaluation pass.

  Args:
    example: object exposing `label_ids` (reference labels) and `predictions`
      (per-class scores; the predicted class is the argmax over the last axis).

  Returns:
    dict with 'accuracy' and weighted-average 'f1'.
  """
  y_true = example.label_ids
  y_pred = example.predictions.argmax(-1)

  return {
      'accuracy': accuracy_score(y_true, y_pred),
      "f1": f1_score(y_true, y_pred, average="weighted"),
  }

In [15]:
batch_size = 16
model_name = "distilbert_finetuned_sentiment"

# One training epoch with evaluation at the end of each epoch; the same batch
# size is used for the training and evaluation loops.
# NOTE(review): report_to is not set, and the recorded run stopped at a wandb
# login prompt inside trainer.train(); consider report_to="none" if the
# notebook has to run unattended.
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch',
)

In [16]:
# Wire the model, both data splits and the metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases, and the truncated warning recorded
# under this cell appears to be that deprecation notice.
# NOTE(review): `processing_class` needs transformers >= 4.46; on an older
# release keep `tokenizer=tokenizer` instead.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

  trainer = Trainer(model=model,


In [17]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maachu8966[0m ([33maachu8966-kannur-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2308,0.152133,0.9485,0.948538


TrainOutput(global_step=500, training_loss=0.230830078125, metrics={'train_runtime': 445.3298, 'train_samples_per_second': 17.964, 'train_steps_per_second': 1.123, 'total_flos': 1059739189248000.0, 'train_loss': 0.230830078125, 'epoch': 1.0})

In [18]:
# Evaluate with the Trainer (its eval_dataset was set to test_dataset) and
# print the resulting metrics dict.
eval_result = trainer.evaluate()
print(eval_result)

{'eval_loss': 0.1521330177783966, 'eval_accuracy': 0.9485, 'eval_f1': 0.9485377645371555, 'eval_runtime': 31.1267, 'eval_samples_per_second': 64.254, 'eval_steps_per_second': 4.016, 'epoch': 1.0}


In [20]:
# Save the fine-tuned model and the tokenizer files side by side in one
# directory. `model_name` is the directory name defined alongside the training
# configuration; it is reused here instead of repeating the literal so the
# name is maintained in a single place.
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

('distilbert_finetuned_sentiment/tokenizer_config.json',
 'distilbert_finetuned_sentiment/special_tokens_map.json',
 'distilbert_finetuned_sentiment/vocab.txt',
 'distilbert_finetuned_sentiment/added_tokens.json',
 'distilbert_finetuned_sentiment/tokenizer.json')

In [22]:
import shutil
from google.colab import files

# Zip the saved model directory. make_archive appends ".zip" to the base name,
# so the archive is written as "<model_name>.zip"; `model_name` is the same
# directory name the model and tokenizer were saved under.
shutil.make_archive(model_name, 'zip', model_name)

# Download the zip file
files.download(f"{model_name}.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>