In [1]:
!pip install transformers
!pip install peft
!pip install torch
!pip install numpy
!pip install evaluate
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,\
DataCollatorWithPadding,TrainingArguments,Trainer



In [2]:
from peft import PeftModel, PeftConfig, get_peft_model,LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
# Choosing model to fine tune: checkpoint name reused below to load the model
# and the tokenizer, and as the prefix of the training output directory.
model_choice = 'distilbert-base-uncased'

In [4]:
# define label maps
# id -> label name; used to turn a predicted class index into readable text.
# (presumably 1 corresponds to a positive 'Liked' value in the data -- verify against the TSV)
idLabel = {0: "Bad", 1: "Good"}
# label name -> id; must be the exact inverse of idLabel, so derive it instead of
# retyping it (the previous literal was a copy of idLabel, i.e. id -> name).
labelId = {label_name: class_id for class_id, label_name in idLabel.items()}


In [5]:
# Generate the classification model from the chosen checkpoint.
# The classifier head weights are newly initialised (see the warning printed below),
# so predictions are not meaningful until the model has been fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained(
    model_choice,
    num_labels=2,
    id2label=idLabel,
    label2id=labelId,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Getting data ready for classification

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Tab-separated file; the 'Review' and 'Liked' columns are read below.
data = pd.read_csv("Restaurant_Reviews.tsv", sep='\t')

# 80/20 train/validation split with a fixed seed (adjust test_size as needed).
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Rename the columns to the "text" / "label" names used by the rest of the notebook.
train_dict, val_dict = (
    {"text": split_frame['Review'], "label": split_frame['Liked']}
    for split_frame in (train_data, val_data)
)

# One Dataset per split, keyed by split name.
dataset = {
    split_name: Dataset.from_dict(split_columns)
    for split_name, split_columns in (("train", train_dict), ("validation", val_dict))
}

hf_dataset = DatasetDict(dataset)

In [7]:
# Initialize a tokenizer object from a pre-trained model
# (loaded from the same checkpoint name as the model).
# NOTE(review): add_prefix_space is normally an option of byte-level BPE tokenizers;
# it is presumably a no-op for this checkpoint's tokenizer -- confirm, and drop it if so.
tokenizer = AutoTokenizer.from_pretrained(model_choice, add_prefix_space =True)

In [8]:
# Truncate from the left, so over-long reviews keep their END rather than their start.
# Configured once here instead of being re-assigned inside the mapped function on
# every batch.
tokenizer.truncation_side = 'left'


def tokenizer_fun(examples):
    """Tokenize one batch of reviews; intended for `Dataset.map(..., batched=True)`.

    Args:
        examples: batch dict whose "text" entry holds the review strings.

    Returns:
        The tokenizer's batch encoding, truncated to at most 512 tokens per review.
        No padding is applied here, and no `return_tensors` is requested: the
        sequences in a batch have different lengths, so a tensor/array return type
        would have to hold ragged data. Padding is left to the data collator.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )


# Safety net for checkpoints that ship without a padding token: add one and grow the
# embedding matrix to match the enlarged vocabulary.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_dataset = hf_dataset.map(tokenizer_fun, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [9]:
# Collator built around the tokenizer; it is passed to the Trainer below to pad the
# examples of each batch (the tokenizer call above does not request padding).
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [11]:
# Load the accuracy metric from an evaluation module
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        p: pair `(predictions, labels)`; `predictions` holds one row of per-class
           scores per example, `labels` the reference class ids.

    Returns:
        A flat dict `{"accuracy": value}`.
    """
    class_scores, labels = p
    # Highest-scoring class per row.
    predicted_ids = np.argmax(class_scores, axis=1)
    # accuracy.compute() already returns {"accuracy": value}. Wrapping it in another
    # {"accuracy": ...} produced a nested dict in the logged metrics.
    return accuracy.compute(predictions=predicted_ids, references=labels)


In [12]:
import torch

# Pick the device: CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Place the model on the selected device.
model.to(device)

# define list of examples
# Short, clearly positive or negative reviews; the prediction loops below print the
# model's label for each of them before and after training.
text_list = [ "The food was delicious and the service was excellent. I will definitely be back!",
             "Not a fan, don't recommed.",
            "Food was as as soggy as teh ",
            "This place is a hidden gem! The food was amazing and the prices were reasonable.",
            "This restaurant is overpriced and the food was bland. I wouldn't recommend it. ",
            "I was disappointed with the food and the service was slow. I won't be returning. "]

# More ambiguous, mixed-sentiment reviews.
# NOTE(review): this list is not used by any cell visible here -- either run the
# prediction loop on it as well or remove it.
test_harder_list = [
    "The ambiance was quirky, but it somehow worked. The food was decent, but forgettable.",
    "This place is definitely an experience. I wouldn't say it's good or bad, just... unique.",
    "It met expectations. Nothing special, but nothing terrible either.",
    "This restaurant is definitely on the pricier side, but the service was impeccable.",
    "It's a shame they changed the menu. I used to love coming here, but now I'm not sure I'll be back.",
    "This place has a real 'hole-in-the-wall' vibe, but the food is surprisingly good.",
    "It was a fun night out with friends, but the food wasn't the main attraction.",
    "I can't decide if I loved it or hated it. It was definitely something different.",
    "This place gets a lot of hype, but I honestly don't get it. Maybe I just ordered the wrong thing.",
    "It's not fancy, but it's reliable. I know what I'm going to get every time I come here."
]
print("Untrained model predictions:")
print("----------------------------")
# Inference only: switch off dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    for text in text_list:
        # tokenize text (a single example, so the batch dimension is 1)
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)
        # compute logits
        logits = model(inputs).logits
        # convert logits to label: argmax over the class dimension, not over the
        # flattened tensor
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + idLabel[predictions.item()])

Untrained model predictions:
----------------------------
The food was delicious and the service was excellent. I will definitely be back! - Good
Not a fan, don't recommed. - Good
Food was as as soggy as teh  - Good
This place is a hidden gem! The food was amazing and the prices were reasonable. - Good
This restaurant is overpriced and the food was bland. I wouldn't recommend it.  - Good
I was disappointed with the food and the service was slow. I won't be returning.  - Good


In [13]:
# LoRA adapter configuration for the sequence-classification task.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    # Only modules named 'q_lin' receive adapters.
    target_modules=['q_lin'],
    # LoRA rank, scaling factor and adapter dropout.
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
)

In [14]:
# Wrap the base model with the LoRA adapters described by peft_config.
# Guarded so the cell is idempotent: re-running it must not wrap an already wrapped
# PeftModel a second time.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)
# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [15]:
# hyperparameters (all consumed by TrainingArguments in the next cell)
# learning rate
lr = 1e-3
# per-device batch size, used for both training and evaluation
batch_size = 4
# number of training epochs
num_epochs = 10

In [16]:
# define training arguments
training_args = TrainingArguments(
    # Checkpoints are written to a directory named after the base checkpoint.
    output_dir=f"{model_choice}-lora-text-classification",
    # Optimisation settings.
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save once per epoch; reload the best checkpoint when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [17]:
# Create the trainer object from the model, its training arguments and the data.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    # Dynamically pads the examples in each batch to equal length.
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Train the model; the returned training summary is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.235645,{'accuracy': 0.91}
2,No log,0.384195,{'accuracy': 0.895}
3,0.258500,0.39868,{'accuracy': 0.915}
4,0.258500,0.531945,{'accuracy': 0.91}
5,0.051500,0.476825,{'accuracy': 0.92}
6,0.051500,0.477037,{'accuracy': 0.935}
7,0.051500,0.585966,{'accuracy': 0.92}
8,0.008800,0.555671,{'accuracy': 0.93}
9,0.008800,0.541229,{'accuracy': 0.935}
10,0.002600,0.51968,{'accuracy': 0.935}


Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-400 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-600 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-800 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-lora-text-classificati

TrainOutput(global_step=2000, training_loss=0.08033175712823867, metrics={'train_runtime': 48.2187, 'train_samples_per_second': 165.911, 'train_steps_per_second': 41.478, 'total_flos': 50637174998208.0, 'train_loss': 0.08033175712823867, 'epoch': 10.0})

In [18]:
# Make sure the fine-tuned model sits on the selected device and is in inference
# mode, so dropout cannot make the printed labels vary from run to run.
model.to(device)
model.eval()
print("Trained model predictions:")
print("--------------------------")
# No gradients are needed for prediction.
with torch.no_grad():
    for text in text_list:
        # Token ids for one example, moved to the same device as the model
        # (CUDA when available, otherwise CPU).
        inputs = tokenizer.encode(text, return_tensors="pt").to(device)

        logits = model(inputs).logits
        # Index of the highest logit along the class dimension.
        predictions = torch.max(logits, 1).indices

        print(text + " - " + idLabel[predictions.tolist()[0]])

Trained model predictions:
--------------------------
The food was delicious and the service was excellent. I will definitely be back! - Good
Not a fan, don't recommed. - Bad
Food was as as soggy as teh  - Bad
This place is a hidden gem! The food was amazing and the prices were reasonable. - Good
This restaurant is overpriced and the food was bland. I wouldn't recommend it.  - Bad
I was disappointed with the food and the service was slow. I won't be returning.  - Bad
