In [1]:
import os
import json
import re
import pandas as pd

with open(r"C:\Users\Dreamcore\Downloads\yelp_academic_dataset_review.json\yelp_academic_dataset_review.json", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

In [2]:
review_id_to_finetune = ['UBp0zWyH60Hmw6Fsasei7w',
 'lUUhg8ltDsUZ9h0xnwY4Dg',
 'YcLXh-3UC9y6YFAI9xxzPQ',
 'ra9fNjYUumKp_iaqv-jjDg',
 'k7HWRysS3ICXxzhbddCemw',
 '-pBDXBop_8v1dKk-BBpyzQ',
 'cC79tWLtH1U1n-oi7EozMw',
 'W67uN2nO0Tp22YMoano4JA',
 'dTr-7y6FCf-EUgvAt-XYEQ',
 'tNhXpFVKYXjJx5mipbc_7A',
 'rJ2fUIf-Q7AGx79thOUjMg',
 '4hi2jgnEhwaKfFiZ_Tz4Hw',
 'M4zS53PfEzODW1P2U9JUig',
 '3djwsoWuYzfE8VD3jlNd8Q',
 'TW4DXj74C82qhGjOgnmjyQ',
 't4vzUBVfAPfg7i1EB9Y_6Q',
 'SiazHJtfovy9nwDFfICdwA',
 '0bT9gQpQO6MX3KVv5jrtdg',
 'pIFGqL210EybhN6t0Jtzjw',
 'Sk9DcKI55f0s_253vc50Ig',
 'psxbpQH6lDnBpXli53uxIw',
 '4MsqB18bYyuf0JnDzhKEsg',
 'pPKI8l5FuX3aOHCQAlsfsg',
 'XT_3UpEhO5eJIFxevnv_Yw',
 'Bh1cIMh_mZArniNQoCwZaA',
 'YwwXtjCW2r4tlodyzUfzKA']

sample_df = df[df["review_id"].isin(review_id_to_finetune)]

In [3]:
sample_df.head(n=1)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
14,UBp0zWyH60Hmw6Fsasei7w,4Uh27DgGzsp6PqrH913giQ,otQS34_MymijPTdNBoBdCw,4.0,0,2,0,The bun makes the Sonoran Dog. It's like a snu...,2011-10-27 17:12:05


In [4]:
data = []

for sample in sample_df.iterrows():
    sample_id = sample[0]
    review = sample[1]["text"]
    sentiment = "positive"
    system_prompt = """
                    You are a classifier that determines whether a movie review expresses a positive or negative sentiment. 
                    Read the user's review carefully, paying attention to the final review verdict, and respond with exactly one word:\n\n
                    positive — if the final verdict is approving of the movie.\n
                    negative — if the final verdict is not approving of the movie.\n\n
                    Rules:\n
                    • Output only one lowercase word: 'positive' or 'negative'.\n
                    """
    conversations = [
        {"from": "human", "value": f"Review: {review}, What is the sentiment of this review?"},
        {"from": "gpt", "value": sentiment}
    ]
    data.append({
        "id": sample_id,
        "system": system_prompt,
        "conversations": conversations
    })
    

In [5]:
data

[{'id': 14,
  'system': "\n                    You are a classifier that determines whether a movie review expresses a positive or negative sentiment. \n                    Read the user's review carefully, paying attention to the final review verdict, and respond with exactly one word:\n\n\n                    positive — if the final verdict is approving of the movie.\n\n                    negative — if the final verdict is not approving of the movie.\n\n\n                    Rules:\n\n                    • Output only one lowercase word: 'positive' or 'negative'.\n\n                    ",
  'conversations': [{'from': 'human',
    'value': "Review: The bun makes the Sonoran Dog. It's like a snuggie for the pup. A first, it seems ridiculous and almost like it's going to be too much, exactly like everyone's favorite blanket with sleeves. Too much softness, too much smush, too indulgent.  Wrong. It's warm, soft, chewy, fragrant, and it succeeds where other famed Sonoran Dogs fail. \n\nT

In [6]:
import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration
from trl import SFTTrainer
from peft import LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(model_id,
                                                      torch_dtype=torch.float16)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 14.99it/s]


In [8]:
LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. {% for conversation in conversations %}{% if conversation['from'] == 'human' %}USER: {% else %}ASSISTANT: {% endif %}{% if '<image>' in conversation['value'] %}{{ conversation['value'] }}{% else %}{{ conversation['value'] }}{% endif %}{% if conversation['from'] == 'human' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer

Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [10]:
from PIL import Image

class LLavaDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        texts = []

        for example in examples:
            messages = []

            for conv in example["conversations"]:
                role = "user" if conv["from"] == "human" else "assistant"
                content_items = []

                # Only handle text, ignore any <image> markers
                for line in conv["value"].splitlines():
                    if line.strip() and line.strip() != "<image>":
                        content_items.append({"type": "text", "text": line.strip()})

                messages.append({"role": role, "content": content_items})

            # Apply chat template
            text = self.processor.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            texts.append(text)

        # Tokenize batch
        batch = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True
        )

        # Prepare labels
        labels = batch["input_ids"].clone()
        if self.processor.tokenizer.pad_token_id is not None:
            labels[labels == self.processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels

        return batch



data_collator = LLavaDataCollator(processor)

In [11]:
training_args = TrainingArguments(
    output_dir="llava-hf/llava-1.5-7b-hf-fine_tuned_on_yelp",
    report_to="tensorboard",
    learning_rate=1.4e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    logging_steps=5,
    num_train_epochs=8,
    push_to_hub=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True,
    bf16=False
)

In [12]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules="all-linear"
)

In [13]:
from huggingface_hub import login
login("")

ValueError: Token fyp not found in C:\Users\Dreamcore\.cache\huggingface\stored_tokens

In [14]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data,
    eval_dataset=data,
    peft_config=lora_config,
    data_collator=data_collator,
)

Fetching 2 files: 100%|██████████████████████████████████████████████████████████████████████████| 2/2 [00:00<?, ?it/s]
  return t.to(


In [15]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss
5,3.3781
10,3.3087
15,3.516
20,3.1229
25,3.414
30,3.5361


TrainOutput(global_step=32, training_loss=3.3575558364391327, metrics={'train_runtime': 8524.7324, 'train_samples_per_second': 0.024, 'train_steps_per_second': 0.004, 'total_flos': 4442098714398720.0, 'train_loss': 3.3575558364391327, 'entropy': 1.8576285243034363, 'num_tokens': 42968.0, 'mean_token_accuracy': 0.42250141501426697, 'epoch': 8.0})

In [14]:
trainer.model.save_pretrained("lora_finetuned_movie_review")