# LLM price prediction for amazon review - second part

In the second part of this project, we will fintuned an opensource LLM model to predict the price of an item, upload it to hugging face and then test the performance of our refined model.

In [55]:
import os
import re
import math
from tqdm import tqdm
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
import wandb
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt
from trl import DataCollatorForCompletionOnlyLM
from dotenv import load_dotenv
import logging

In [14]:
QWEN_2_5 = "Qwen/Qwen2.5-7B"
GEMMA_2 = "google/gemma-2-9b"
PHI_3 = "microsoft/Phi-3-medium-4k-instruct"
LLAMA_3_1 = "meta-llama/Meta-Llama-3.1-8B" #"meta-llama/Llama-3.2-3B"

BASE_MODEL = "meta-llama/Llama-3.2-3B"
PROJECT_NAME = "price-prediction"
HF_USER = "Cedric07"

DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182
RUN_NAME =  f"{datetime.now():%Y-%m-%d_%H.%M.%S}"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_RUN_NAME}"

LORA_R = 32
LORA_ALPHA = 64
TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "o_proj"]
LORA_DROPOUT = 0.1
QUANT_4_BIT = True

EPOCHS = 1
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 1e-4
LR_SCHEDULER_TYPE = 'cosine'
WARMUP_RATIO = 0.03
OPTIMIZER = "paged_adamw_32bit"

STEPS = 50
SAVE_STEPS = 2000
LOG_TO_WANDB = True
#DATASET_NAME = "ed-donner/pricer-data"
%matplotlib inline

In [15]:
HUB_MODEL_NAME

'Cedric07/price-prediction-2025-05-20_23.46.23'

In [56]:
load_dotenv(override=True)
load_dotenv('.env.anthropic', override=True)
load_dotenv('../.env.anthropic', override=True)
load_dotenv('../.env.shugging', override=True)
load_dotenv('../.env.wandb', override=True)
load_dotenv('../.env.wandbsilent', override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
hugging_face_api_key = os.getenv('SHUGGING_FACE_API_KEY')
wandb_api_key = os.getenv('WANDB_API_KEY')
wandb_silent =  os.getenv('WANDB_SILENT') 

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
else:
    print("Anthropic API Key not set")
    
if hugging_face_api_key:
    print(f"Huggin Face API Key exists and begins {hugging_face_api_key[:7]}")
else:
    print("Hugging Face API Key not set") 
if wandb_api_key:
    print(f"WANDB API Key exists and begins {hugging_face_api_key[:7]}")
else:
    print("WANDB API Key not set") 

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Huggin Face API Key exists and begins hf_XgXV
WANDB API Key exists and begins hf_XgXV


In [46]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['ANTHROPIC_API_KEY'] =  os.getenv('ANTHROPIC_API_KEY')
os.environ['SHUGGING_FACE_API_KEY'] =  os.getenv('SHUGGING_FACE_API_KEY')
os.environ['WANDB_API_KEY'] =  os.getenv('WANDB_API_KEY') 
os.environ['WANDB_SILENT'] =  os.getenv('WANDB_SILENT') 

In [47]:
hf_token = os.environ['SHUGGING_FACE_API_KEY']
login(hf_token, add_to_git_credential=True)

In [57]:
logger = logging.getLogger("wandb")
logger.setLevel(logging.WARNING)

In [58]:
os.environ["WANDB_SILENT"] = "True"
wandb_api_key = os.environ['WANDB_API_KEY']
wandb.login(verify=True)

True

In [59]:
os.environ["WANDB_PROJECT"] = PROJECT_NAME
os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_TO_WANDB else "end"
os.environ["WANDB_WATCH"] = "gradients"

In [21]:
dataset = load_dataset("ed-donner/pricer-data")
train = dataset['train']
test = dataset['test']

In [22]:
if QUANT_4_BIT:
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
  )
else:
  quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.bfloat16
  )

In [23]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:

response_template = "Price is $"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [None]:
lora_parameters = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
)


In [None]:
train_parameters = SFTConfig(
    output_dir=PROJECT_RUN_NAME,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    optim=OPTIMIZER,
    save_steps=SAVE_STEPS,
    save_total_limit=10,
    logging_steps=STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    report_to="wandb" if LOG_TO_WANDB else None,
    run_name=RUN_NAME,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    dataset_text_field="text",
    save_strategy="steps",
    hub_strategy="every_save",
    push_to_hub=True,
    hub_model_id=HUB_MODEL_NAME,
    hub_private_repo=True
)

In [None]:
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=train,
    peft_config=lora_parameters,
    args=train_parameters,
    data_collator=collator
  )

In [None]:
fine_tuning.train()

In [None]:
fine_tuning.model.push_to_hub(PROJECT_RUN_NAME, private=True)
print(f"Saved to the hub: {PROJECT_RUN_NAME}")