# LLM Fine-Tuning for Recommendation System

## TABLE OF CONTENTS
### $~~~$ - 1. Load Data
### $~~~$ - 2. Data Transformation
### $~~~$ - 3. Model Preparation
### $~~~$ - 4. Fine-Tuning
### $~~~$ - 5. Save Fine-tuned Model
### $~~~$ - 6. Model Test

---
## 1. Load Data

In [None]:
from tqdm import tqdm
import pandas as pd
import os

In [None]:
# Base directory (one level above the working directory); data and model paths are joined onto it.
base_dir = "../"

In [None]:
# Path of the Amazon products training CSV, resolved under base_dir.
products_path = os.path.join(base_dir, 'trainData/amazon_products.train.csv')

In [None]:
# Read the products CSV into a DataFrame.
products_df = pd.read_csv(products_path)

In [None]:
# Print a summary of the products table (columns, dtypes, non-null counts).
print("[*] Products Dataset:")
products_df.info()

In [None]:
products_df.head()

In [None]:
# Report how many missing entries each column contains.
print("\n[*] Missing Values in Products:")
missing_per_column = products_df.isnull().sum()
print(missing_per_column)

### Drop rows with missing values, e.g. NaN PRICE (optional)

In [None]:
# Discard every row containing a NaN in any column, then renumber the index from 0.
products_df = products_df.dropna().reset_index(drop=True)
products_df.info()

---
## 2. Data Transformation

In [None]:
from datasets import Dataset
import numpy as np

In [None]:
products_df.head()

In [None]:
# Total products: 67712
# Row ranges in chunks of 10,000 (the trailing "-" slots are empty;
# presumably placeholders for per-chunk notes — TODO confirm):
#     0:9999  -
# 10000:19999 -
# 20000:29999 -
# 30000:39999 -
# 40000:49999 -
# 50000:59999 -
# 60000:END   -

# Uncomment to work on only the first 10,000 rows:
# _products_df = products_df.loc[0:9999, :] # Test
# Select every row and every column of products_df.
_products_df = products_df.loc[:, :]

In [None]:
_products_df.columns

In [None]:
def construct_training_text(row):
    """Render one product row as the multi-line text used as the training target.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding string values
            under 'TITLE', 'DESCRIPTION' and 'DETAILS', plus 'PRODUCT_ID',
            'MAIN_CATEGORY', 'AVERAGE_RATING' and 'PRICE'.

    Returns:
        A single string with one "Label: value" line per field.
    """
    # These pieces are computed outside the f-string because backslashes inside
    # f-string expressions are a SyntaxError before Python 3.12.
    title = row['TITLE'].replace('\n', ' ')
    description = row['DESCRIPTION'].replace('\n', ' ')

    # DETAILS is handled as dict-like text: drop the outer braces and the
    # single quotes, then turn the ", " separators into " | ".
    detail_items = row['DETAILS'].strip('{}').replace("'", '').split(', ')
    details = ' | '.join(detail_items)

    return (
        f"Product ID: {row['PRODUCT_ID']}\n"
        f"Title: {title}\n"
        f"Description: {description}\n"
        f"Category: {row['MAIN_CATEGORY']}\n"
        f"Average rating: {row['AVERAGE_RATING']}\n"
        f"Price: {row['PRICE']}\n"
        f"Details: {details}"
    )

In [None]:
# Build the training text for every product row and pair it with its product ID.
product_ids = _products_df['PRODUCT_ID'].tolist()
products_text = _products_df.apply(construct_training_text, axis=1).tolist()
formatted_df = pd.DataFrame({'PRODUCT_ID': product_ids, 'RESPONSE': products_text})

# Show the first two rendered texts as a sanity check.
print(f"[*] Text format preview:\n{products_text[0]}\n\n{products_text[1]}")

---
## 3. Model Preparation

In [None]:
# Check Python version
!python -V
# Check CUDA version
!nvcc --version

In [None]:
from getpass import getpass
import torch

In [None]:
# Check for GPU Availability: prefer CUDA, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
# The availability check must be *called*: a bare function reference is always
# truthy and would make the CPU fallback unreachable.
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
#device = 'cpu' # Set to cpu when debugging
print(f"Using device: {device}")

In [None]:
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably to avoid tokenizer fork warnings with DataLoader workers — confirm intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Prompt for the Hugging Face access token without echoing it to the screen.
access_token = getpass()
# NOTE(review): the token is also passed explicitly via `token=` to the
# from_pretrained calls; confirm whether anything actually reads this
# environment variable name.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = access_token

### Llama 3.2-1B

In [None]:
# Checkpoint to fine-tune; the commented line is the instruction-tuned alternative.
model_id = "meta-llama/Llama-3.2-1B"
# model_id = "meta-llama/Llama-3.2-1B-Instruct"

### Qwen 2.5-1.5B

In [None]:
# model_id = "Qwen/Qwen2.5-1.5B"
# model_id = "Qwen/Qwen2.5-1.5B-Instruct"

### Load from Local

In [None]:
# model_id = os.path.join(base_dir, f"models/{model_id.split('/')[-1]}/Final")

### Tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# Reuse the EOS token for padding and pad on the right-hand side.
# NOTE(review): with pad == eos, a collator that masks pad positions in the
# labels may also mask the genuine EOS appended to each example — verify.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("[*] Tokenizer loaded.")

### Format Data Prompt

In [None]:
# Prompt template: the first slot receives the product ID, the second the expected response.
data_prompt = """Gives product information based on the given product ID.

### Input:
{}

### Response:
{}"""

# Appended to every training text so each example ends with an end-of-sequence marker.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompt(examples):
    """Fill the prompt template for a batch and terminate each text with EOS.

    Args:
        examples: Batch mapping with parallel 'PRODUCT_ID' and 'RESPONSE'
            sequences.

    Returns:
        Dict with a single 'text' key holding the formatted prompts.
    """
    id_response_pairs = zip(examples['PRODUCT_ID'], examples["RESPONSE"])
    return {
        "text": [
            data_prompt.format(product_id, response) + EOS_TOKEN
            for product_id, response in id_response_pairs
        ],
    }

In [None]:
# Wrap the DataFrame as a Dataset, then replace its columns with the single formatted 'text' column.
training_data = Dataset.from_pandas(formatted_df)
training_data = training_data.map(formatting_prompt, batched=True, remove_columns=['PRODUCT_ID', 'RESPONSE'])

In [None]:
training_data['text'][0]

### Check for token length

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Token count of every training text, measured without truncation.
token_lengths = [len(tokenizer.encode(entry, truncation=False)) for entry in tqdm(training_data['text'])]

# Share of examples that would fit into the 512- and 1024-token budgets.
total_texts = len(token_lengths)
within_512 = sum(1 for length in token_lengths if length <= 512)
within_1024 = sum(1 for length in token_lengths if length <= 1024)

print(f'[*] Max length: {np.max(token_lengths)}')
print(f'[*] length <= 512: {round((within_512/total_texts)*100, 2)}%')
print(f'[*] length <= 1024: {round((within_1024/total_texts)*100, 2)}%')

In [None]:
# Histogram (20 bins) of the per-example token counts.
plt.hist(token_lengths, bins=20)
plt.title("Token Length Distribution")
plt.xlabel("Number of Tokens")
plt.ylabel("Frequency")
plt.show()

### Tokenization

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of texts into fixed-length sequences.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, truncated and padded to 1024 tokens.
    """
    max_tokens = 1024  # 512, 1024, 2048
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_tokens,
    )
    return encoded

In [None]:
# Tokenize all examples in batches and drop the raw 'text' column.
training_data_tokenized = training_data.map(tokenize_function, batched=True, remove_columns=["text"])
print("[*] Data tokenized.")

In [None]:
print(f"\n[*] Tokenized products preview:\n\t{training_data_tokenized}")

---
## 4. Fine-Tuning

In [None]:
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling

In [None]:
# Load Model
# Loads the causal-LM weights for model_id and moves them to the selected device.
model = AutoModelForCausalLM.from_pretrained(model_id, token=access_token).to(device)
print("[*] Model loaded.")

In [None]:
from transformers import TrainingArguments

In [None]:
def check_dir_path(path):
    """Make sure `path` exists, creating it and any missing parents.

    Nothing happens (and nothing is printed) when the path already exists.

    Args:
        path: Directory path to create if it is not already present.
    """
    if os.path.exists(path):
        return

    print(f'[*] Creating {path}...')
    # makedirs rather than mkdir: targets such as models/<name>/Trainer are
    # nested, and mkdir raises FileNotFoundError when the parent is missing.
    os.makedirs(path, exist_ok=True)
    print('[*] Done.')

In [None]:
# Per-model output folders: one for Trainer checkpoints, one for logs.
model_name = model_id.split('/')[-1]

trainer_dir = os.path.join(base_dir, f"models/{model_name}/Trainer")
log_dir = os.path.join(base_dir, f"models/{model_name}/Log")

check_dir_path(trainer_dir)
check_dir_path(log_dir)

In [None]:
# Training hyperparameters
training_args = TrainingArguments(
    output_dir=trainer_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    logging_dir=log_dir,
    save_steps=30000,
    save_total_limit=1,
    fp16=True,
    optim="adamw_torch",
    dataloader_num_workers=2,
)

In [None]:
# Batch collator configured for causal (not masked) language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal Language Modeling
)

In [None]:
import evaluate

In [None]:
# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: Pair `(logits, labels)`; logits carry a trailing vocabulary
            axis and labels hold token ids, with -100 marking ignored positions.

    Returns:
        The result of the accuracy metric over all non-ignored positions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # The logits at position t predict the token at t + 1, so drop the last
    # prediction and the first label to line the two up.
    shifted_predictions = predictions[..., :-1].reshape(-1)
    shifted_labels = labels[..., 1:].reshape(-1)

    # Positions labelled -100 (e.g. padding) are excluded from the loss and
    # must not count towards accuracy either. The metric also needs flat 1-D
    # inputs, which the boolean mask yields.
    keep = shifted_labels != -100

    return metric.compute(
        predictions=shifted_predictions[keep],
        references=shifted_labels[keep],
    )

### Train

In [None]:
from transformers import Trainer

In [None]:
# Trainer for products
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data_tokenized,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

---
## 5. Save Fine-tuned Model

In [None]:
# Save Fine-Tuned Model
output_dir = os.path.join(base_dir, f"models/{model_id.split('/')[-1]}/Final")

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

In [None]:
# Write the trained model and the tokenizer files into output_dir.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"[*] Model saved to {output_dir}")

In [None]:
# Save to hugging face
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
# model.push_to_hub(f"CountZero404/{model_id.split('/')[-1]}-RecSys", variant="fp16")
# tokenizer.push_to_hub(f"CountZero404/{model_id.split('/')[-1]}-RecSys", variant="fp16")

## 6. Model Test

In [None]:
def generate_question(query_type, query_value):
    """Build the natural-language prompt asking about a single product.

    Args:
        query_type: Either "PRODUCT_ID" or "TITLE".
        query_value: The product ID or title to embed in the prompt.

    Returns:
        The prompt string, terminated by a newline.

    Raises:
        ValueError: If query_type is neither "PRODUCT_ID" nor "TITLE".
    """
    if query_type == "PRODUCT_ID":
        return f"Please provide information for this product with product ID {query_value}:\n"

    if query_type == "TITLE":
        return f"Please provide information for this product with title '{query_value}':\n"

    raise ValueError("Invalid query type. Use 'PRODUCT_ID' or 'TITLE'.")

In [None]:
def generate_response(model, tokenizer, test_inputs: list):
    """Generate and print the model's output for each prompt.

    Args:
        model: Causal LM exposing `.device`, `.eval()` and `.generate(...)`.
        tokenizer: Tokenizer matching `model`; encodes prompts and decodes
            the generated ids.
        test_inputs: Prompt strings; one generation is run per prompt.

    Returns:
        None. The decoded output of every generation is printed.
    """
    # Inference mode does not change between prompts, so switch once up front.
    model.eval()
    # Send inputs to wherever the model's weights actually live rather than
    # relying on a notebook-level `device` variable that may disagree.
    target_device = model.device

    for input_text in test_inputs:
        model_input = tokenizer(input_text, return_tensors="pt").to(target_device)
        # print(f'Input ids: {model_input["input_ids"]}')
        # print(f'Attention Mask: {model_input["attention_mask"]}\n')

        with torch.no_grad():
            # generate() returns a batch; [0] takes the only sequence in it.
            output_ids = model.generate(
                input_ids=model_input["input_ids"],
                attention_mask=model_input['attention_mask'],
                pad_token_id=tokenizer.pad_token_id,
                max_new_tokens=300
            )[0]

        response = tokenizer.decode(
            output_ids,
            skip_special_tokens=True
        )

        print(response)

### Random product

In [None]:
from time import time 
import random

In [None]:
# Seed the random module from the current wall-clock time.
random.seed(time())

In [None]:
def retrieve_product_information(df, query_type, query_value):
    """Print the training-format text of the first product matching a query.

    Args:
        df: Products DataFrame containing the 'PRODUCT_ID' and 'TITLE' columns.
        query_type: Column to search, either "PRODUCT_ID" or "TITLE".
        query_value: Value that the chosen column must equal.

    Raises:
        ValueError: If query_type is neither "PRODUCT_ID" nor "TITLE".
        IndexError: If no row matches query_value.
    """
    if query_type not in ("PRODUCT_ID", "TITLE"):
        raise ValueError("Invalid query type. Use 'PRODUCT_ID' or 'TITLE'.")

    # query_type doubles as the name of the column to search.
    matching_labels = df.index[df[query_type] == query_value]
    if len(matching_labels) == 0:
        # Same exception type that indexing an empty match list would give,
        # but with a message that says what was not found.
        raise IndexError(f"No product found with {query_type} == {query_value!r}")

    print(construct_training_text(df.loc[matching_labels[0], :]))

In [None]:
# Draw from a plain list: random.choice indexes its argument with an integer,
# which on a Series is a label lookup and only works while the index is
# exactly 0..n-1.
random_product_id = random.choice(_products_df['PRODUCT_ID'].tolist())
# Print the ground-truth text for the chosen product for later comparison.
retrieve_product_information(_products_df, 'PRODUCT_ID', random_product_id)

In [None]:
# Build the test prompt for the randomly chosen product ID.
# NOTE(review): this wording differs from the data_prompt template used to
# format the training texts — confirm the mismatch is intended.
formatted_question = generate_question("PRODUCT_ID", random_product_id)
print(f'[*] Formatted Question: {formatted_question}')

In [None]:
# Run generation for the single test prompt and print the model's output.
test_inputs = [formatted_question,]
generate_response(model, tokenizer, test_inputs)