In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [2]:
# huggingface-cli login --token key   
# wandb login --relogin key

In [3]:
run = wandb.init(
    project='Fine-tune Llama 3 on CC Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdnicho26[0m ([33mdnicho26-university-of-north-carolina-at-charlotte[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import pandas as pd
df = pd.read_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/target-augmented.csv')
# Post-Training Augmentation (for interpretability)
# Convert Age (Years) to string for user-friendly output
df['Age (Days)'] = df['Age (Days)'].astype(str) + " years old"

# Ensure Employment Duration (Days) is numeric
df['Employment Duration (Days)'] = pd.to_numeric(df['Employment Duration (Days)'], errors='coerce')
# Fill NaN values or handle invalid data
df['Employment Duration (Days)'].fillna(0, inplace=True)  # Assume NaN means unemployed
# Convert Employment Duration to human-readable strings
df['Employment Duration (Days)'] = df['Employment Duration (Days)'].apply(
    lambda x: f"{x} days" if x > 0 else "Unemployed"
)

# Convert Months Balance to human-readable format
# df['begin_month'] = df['begin_month'].astype(str) + " months ago"

df.head()

Unnamed: 0,Client Number,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,5008804,1,1,0,427500.0,4,1,0,4,32 years old,4542.0 days,1.0,0.0,0.0,18,2.0,1,This application was approved due to Client Nu...
1,5008805,1,1,0,427500.0,4,1,0,4,32 years old,4542.0 days,1.0,0.0,0.0,18,2.0,1,This application was approved due to Client Nu...
2,5008806,1,1,0,112500.0,4,5,1,1,58 years old,1134.0 days,0.0,0.0,0.0,16,2.0,1,This application was approved due to Client Nu...
3,5008808,0,1,0,270000.0,0,5,3,1,52 years old,3051.0 days,0.0,1.0,1.0,14,1.0,1,This application was approved due to Client Nu...
4,5008809,0,1,0,270000.0,0,5,3,1,52 years old,3051.0 days,0.0,1.0,1.0,14,1.0,1,This application was approved due to Client Nu...


In [5]:
import pandas as pd

# Function to preprocess the data with special tokens for fine-tuning LLaMA3
def preprocess_data_generalized(row):
    try:
        # Generate text for each input feature with special tokens, handling missing values
        gender_text = f"<gender> {'Male' if row.get('Gender', 0) == 1 else 'Female'} </gender>"
        age_text = f"<age> {row.get('Age (Days)', 'Unknown')} days </age>"
        debt_text = f"<debt> {row.get('Debt', '0')} units </debt>"
        married_text = f"<married> {'Yes' if row.get('Marital Status', 0) == 1 else 'No'} </married>"
        bank_customer_text = f"<bank_customer> {'Yes' if row.get('BankCustomer', 0) == 1 else 'No'} </bank_customer>"
        industry_text = f"<industry> {row.get('Occupation', 'Unknown')} </industry>"
        ethnicity_text = f"<ethnicity> {row.get('Ethnicity', 'Unknown')} </ethnicity>"
        years_employed_text = f"<years_employed> {row.get('Employment Duration (Days)', '0')} days </years_employed>"
        prior_default_text = f"<prior_default> {'Yes' if row.get('PriorDefault', 0) == 1 else 'No'} </prior_default>"
        employed_text = f"<employed> {'Yes' if row.get('Employed', 0) == 1 else 'No'} </employed>"
        credit_score_text = f"<credit_score> {row.get('CreditScore', 'Unknown')} </credit_score>"
        drivers_license_text = f"<drivers_license> {'Yes' if row.get('Work Phone', 0) == 1 else 'No'} </drivers_license>"
        citizen_text = f"<citizen> {row.get('Citizen', 'Unknown')} </citizen>"
        zip_code_text = f"<zip_code> {row.get('ZipCode', 'Unknown')} </zip_code>"
        income_text = f"<income> {row.get('Annual Income', '0')} units </income>"

        # Combine all input text with special tokens
        input_text = " ".join([
            gender_text, age_text, debt_text, married_text, bank_customer_text,
            industry_text, ethnicity_text, years_employed_text, prior_default_text,
            employed_text, credit_score_text, drivers_license_text, citizen_text,
            zip_code_text, income_text
        ])

        # Output format for LLaMA fine-tuning (using special tokens for labels)
        output_text = f"<approved> {'Yes' if row.get('Approved', 0) == 1 else 'No'} </approved> <reason> {row.get('Reason', 'Unknown')} </reason>"

        return {"text": input_text, "label": output_text}

    except Exception as e:
        print(f"Error processing row: {row.to_dict()} - {e}")
        return None

# Assume df is your original DataFrame
# Apply the generalized preprocessing to the dataframe
df_processed = df.apply(preprocess_data_generalized, axis=1)

# Drop rows where preprocessing failed (None values)
df_processed = df_processed.dropna()

# Convert the processed Series to a DataFrame
df_final = pd.DataFrame(df_processed.tolist())

# Display the first few rows of the processed data
df_final.head()


Unnamed: 0,text,label
0,<gender> Female </gender> <age> 32 years old d...,<approved> Yes </approved> <reason> This appli...
1,<gender> Female </gender> <age> 32 years old d...,<approved> Yes </approved> <reason> This appli...
2,<gender> Female </gender> <age> 58 years old d...,<approved> Yes </approved> <reason> This appli...
3,<gender> Female </gender> <age> 52 years old d...,<approved> Yes </approved> <reason> This appli...
4,<gender> Female </gender> <age> 52 years old d...,<approved> Yes </approved> <reason> This appli...


In [6]:
# Convert the first row to a dictionary for better readability
first_row_dict = df_final.iloc[0].to_dict()
print(first_row_dict)

{'text': '<gender> Female </gender> <age> 32 years old days </age> <debt> 0 units </debt> <married> No </married> <bank_customer> No </bank_customer> <industry> 18 </industry> <ethnicity> Unknown </ethnicity> <years_employed> 4542.0 days days </years_employed> <prior_default> No </prior_default> <employed> No </employed> <credit_score> Unknown </credit_score> <drivers_license> Yes </drivers_license> <citizen> Unknown </citizen> <zip_code> Unknown </zip_code> <income> 427500.0 units </income>', 'label': '<approved> Yes </approved> <reason> This application was approved due to Client Number, Age Days, Property Ownership, Employment Duration Days, Car Ownership, Occupation, Housing Type, Marital Status, Education Level, Family Size, Annual Income, Number of Children, Income Category. </reason>'}


In [7]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "/opt/notebooks/Chatbot-Credit-Card/models/llama-3.2-3b-CC"

In [8]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [9]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_final)
dataset['text'][3]

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
print(modules)

In [None]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [None]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
from datasets import Dataset

# Assuming dataset is a Dataset object with columns 'text' and 'label'
train_test_split = dataset.train_test_split(test_size=0.2)  # Split into 80% train, 20% test
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [None]:
# Custom Callback to save model every 5 epochs
class SaveEveryNEpochsCallback(TrainerCallback):
    def __init__(self, save_every_n_epochs, output_dir):
        self.save_every_n_epochs = save_every_n_epochs
        self.output_dir = output_dir

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.epoch % self.save_every_n_epochs == 0:
            model_save_path = os.path.join(self.output_dir, f"checkpoint-{int(state.epoch)}-epochs")
            kwargs["model"].save_pretrained(model_save_path)
            kwargs["tokenizer"].save_pretrained(model_save_path)
            print(f"Model saved at {model_save_path}")

In [None]:
# Adjust the training arguments for the single epoch run
single_epoch_training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,  # Only run for one epoch first
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

# Train for one epoch and save as new_model + "-1-epoch"
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=single_epoch_training_arguments,
    packing=False,
)
trainer.train()
model.save_pretrained(new_model + "-1-epoch")
tokenizer.save_pretrained(new_model + "-1-epoch")
print(f"Model saved after 1 epoch at {new_model + '-1-epoch'}")

In [None]:
from transformers import EarlyStoppingCallback
early_stopping_patience = 5  # Number of eval steps with no improvement before stopping

# Now, reconfigure for full training with 100 epochs and save every 5 epochs
full_training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=100,  # Set to full training epochs
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
# Run the full training with periodic saving
trainer.args = full_training_arguments  # Update the training arguments
trainer.add_callback(SaveEveryNEpochsCallback(save_every_n_epochs=5, output_dir=new_model))
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=early_stopping_patience))

trainer.train()
wandb.finish()

In [None]:
# Instruction tailored to credit card approval context
instruction = """You are a highly knowledgeable financial advisor specializing in credit card approvals. 
    Be informative, polite, and provide clear responses to any queries regarding credit approval decisions.
    """

# Example message (user asking about credit card approval)
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "Can I know why my credit card application was rejected? My age is 30, income is $40,000, and credit score is 580."}
]

# Generate the prompt using the chat template (assuming tokenizer.apply_chat_template is a custom method for your setup)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate model outputs (adjusting parameters if necessary)
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode the model's response
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the assistant's response (assuming the response begins after the 'assistant' token)
print(text.split("assistant")[1])

# Save the fine-tuned model and tokenizer for future use
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


In [None]:
# Example custom prompt provided by the user
custom_prompt = "Age: 27.83, CreditScore: 5, Income: 3, YearsEmployed: 3.75, Gender: Male, Married: Yes, Industry: Industrials, Ethnicity: White, PriorDefault: Yes, Employed: Yes"

# You don't need a system message if you are simply testing this input directly
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": custom_prompt}
]

# Generate the prompt using the chat template (if using custom chat template generation)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the custom prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate output from the model
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode the model's response
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the assistant's response
print(response_text.split("assistant")[1])
