In [10]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

import pandas as pd
from datasets import Dataset
import torch
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
import os, wandb
from trl import SFTTrainer, setup_chat_format

# Load the augmented CSV
df = pd.read_csv('augmented.csv')


In [11]:
# Preprocess the input and output (combine features into text format)
def preprocess_data(row):
    input_string = f"Age: {row['Age']}, CreditScore: {row['CreditScore']}, Income: {row['Income']}, " \
                   f"YearsEmployed: {row['YearsEmployed']}, Gender: {'Male' if row['Gender'] == 1 else 'Female'}, " \
                   f"Married: {'Yes' if row['Married'] == 1 else 'No'}, " \
                   f"Industry: {row['Industry']}, Ethnicity: {row['Ethnicity']}, " \
                   f"PriorDefault: {'Yes' if row['PriorDefault'] == 1 else 'No'}, " \
                   f"Employed: {'Yes' if row['Employed'] == 1 else 'No'}"
    
    # Simplified output format
    output_string = f"{'Yes' if row['Approved'] == 1 else 'No'}, {row['Reason']}"
    
    return {"text": input_string, "label": output_string}

# Apply preprocessing to the dataframe
df_processed = df.apply(preprocess_data, axis=1)
df_final = pd.DataFrame(df_processed.tolist())  # Convert to DataFrame of text and labels

# No need to use np.ravel for the labels as we're already returning a single string
df_final


Unnamed: 0,text,label
0,"Age: 30.83, CreditScore: 1, Income: 0, YearsEm...","Yes, This application was approved due to Inco..."
1,"Age: 58.67, CreditScore: 6, Income: 560, Years...","Yes, This application was approved due to Inco..."
2,"Age: 24.5, CreditScore: 0, Income: 824, YearsE...","Yes, This application was approved due to Inco..."
3,"Age: 27.83, CreditScore: 5, Income: 3, YearsEm...","Yes, This application was approved due to Year..."
4,"Age: 20.17, CreditScore: 0, Income: 0, YearsEm...","Yes, This application was approved due to Inco..."
...,...,...
685,"Age: 21.08, CreditScore: 0, Income: 0, YearsEm...","No, This application was denied due to Employe..."
686,"Age: 22.67, CreditScore: 2, Income: 394, Years...","No, This application was denied due to Income,..."
687,"Age: 25.25, CreditScore: 1, Income: 1, YearsEm...","No, This application was denied due to Income,..."
688,"Age: 17.92, CreditScore: 0, Income: 750, Years...","No, This application was denied due to YearsEm..."


In [12]:
# Convert DataFrame to Huggingface Dataset format
dataset = Dataset.from_pandas(df_final)

In [13]:
# !huggingface-cli login      
# !wandb login --relogin API-KEY

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer
#8B,70B,405B
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["label"])

# Split dataset into train and test sets
dataset = tokenized_dataset.train_test_split(test_size=0.3)



In [5]:
# Accessing the 'text' field in the train split
print(dataset['train']['text'][3]) 
print(dataset['train']['label'][3])  # Modify to 'train' split, since it's after train_test_split



Age: 35.75, CreditScore: 4, Income: 1583, YearsEmployed: 0.75, Gender: Female, Married: Yes, Industry: ConsumerStaples, Ethnicity: White, PriorDefault: Yes, Employed: Yes
Yes, This application was approved due to Income, ZipCode, Employed, CreditScore, Citizen, Ethnicity.


In [6]:
new_model = "llama-3-8b-CC"
torch_dtype = torch.float16
attn_implementation = "eager"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)


model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", 
                                         quantization_config=bnb_config,
                                         device_map="auto",
                                         attn_implementation=attn_implementation)

model.resize_token_embeddings(len(tokenizer))


Loading checkpoint shards: 100%|██████████| 4/4 [00:19<00:00,  4.82s/it]


Embedding(128256, 4096)

In [7]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)


run = wandb.init(
    project='Fine-tune Llama 3 8B on CC Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdnicho26[0m ([33mdnicho26-university-of-north-carolina-at-charlotte[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [9]:
trainer.train()

  0%|          | 0/241 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`label` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [12]:
wandb.finish()
model.config.use_cache = True

In [None]:
# Test prompt to evaluate model
test_message = {
    "role": "user",
    "content": "Age: 35, CreditScore: 650, Income: 5000, YearsEmployed: 5, Gender: Male, Married: Yes, " \
               "Industry: Tech, Ethnicity: Asian, PriorDefault: No, Employed: Yes"
}


# Create the prompt from the message
prompt = tokenizer.apply_chat_template([test_message], tokenize=False, add_generation_prompt=True)

# Tokenize and generate response
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
outputs = model.generate(**inputs, max_length=150, num_return_sequences=1)

# Decode and print the generated text
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text.split("assistant")[1])  # Extract the assistant's response