In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)

import pandas as pd
from datasets import Dataset
import torch
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
import os, wandb
from trl import SFTTrainer, setup_chat_format

# Read the augmented dataset from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='augmented.csv')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Preprocess the input and output (combine features into text format)
def preprocess_data(row):
    """Render one record as a pair of strings: feature prompt and target.

    Args:
        row: Mapping-like record (for example a pandas Series) that provides
            the keys 'Age', 'CreditScore', 'Income', 'YearsEmployed',
            'Gender', 'Married', 'Industry', 'Ethnicity', 'PriorDefault',
            'Employed', 'Approved' and 'Reason'.

    Returns:
        dict: ``{"text": ..., "label": ...}`` where "text" lists the
        applicant features and "label" holds the approval decision followed
        by the reason.
    """
    def flag(column, when_set='Yes', otherwise='No'):
        # A flag column counts as set only when its value compares equal to 1.
        if row[column] == 1:
            return when_set
        return otherwise

    feature_parts = [
        f"Age: {row['Age']}",
        f"CreditScore: {row['CreditScore']}",
        f"Income: {row['Income']}",
        f"YearsEmployed: {row['YearsEmployed']}",
        f"Gender: {flag('Gender', 'Male', 'Female')}",
        f"Married: {flag('Married')}",
        f"Industry: {row['Industry']}",
        f"Ethnicity: {row['Ethnicity']}",
        f"PriorDefault: {flag('PriorDefault')}",
        f"Employed: {flag('Employed')}",
    ]
    decision = f"Approval: {flag('Approved')}, Reason: {row['Reason']}"
    return {"text": ", ".join(feature_parts), "label": decision}

# Run preprocess_data over every row, then collect the resulting dicts
# into a frame with one "text" and one "label" column
df_processed = df.apply(preprocess_data, axis=1)
df_final = pd.DataFrame(df_processed.to_list())
df_final

Unnamed: 0,text,label
0,"Age: 30.83, CreditScore: 1, Income: 0, YearsEm...","Approval: Yes, Reason: This application was ap..."
1,"Age: 58.67, CreditScore: 6, Income: 560, Years...","Approval: Yes, Reason: This application was ap..."
2,"Age: 24.5, CreditScore: 0, Income: 824, YearsE...","Approval: Yes, Reason: This application was ap..."
3,"Age: 27.83, CreditScore: 5, Income: 3, YearsEm...","Approval: Yes, Reason: This application was ap..."
4,"Age: 20.17, CreditScore: 0, Income: 0, YearsEm...","Approval: Yes, Reason: This application was ap..."
...,...,...
685,"Age: 21.08, CreditScore: 0, Income: 0, YearsEm...","Approval: No, Reason: This application was den..."
686,"Age: 22.67, CreditScore: 2, Income: 394, Years...","Approval: No, Reason: This application was den..."
687,"Age: 25.25, CreditScore: 1, Income: 1, YearsEm...","Approval: No, Reason: This application was den..."
688,"Age: 17.92, CreditScore: 0, Income: 750, Years...","Approval: No, Reason: This application was den..."


In [5]:
# Wrap the text/label frame in a Hugging Face Dataset
dataset = Dataset.from_pandas(df=df_final)


In [6]:
# !huggingface-cli login      
# !wandb login --relogin API-KEY

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer
#8B,70B,405B
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

# Padding needs a pad token; fall back to EOS when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Explicit sequence cap, matching max_seq_length=512 in the trainer cell.
# Without it, padding="max_length" pads every example up to
# tokenizer.model_max_length, which is far larger than any row in this dataset.
MAX_SEQ_LENGTH = 512
# Fixed seed so the train/test split is identical on every run
SPLIT_SEED = 42

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of rows as user/assistant conversations.

    Each row's "text" (applicant features) becomes the user turn and its
    "label" (approval decision and reason) becomes the assistant turn, so
    the target the model is meant to generate is part of the training
    sequence. The conversation is rendered with the tokenizer's chat
    template, the same format the inference cell uses for its prompt.

    Args:
        examples: Batched mapping with list-valued "text" and "label" columns.

    Returns:
        The tokenizer output (input_ids, attention_mask) padded/truncated
        to MAX_SEQ_LENGTH tokens per row.
    """
    rendered = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": target},
            ],
            tokenize=False,
        )
        for prompt, target in zip(examples["text"], examples["label"])
    ]
    # The rendered template is expected to carry its own special tokens
    # (including BOS), so the tokenizer must not add them a second time.
    return tokenizer(
        rendered,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        add_special_tokens=False,
    )

# The raw string "label" column is dropped after tokenization: its content now
# lives inside input_ids, and a string column with that name is expected to be
# forwarded to the data collator, which cannot turn it into a tensor.
# NOTE(review): confirm against the installed transformers/trl versions.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["label"])

# Split dataset into train and test sets
dataset = tokenized_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

Map:   0%|          | 0/690 [00:00<?, ? examples/s]

In [None]:
# `dataset` is a DatasetDict keyed by split name after train_test_split,
# so pick a split first, then the row, then its raw text
dataset['train'][3]['text']

In [None]:
new_model = "llama-3-8b-CC"      # used as output_dir by the training arguments
torch_dtype = torch.float16      # compute dtype for the 4-bit layers
attn_implementation = "eager"

# 4-bit NF4 quantization with double quantization (QLoRA-style loading)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
)

# Load the base checkpoint quantized, letting device_map="auto" place the weights
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    attn_implementation=attn_implementation,
    device_map="auto",
    quantization_config=bnb_config,
)



In [None]:
# LoRA adapter settings: adapters on the attention and MLP projection layers
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
)
# Wrap the quantized base model with the LoRA adapters
model = get_peft_model(model, peft_config)


# Start the Weights & Biases run that the trainer reports to
run = wandb.init(
    job_type="training",
    anonymous="allow",
    project='Fine-tune Llama 3 8B on CC Dataset',
)

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern
training_arguments = TrainingArguments(
    output_dir=new_model,
    report_to="wandb",
    # optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=10,
    optim="paged_adamw_32bit",
    # batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # mixed precision is switched off in the trainer
    fp16=False,
    bf16=False,
    # evaluation and logging cadence
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_strategy="steps",
    logging_steps=1,
)

# Supervised fine-tuning trainer over the train/test splits
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

In [None]:
# Run the fine-tuning loop; the value returned by train() is shown as the cell output
trainer.train()

In [12]:
# Turn the model's use_cache flag on before the generation cell below
model.config.use_cache = True
# Close the Weights & Biases run
wandb.finish()

In [None]:

# Test prompt to evaluate model
test_message = {
    "role": "user",
    "content": "Age: 35, CreditScore: 650, Income: 5000, YearsEmployed: 5, Gender: Male, Married: Yes, " \
               "Industry: Tech, Ethnicity: Asian, PriorDefault: No, Employed: Yes"
}


# Create the prompt from the message (ends with the assistant header so the model answers next)
prompt = tokenizer.apply_chat_template([test_message], tokenize=False, add_generation_prompt=True)

# Tokenize the rendered prompt.
# - add_special_tokens=False: the chat template is expected to emit its own
#   special tokens (including BOS), so the tokenizer must not add a second BOS.
# - .to(model.device): the model was placed with device_map="auto", so follow
#   its device instead of hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).to(model.device)
prompt_length = inputs["input_ids"].shape[1]

# Stop at EOS or at the end-of-turn marker that closes a Llama 3 chat turn
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# max_new_tokens limits only the generated continuation; max_length would also
# count the prompt tokens and could leave little or no room for the answer
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    num_return_sequences=1,
    eos_token_id=stop_token_ids,
    pad_token_id=tokenizer.pad_token_id,
)

# Decode only the tokens produced after the prompt; slicing by length is
# robust where splitting the decoded text on the word "assistant" is not
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(text.strip())  # The assistant's response