In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [2]:
# huggingface-cli login --token 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    
# wandb login --relogin c1858410fe092c30e9807b3d2570dd61ae73d01f

In [3]:
run = wandb.init(
    project='Fine-tune Llama 3 on CC Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdnicho26[0m ([33mdnicho26-university-of-north-carolina-at-charlotte[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.18.5


[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/notebooks/Chatbot-Credit-Card/wandb/run-20241024_020359-cms4ca5b[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.


[34m[1mwandb[0m: Syncing run [33mancient-river-24[0m


[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset[0m


[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset/runs/cms4ca5b[0m


In [4]:
#base_model = "meta-llama/Llama-3.2-3B-Instruct"
#new_model = "./models/llama-3.2-3b-CC-target"


new_model = "./models/Llama-3.1-8B-Instruct"
base_model = "meta-llama/Llama-3.1-8B-Instruct"

In [5]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
import pandas as pd
df = pd.read_csv('target-augmented.csv')
df

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,JOB,BEGIN_MONTHS,STATUS,TARGET,Reason
0,5065438,F,Y,N,2+ children,270000.0,Secondary / secondary special,Married,With parents,-13258,-2300,1,0,0,0,Managers,-6,C,0,This application was denied due to NAMEFAMILYS...
1,5142753,F,N,N,No children,81000.0,Secondary / secondary special,Single / not married,House / apartment,-17876,-377,1,1,1,0,Private service staff,-4,0,0,"This application was denied due to CODEGENDER,..."
2,5111146,M,Y,Y,No children,270000.0,Higher education,Married,House / apartment,-19579,-1028,1,0,1,0,Laborers,0,C,0,This application was denied due to NAMEFAMILYS...
3,5010310,F,Y,Y,1 children,112500.0,Secondary / secondary special,Married,House / apartment,-15109,-1956,1,0,0,0,Core staff,-3,0,0,This application was denied due to NAMEFAMILYS...
4,5010835,M,Y,Y,2+ children,139500.0,Secondary / secondary special,Married,House / apartment,-17281,-5578,1,1,0,0,Drivers,-29,0,0,This application was denied due to NAMEFAMILYS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537662,5142999,M,Y,N,1 children,166500.0,Secondary / secondary special,Married,With parents,-12372,-5401,1,0,1,0,Core staff,-8,0,0,This application was denied due to NAMEFAMILYS...
537663,5010773,F,N,Y,No children,135000.0,Higher education,Married,With parents,-14160,-4635,1,0,0,0,Sales staff,-8,0,0,This application was denied due to NAMEFAMILYS...
537664,5105601,M,N,Y,No children,180000.0,Higher education,Married,House / apartment,-24204,-2462,1,0,0,0,Private service staff,-7,0,0,This application was denied due to NAMEFAMILYS...
537665,5132833,M,Y,N,No children,220500.0,Secondary / secondary special,Married,House / apartment,-22647,-3847,1,0,1,0,Laborers,-1,0,0,This application was denied due to NAMEFAMILYS...


In [8]:
# import pandas as pd

# # Function to preprocess the data with special tokens for fine-tuning LLaMA3
# def preprocess_data_generalized(row):
#     # Generate text for each input feature with special tokens
#     gender_text = f"<gender> {'Male' if row['Gender'] == 1 else 'Female'} </gender>"
#     age_text = f"<age> {row['Age']} </age>"
#     debt_text = f"<debt> {row['Debt']} </debt>"
#     married_text = f"<married> {'Yes' if row['Married'] == 1 else 'No'} </married>"
#     bank_customer_text = f"<bank_customer> {'Yes' if row['BankCustomer'] == 1 else 'No'} </bank_customer>"
#     industry_text = f"<industry> {row['Industry']} </industry>"
#     ethnicity_text = f"<ethnicity> {row['Ethnicity']} </ethnicity>"
#     years_employed_text = f"<years_employed> {row['YearsEmployed']} </years_employed>"
#     prior_default_text = f"<prior_default> {'Yes' if row['PriorDefault'] == 1 else 'No'} </prior_default>"
#     employed_text = f"<employed> {'Yes' if row['Employed'] == 1 else 'No'} </employed>"
#     credit_score_text = f"<credit_score> {row['CreditScore']} </credit_score>"
#     drivers_license_text = f"<drivers_license> {'Yes' if row['DriversLicense'] == 1 else 'No'} </drivers_license>"
#     citizen_text = f"<citizen> {row['Citizen']} </citizen>"
#     zip_code_text = f"<zip_code> {row['ZipCode']} </zip_code>"
#     income_text = f"<income> {row['Income']} </income>"

#     # Combine all input text with special tokens
#     input_text = " ".join([
#         gender_text, age_text, debt_text, married_text, bank_customer_text, 
#         industry_text, ethnicity_text, years_employed_text, prior_default_text, 
#         employed_text, credit_score_text, drivers_license_text, citizen_text, 
#         zip_code_text, income_text
#     ])
    
#     # Output format for LLaMA fine-tuning (using special tokens for labels)
#     output_text = f"<approved> {'Yes' if row['Approved'] == 1 else 'No'} </approved> <reason> {row['Reason']} </reason>"
    
#     return {"text": input_text, "label": output_text}

# # Apply the generalized preprocessing to the dataframe
# df_processed = df.apply(preprocess_data_generalized, axis=1)
# df_final = pd.DataFrame(df_processed.tolist())

# # Display the first few rows of the processed data
# print(df_final.head())

import pandas as pd

# Function to preprocess the data with special tokens for fine-tuning LLaMA3
def preprocess_data_generalized(row):
    # Generate text for each input feature with special tokens
    gender_text = f"<gender> {'Male' if row['CODE_GENDER'] == 'M' else 'Female'} </gender>"
    car_text = f"<own_car> {'Yes' if row['FLAG_OWN_CAR'] == 'Y' else 'No'} </own_car>"
    realty_text = f"<own_realty> {'Yes' if row['FLAG_OWN_REALTY'] == 'Y' else 'No'} </own_realty>"
    children_text = f"<children> {row['CNT_CHILDREN']} </children>"
    income_text = f"<income> {row['AMT_INCOME_TOTAL']} </income>"
    education_text = f"<education> {row['NAME_EDUCATION_TYPE']} </education>"
    family_status_text = f"<family_status> {row['NAME_FAMILY_STATUS']} </family_status>"
    housing_type_text = f"<housing_type> {row['NAME_HOUSING_TYPE']} </housing_type>"
    age_text = f"<age_days> {row['DAYS_BIRTH']} </age_days>"
    employed_days_text = f"<days_employed> {row['DAYS_EMPLOYED']} </days_employed>"
    mobile_text = f"<mobile> {'Yes' if row['FLAG_MOBIL'] == 1 else 'No'} </mobile>"
    work_phone_text = f"<work_phone> {'Yes' if row['FLAG_WORK_PHONE'] == 1 else 'No'} </work_phone>"
    phone_text = f"<phone> {'Yes' if row['FLAG_PHONE'] == 1 else 'No'} </phone>"
    email_text = f"<email> {'Yes' if row['FLAG_EMAIL'] == 1 else 'No'} </email>"
    job_text = f"<job> {row['JOB']} </job>"
    begin_months_text = f"<begin_months> {row['BEGIN_MONTHS']} </begin_months>"

    # Combine all input text with special tokens
    input_text = " ".join([
        gender_text, car_text, realty_text, children_text, income_text, 
        education_text, family_status_text, housing_type_text, age_text, 
        employed_days_text, mobile_text, work_phone_text, phone_text, 
        email_text, job_text, begin_months_text
    ])
    
    # Output format for LLaMA fine-tuning (using special tokens for labels)
    output_text = f"<approved> {'Yes' if row['TARGET'] == 1 else 'No'} </approved> <reason> {row['Reason']} </reason>"
    
    return {"text": input_text, "label": output_text}

# Apply the generalized preprocessing to the dataframe
df_processed = df.apply(preprocess_data_generalized, axis=1)
df_final = pd.DataFrame(df_processed.tolist())

# Display the first few rows of the processed data
print(df_final.head())

                                                text  \
0  <gender> Female </gender> <own_car> Yes </own_...   
1  <gender> Female </gender> <own_car> No </own_c...   
2  <gender> Male </gender> <own_car> Yes </own_ca...   
3  <gender> Female </gender> <own_car> Yes </own_...   
4  <gender> Male </gender> <own_car> Yes </own_ca...   

                                               label  
0  <approved> No </approved> <reason> This applic...  
1  <approved> No </approved> <reason> This applic...  
2  <approved> No </approved> <reason> This applic...  
3  <approved> No </approved> <reason> This applic...  
4  <approved> No </approved> <reason> This applic...  


In [9]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_final)
dataset['text'][3]

'<gender> Female </gender> <own_car> Yes </own_car> <own_realty> Yes </own_realty> <children> 1 children </children> <income> 112500.0 </income> <education> Secondary / secondary special </education> <family_status> Married </family_status> <housing_type> House / apartment </housing_type> <age_days> -15109 </age_days> <days_employed> -1956 </days_employed> <mobile> Yes </mobile> <work_phone> No </work_phone> <phone> No </phone> <email> No </email> <job> Core staff </job> <begin_months> -3 </begin_months>'

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
print(modules)

['k_proj', 'q_proj', 'down_proj', 'up_proj', 'o_proj', 'v_proj', 'gate_proj']


In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [12]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=10,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [13]:
from datasets import Dataset

# Assuming dataset is a Dataset object with columns 'text' and 'label'
train_test_split = dataset.train_test_split(test_size=0.2)  # Split into 80% train, 20% test
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",  # The column containing the input text
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

In [14]:
trainer.train()



Step,Training Loss,Validation Loss
552,0.1811,0.248998
1104,0.2045,0.236214
1656,0.1207,0.247008
2208,0.1242,0.310113
2760,0.0633,0.376329














TrainOutput(global_step=2760, training_loss=0.18762998445148485, metrics={'train_runtime': 6848.3767, 'train_samples_per_second': 0.806, 'train_steps_per_second': 0.403, 'total_flos': 1.319532958666752e+16, 'train_loss': 0.18762998445148485, 'epoch': 10.0})

In [15]:
wandb.finish()

[34m[1mwandb[0m:                                                                                


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:               eval/loss ▂▁▂▅█
[34m[1mwandb[0m:            eval/runtime █▁▇▄▅
[34m[1mwandb[0m: eval/samples_per_second ▁█▂▅▄
[34m[1mwandb[0m:   eval/steps_per_second ▁█▂▅▄
[34m[1mwandb[0m:             train/epoch ▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇████████
[34m[1mwandb[0m:       train/global_step ▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
[34m[1mwandb[0m:         train/grad_norm ▇▃▃▃▂▂▁▁▂▁▂▃▂▂▄▃▅▃▄▃▅▇▄█▆▆▅▇▂▃▅▂▄█▃▅▄▄▃▄
[34m[1mwandb[0m:     train/learning_rate ████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
[34m[1mwandb[0m:              train/loss █▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:                eval/loss 0.37633
[34m[1mwandb[0m:             eval/runtime 61.3769
[34m[1mwandb[0m:  eval/samples_per_second 2.248
[34m[1mwandb[0m:    eval/steps_per_second 2.248
[34m[1mwandb[0m:               total_flos 1.319532958666

[34m[1mwandb[0m: 🚀 View run [33mancient-river-24[0m at: [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset/runs/cms4ca5b[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)


[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20241024_020359-cms4ca5b/logs[0m


In [16]:
# Instruction tailored to credit card approval context
instruction = """You are a highly knowledgeable financial advisor specializing in credit card approvals. 
    Be informative, polite, and provide clear responses to any queries regarding credit approval decisions.
    """

# Example message (user asking about credit card approval)
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "Can I know why my credit card application was rejected? My age is 30, income is $40,000, and credit score is 580."}
]

# Generate the prompt using the chat template (assuming tokenizer.apply_chat_template is a custom method for your setup)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate model outputs (adjusting parameters if necessary)
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode the model's response
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the assistant's response (assuming the response begins after the 'assistant' token)
print(text.split("assistant")[1])

# Save the fine-tuned model and tokenizer for future use
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)



Hello! I'd be happy to help you understand why your credit card application was rejected. Based on your income and age, I would have expected your credit score to be higher. However, there are a few factors that may have contributed to the rejection.

Your credit score is 580, which is considered good, but it's not excellent. The main reason for the rejection is that your credit score is not high enough to qualify you for a credit card. The credit score requirements for a credit card vary by credit card issuer, but generally, a score of 11 or higher is required for approval. Since your credit score is 580, you may be eligible for a credit card, but the issuer may have other requirements or restrictions that you need




('./models/llama-3.2-3b-CC/tokenizer_config.json',
 './models/llama-3.2-3b-CC/special_tokens_map.json',
 './models/llama-3.2-3b-CC/tokenizer.json')

In [17]:
# Example custom prompt provided by the user
custom_prompt = "Age: 27.83, CreditScore: 5, Income: 3, YearsEmployed: 3.75, Gender: Male, Married: Yes, Industry: Industrials, Ethnicity: White, PriorDefault: Yes, Employed: Yes"

# You don't need a system message if you are simply testing this input directly
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": custom_prompt}
]

# Generate the prompt using the chat template (if using custom chat template generation)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the custom prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate output from the model
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode the model's response
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the assistant's response
print(response_text.split("assistant")[1])



Hello! I'd be happy to help you with your credit card approval. Can you tell me a little bit about your credit history and what you're looking for? 

Also, I noticed you have a credit score of 5. What type of credit card are you interested in? Are you looking for a specific industry or a particular benefit?

And, I see you have a job with Industrials. How long have you been with the company? 

Also, I noticed you have a prior default. Can you tell me a bit more about that? 

Lastly, I have to ask, what's your income? 

Let me summarize what I have so far: 
- Age: 27.83
- CreditScore: 5
