In [54]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    TrainerCallback,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format
import pandas as pd

In [55]:
# huggingface-cli login --token key   
# wandb login --relogin key

In [56]:
import pandas as pd
df = pd.read_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/target-augmented.csv')
df.drop(['Approved','Explanation'], axis=1, inplace=True)
df.head()

Unnamed: 0,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Prediction,Actual,Reason
0,1,1,0,427500.0,4,1,0,4,32,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
1,1,1,0,427500.0,4,1,0,4,32,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
2,1,1,0,112500.0,4,5,1,1,58,1134.0,0.0,0.0,0.0,16,2.0,Approved,Approved,"This application was approved due to -0.77, Ma..."
3,0,1,0,270000.0,0,5,3,1,52,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email, Nu..."
4,0,1,0,270000.0,0,5,3,1,52,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email, Nu..."


In [57]:
print(df['Reason'].iloc[0])

This application was approved due to Employment Duration (Days), -0.77, Housing Type, Marital Status, Work Phone, Number of Children, -0.15, -0.81, Family Size.


In [48]:
import re

# Function to remove numbers and clean the Explanation column
def clean_explanation(text):
    return re.sub(r'[-+]?\d*\.\d+|\d+', '', text).replace(', ,', ',').replace(' ,', ',').strip(", ")

# Apply the cleaning function to the Explanation column
df["Reason"] = df["Reason"].apply(clean_explanation)

# Display the DataFrame with the cleaned Explanation column
print(df['Reason'].iloc[0])
df.head()

This application was approved due to Employment Duration (Days), Housing Type, Marital Status, Number of Children, Work Phone, Annual Income.


Unnamed: 0,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Prediction,Actual,Reason
0,1,1,0,427500.0,4,1,0,4,32,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
1,1,1,0,427500.0,4,1,0,4,32,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
2,1,1,0,112500.0,4,5,1,1,58,1134.0,0.0,0.0,0.0,16,2.0,Approved,Approved,"This application was approved due to, Marital ..."
3,0,1,0,270000.0,0,5,3,1,52,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email, Nu..."
4,0,1,0,270000.0,0,5,3,1,52,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email, Nu..."


In [49]:
df.rename(columns={'Age (Days)': 'Age (Years)'}, inplace=True)
df.rename(columns={'Employment Duration' : 'Employment Duration (Days)'}, inplace=True)
df['Age (Years)'] = df['Age (Years)'].astype(str) + " years old"

In [50]:
# Display the DataFrame with the cleaned Explanation column
print(df['Reason'].iloc[0])
df.head()

This application was approved due to Employment Duration (Days), Housing Type, Marital Status, Number of Children, Work Phone, Annual Income.


Unnamed: 0,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Years),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Prediction,Actual,Reason
0,1,1,0,427500.0,4,1,0,4,32 years old,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
1,1,1,0,427500.0,4,1,0,4,32 years old,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
2,1,1,0,112500.0,4,5,1,1,58 years old,1134.0,0.0,0.0,0.0,16,2.0,Approved,Approved,"This application was approved due to, Marital ..."
3,0,1,0,270000.0,0,5,3,1,52 years old,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email, Nu..."
4,0,1,0,270000.0,0,5,3,1,52 years old,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email, Nu..."


In [51]:
# Function to dynamically replace labels with row values
def insert_numbers_dynamically(row):
    reason = row['Reason']
    
    # Extract all parts of the reason where dynamic replacement is needed
    labels = re.findall(r'([A-Za-z\s()]+)', reason)
    
    for label in labels:
        # Match the label to the corresponding column name
        column_name = None
        for col in df.columns:
            # Normalize column names and labels for matching
            normalized_col = re.sub(r'[\s()]+', '', col).lower()
            normalized_label = re.sub(r'[\s()]+', '', label).lower()
            
            if normalized_col == normalized_label:
                column_name = col
                break
        
        if column_name and column_name in row:  # Ensure column exists in the DataFrame
            # Replace the label with the corresponding value from the row
            value = row[column_name]
            # Ensure proper replacement in the text
            reason = reason.replace(label, f"{label.strip()} {value}")
    
    return reason

In [52]:
# Apply the function to each row in the DataFrame
df['Reason'] = df.apply(insert_numbers_dynamically, axis=1)

# Display the updated DataFrame
df.head()

Unnamed: 0,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Years),Employment Duration (Days),Work Phone,Phone,Email,Occupation,Family Size,Prediction,Actual,Reason
0,1,1,0,427500.0,4,1,0,4,32 years old,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
1,1,1,0,427500.0,4,1,0,4,32 years old,4542.0,1.0,0.0,0.0,18,2.0,Approved,Approved,This application was approved due to Employmen...
2,1,1,0,112500.0,4,5,1,1,58 years old,1134.0,0.0,0.0,0.0,16,2.0,Approved,Approved,"This application was approved due to,Marital S..."
3,0,1,0,270000.0,0,5,3,1,52 years old,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email,Num..."
4,0,1,0,270000.0,0,5,3,1,52 years old,3051.0,0.0,1.0,1.0,14,1.0,Approved,Approved,"This application was approved due to Email,Num..."


In [53]:
# Convert the first row to a dictionary
first_row_dict = df.iloc[0].to_dict()

# Print only the Reason string
print(first_row_dict['Reason'])

This application was approved due to Employment Duration (Days),Housing Type 4,Marital Status 0,Number of Children 0,Work Phone 1.0,Annual Income 427500.0.


In [11]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "/opt/notebooks/Chatbot-Credit-Card/models/llama-3.2-3b-CC"

In [12]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [None]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_final)
dataset['text'][3]

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
print(modules)

In [None]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [None]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
from datasets import Dataset

# Assuming dataset is a Dataset object with columns 'text' and 'label'
train_test_split = dataset.train_test_split(test_size=0.2)  # Split into 80% train, 20% test
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [None]:
# Custom Callback to save model every 5 epochs
class SaveEveryNEpochsCallback(TrainerCallback):
    def __init__(self, save_every_n_epochs, output_dir):
        self.save_every_n_epochs = save_every_n_epochs
        self.output_dir = output_dir

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.epoch % self.save_every_n_epochs == 0:
            model_save_path = os.path.join(self.output_dir, f"checkpoint-{int(state.epoch)}-epochs")
            kwargs["model"].save_pretrained(model_save_path)
            kwargs["tokenizer"].save_pretrained(model_save_path)
            print(f"Model saved at {model_save_path}")

In [None]:
run = wandb.init(
    project='Fine-tune Llama 3 on CC Dataset', 
    job_type="training", 
    anonymous="allow"
)

In [None]:
# Adjust the training arguments for the single epoch run
single_epoch_training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,  # Only run for one epoch first
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

# Train for one epoch and save as new_model + "-1-epoch"
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=single_epoch_training_arguments,
    packing=False,
)
trainer.train()
model.save_pretrained(new_model + "-1-epoch")
tokenizer.save_pretrained(new_model + "-1-epoch")
print(f"Model saved after 1 epoch at {new_model + '-1-epoch'}")

In [None]:
from transformers import EarlyStoppingCallback
early_stopping_patience = 5  # Number of eval steps with no improvement before stopping

# Now, reconfigure for full training with 100 epochs and save every 5 epochs
full_training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=100,  # Set to full training epochs
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
# Run the full training with periodic saving
trainer.args = full_training_arguments  # Update the training arguments
trainer.add_callback(SaveEveryNEpochsCallback(save_every_n_epochs=5, output_dir=new_model))
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=early_stopping_patience))

trainer.train()
wandb.finish()

In [None]:
# Instruction tailored to credit card approval context
instruction = """You are a highly knowledgeable financial advisor specializing in credit card approvals. 
    Be informative, polite, and provide clear responses to any queries regarding credit approval decisions.
    """

# Example message (user asking about credit card approval)
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": "Can I know why my credit card application was rejected? My age is 30, income is $40,000, and credit score is 580."}
]

# Generate the prompt using the chat template (assuming tokenizer.apply_chat_template is a custom method for your setup)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate model outputs (adjusting parameters if necessary)
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode the model's response
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the assistant's response (assuming the response begins after the 'assistant' token)
print(text.split("assistant")[1])

# Save the fine-tuned model and tokenizer for future use
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)


In [None]:
# Example custom prompt provided by the user
custom_prompt = "Age: 27.83, CreditScore: 5, Income: 3, YearsEmployed: 3.75, Gender: Male, Married: Yes, Industry: Industrials, Ethnicity: White, PriorDefault: Yes, Employed: Yes"

# You don't need a system message if you are simply testing this input directly
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": custom_prompt}
]

# Generate the prompt using the chat template (if using custom chat template generation)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the custom prompt
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate output from the model
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode the model's response
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the assistant's response
print(response_text.split("assistant")[1])
