In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    TrainerCallback,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format
import pandas as pd

In [2]:
# huggingface-cli login --token key   
# wandb login --relogin key

In [3]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
#https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction
# Load the application_record dataset
data = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/application_record.csv")
# Load the credit_record dataset
record = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/credit_record.csv")
# Find the first account open month for each user
begin_month = record.loc[record.groupby("ID")["MONTHS_BALANCE"].idxmin()]
begin_month = begin_month.rename(columns={"MONTHS_BALANCE": "begin_month"})

# Merge the datasets
df = pd.merge(data, begin_month, how="left", on="ID")
print("Datasets loaded and merged successfully.")
# Define approval logic based on multiple criteria
def determine_approval(row):
    # Define custom approval logic
    if row["STATUS"] in ["0", "1", "C", "X"]:  # Good credit status
            return 1  # Approved
    return 0  # Default to denial if STATUS is bad or missing

# Apply logic to determine approval (filling missing STATUS values first)
record["STATUS"] = record["STATUS"].fillna("X")  # Handle missing values
record["Approved"] = record.apply(determine_approval, axis=1)
# Aggregate approval status for each ID (disapproval if any ID has disqualifying criteria)
approval_status = record.groupby("ID")["Approved"].min().reset_index()
# Merge approval status back into the main dataset, avoiding "_x" and "_y" columns
df = pd.merge(data, approval_status, how="left", on="ID")
df["Approved"] = df["Approved"].fillna(0).astype(int)  # Fill missing approvals as denial
print("Approval status merged successfully.")
# Preprocess the 'DAYS_BIRTH' column to convert days to years
df['DAYS_BIRTH'] = (-df['DAYS_BIRTH'] // 365).fillna(0).astype(int)
df.drop(columns=['ID'], inplace=True)
# Preprocess the 'DAYS_EMPLOYED' column to get absolute values and handle unemployment
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].apply(lambda x: abs(x) if x < 0 else 0)
# Handle missing or infinite values in numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())  # Fill NaN with median values
print("Preprocessing completed successfully.")

# Define the feature mapping dictionary
feature_mapping = {
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car Ownership',
    'FLAG_OWN_REALTY': 'Property Ownership',
    'CNT_CHILDREN': 'Number of Children',
    'AMT_INCOME_TOTAL': 'Annual Income',
    'NAME_INCOME_TYPE': 'Income Category',
    'NAME_EDUCATION_TYPE': 'Education Level',
    'NAME_FAMILY_STATUS': 'Marital Status',
    'NAME_HOUSING_TYPE': 'Housing Type',
    'DAYS_BIRTH': 'Age (Days)',
    'DAYS_EMPLOYED': 'Employment Duration (Days)',
    'FLAG_MOBIL': 'Mobile Phone',
    'FLAG_WORK_PHONE': 'Work Phone',
    'FLAG_PHONE': 'Phone',
    'FLAG_EMAIL': 'Email',
    'OCCUPATION_TYPE': 'Occupation',
    'CNT_FAM_MEMBERS': 'Family Size',
    'STATUS': 'Credit Status'
}

# Rename the columns in the DataFrame using the mapping
df.rename(columns=feature_mapping, inplace=True)

# Display the first few rows to confirm the changes
df.head()
df.rename(columns={'TARGET': 'Approved'}, inplace=True)
# Display the first few rows to confirm the change
df.head()

Datasets loaded and merged successfully.


Approval status merged successfully.
Preprocessing completed successfully.


Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [4]:
explanation_df = pd.read_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/explanations_df.csv')
# explanation_df.drop(['Approved','Explanation'], axis=1, inplace=True)
explanation_df.head()

Unnamed: 0,Prediction,Actual,Explanation
0,Approved,Approved,"This application was approved due to -0.77, Ma..."
1,Approved,Approved,This application was approved due to Employmen...
2,Approved,Approved,This application was approved due to Employmen...
3,Approved,Approved,"This application was approved due to Email, -0..."
4,Approved,Approved,"This application was approved due to Email, Nu..."


In [5]:
print(explanation_df['Explanation'].iloc[0])

This application was approved due to -0.77, Marital Status, -0.15, Number of Children, -0.81, Age (Days), -1.05, Annual Income, Family Size.


In [6]:
import re

# Function to remove numbers and clean the Explanation column
def clean_explanation(text):
    return re.sub(r'[-+]?\d*\.\d+|\d+', '', text).replace(', ,', ',').replace(' ,', ',').strip(", ")

# Apply the cleaning function to the Explanation column
explanation_df["Explanation"] = explanation_df["Explanation"].apply(clean_explanation)

In [7]:
# Display the DataFrame with the cleaned Explanation column
print(explanation_df['Explanation'].iloc[0])
explanation_df.head()

This application was approved due to, Marital Status, Number of Children, Age (Days), Annual Income, Family Size.


Unnamed: 0,Prediction,Actual,Explanation
0,Approved,Approved,"This application was approved due to, Marital ..."
1,Approved,Approved,This application was approved due to Employmen...
2,Approved,Approved,This application was approved due to Employmen...
3,Approved,Approved,"This application was approved due to Email, Nu..."
4,Approved,Approved,"This application was approved due to Email, Nu..."


In [8]:
# df.rename(columns={'Age (Days)': 'Age (Years)'}, inplace=True)
df.rename(columns={'Employment Duration' : 'Employment Duration (Days)'}, inplace=True)
df['Age (Days)'] = df['Age (Days)'].astype(str) + " years old"

In [9]:
# Combine explanation_df and df based on index
df['Reason'] = explanation_df['Explanation'].reset_index(drop=True)

# Display the first value in the 'Reason' column
print(df['Reason'].iloc[0])
df.head()

This application was approved due to, Marital Status, Number of Children, Age (Days), Annual Income, Family Size.


Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,"This application was approved due to, Marital ..."
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1,This application was approved due to Employmen...
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email, Nu..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email, Nu..."


In [10]:
# Function to dynamically replace labels with row values
def insert_numbers_dynamically(row):
    reason = row['Reason']
    
    # Extract all parts of the reason where dynamic replacement is needed
    labels = re.findall(r'([A-Za-z\s()]+)', reason)
    
    for label in labels:
        # Match the label to the corresponding column name
        column_name = None
        for col in df.columns:
            # Normalize column names and labels for matching
            normalized_col = re.sub(r'[\s()]+', '', col).lower()
            normalized_label = re.sub(r'[\s()]+', '', label).lower()
            
            if normalized_col == normalized_label:
                column_name = col
                break
        
        if column_name and column_name in row:  # Ensure column exists in the DataFrame
            # Replace the label with the corresponding value from the row
            value = row[column_name]
            # Ensure proper replacement in the text
            reason = reason.replace(label, f"{label.strip()} {value}")
    
    return reason

In [11]:
# Apply the function to each row in the DataFrame
df['Reason'] = df.apply(insert_numbers_dynamically, axis=1)

# Display the updated DataFrame
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,"This application was approved due to,Marital S..."
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1,This application was approved due to Employmen...
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."


In [12]:
# Convert "Employment Duration (Days)" to years in-place
df["Employment Duration (Days)"] = df["Employment Duration (Days)"].apply(lambda x: round(x / 365.0, 2) if not pd.isna(x) else np.nan)
df.rename(columns={"Age (Days)": "Age (Years)", "Employment Duration (Days)": "Employment Duration (Years)"}, inplace=True)
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Years),Employment Duration (Years),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,12.44,1.0,1.0,0.0,0.0,,2.0,1,"This application was approved due to,Marital S..."
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,12.44,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,3.11,1.0,0.0,0.0,0.0,Security staff,2.0,1,This application was approved due to Employmen...
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,8.36,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,8.36,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."


In [13]:
# Step 2: Update the "Reason" column to match the new labels
def update_reason(reason_text):
    reason_text = reason_text.replace("Age (Days)", "Age (Years)")
    reason_text = reason_text.replace("Employment Duration (Days)", "Employment Duration (Years)")
    return reason_text

df["Reason"] = df["Reason"].apply(update_reason)

In [14]:
# Display the first value in the 'Reason' column
print(df['Reason'].iloc[0])

This application was approved due to,Marital Status Civil marriage,Number of Children 0,Age (Years) 32 years old,Annual Income 427500.0,Family Size 2.0.


In [15]:
# Generate input and label text with tags
def generate_tagged_input_and_label(row):
    # Generate label
    label = f"<label> {row['Reason']} </label>"

    # Generate input, skipping NaN fields
    input_parts = [
        f"<gender> {'Male' if row['Gender'] == 'M' else 'Female'} </gender>" if pd.notna(row['Gender']) else None,
        f"<car_ownership> {'yes' if row['Car Ownership'] == 'Y' else 'no'} </car_ownership>" if pd.notna(row['Car Ownership']) else None,
        f"<property_ownership> {'yes' if row['Property Ownership'] == 'Y' else 'no'} </property_ownership>" if pd.notna(row['Property Ownership']) else None,
        f"<number_of_children> {int(row['Number of Children'])} </number_of_children>" if pd.notna(row['Number of Children']) else None,
        f"<annual_income> {row['Annual Income']} </annual_income>" if pd.notna(row['Annual Income']) else None,
        f"<income_category> {row['Income Category']} </income_category>" if pd.notna(row['Income Category']) else None,
        f"<education_level> {row['Education Level']} </education_level>" if pd.notna(row['Education Level']) else None,
        f"<marital_status> {row['Marital Status']} </marital_status>" if pd.notna(row['Marital Status']) else None,
        f"<housing_type> {row['Housing Type']} </housing_type>" if pd.notna(row['Housing Type']) else None,
        f"<age> {row['Age (Years)']} years old </age>" if pd.notna(row['Age (Years)']) else None,
        f"<employment_duration> {row['Employment Duration (Years)']} years </employment_duration>" if pd.notna(row['Employment Duration (Years)']) else None,
        f"<mobile_phone> {'yes' if row['Mobile Phone'] == 1 else 'no'} </mobile_phone>" if pd.notna(row['Mobile Phone']) else None,
        f"<work_phone> {'yes' if row['Work Phone'] == 1 else 'no'} </work_phone>" if pd.notna(row['Work Phone']) else None,
        f"<email> {'yes' if row['Email'] == 1 else 'no'} </email>" if pd.notna(row['Email']) else None,
        f"<family_size> {row['Family Size']} </family_size>" if pd.notna(row['Family Size']) else None,
    ]
    input_text = " ".join([part for part in input_parts if part is not None])

    return f"<input> {input_text} </input>", label

# Apply the function to each row
df["text"], df["label"] = zip(*df.apply(generate_tagged_input_and_label, axis=1))

# Keep only the required columns
final_df = df[["text", "label"]]

In [16]:
# Display the first value in the 'Reason' column
print(final_df['label'].iloc[0])
print(final_df['text'].iloc[0])
final_df.head()

<label> This application was approved due to,Marital Status Civil marriage,Number of Children 0,Age (Years) 32 years old,Annual Income 427500.0,Family Size 2.0. </label>
<input> <gender> Male </gender> <car_ownership> yes </car_ownership> <property_ownership> yes </property_ownership> <number_of_children> 0 </number_of_children> <annual_income> 427500.0 </annual_income> <income_category> Working </income_category> <education_level> Higher education </education_level> <marital_status> Civil marriage </marital_status> <housing_type> Rented apartment </housing_type> <age> 32 years old years old </age> <employment_duration> 12.44 years </employment_duration> <mobile_phone> yes </mobile_phone> <work_phone> yes </work_phone> <email> no </email> <family_size> 2.0 </family_size> </input>


Unnamed: 0,text,label
0,<input> <gender> Male </gender> <car_ownership...,"<label> This application was approved due to,M..."
1,<input> <gender> Male </gender> <car_ownership...,<label> This application was approved due to E...
2,<input> <gender> Male </gender> <car_ownership...,<label> This application was approved due to E...
3,<input> <gender> Female </gender> <car_ownersh...,<label> This application was approved due to E...
4,<input> <gender> Female </gender> <car_ownersh...,<label> This application was approved due to E...


In [17]:
import random

# Define templates for varied inputs
input_templates = [
    "My gender is {gender}, I own a car {car}, I own a house {house}, I make {income} annually, I am {job_status}, I have {education}, I am {marital_status}, I live in {housing}, I am {age} years old, I have {children} children.",
    "I am a {gender} earning {income} per year. I am {marital_status}, living in a {housing}, and I am {age} years old. I have {children} children and {education}. My job status is {job_status}.",
    "{gender}, {car}, {house}, earning {income}, {job_status}, {education}, {marital_status}, {housing}, {age} years old, {children} children.",
    "I earn {income} yearly and am {marital_status}. I live in a {housing}, I am {age} years old, and I have {children} children.",
    "My details: {gender}, Car Ownership: {car}, Property Ownership: {house}, Annual Income: {income}, Education: {education}, Marital Status: {marital_status}, Housing Type: {housing}, Age: {age}, Children: {children}.",
    "I am a {gender} who owns a car ({car}) and a house ({house}). I earn {income} per year and am {marital_status}. My education level is {education}, and I live in a {housing}. I am {age} years old and have {children} children. My job is {job_status}.",
    "I am {age} years old and make {income} annually. I am {marital_status}, living in a {housing}. I own a car: {car}. I own a house: {house}. I have {children} children and {education}.",
    "I earn {income} yearly and have {children} children. I am a {gender} who owns a car ({car}) and a house ({house}). I am {marital_status} and live in a {housing}. My education level is {education}, and I am {age} years old.",
    "I have {education} and make {income} yearly. I am {age} years old and {job_status}. I own a car ({car}) and a house ({house}). I am {marital_status} and live in a {housing}. I have {children} children.",
    "{gender} with an annual income of {income}, {job_status}, living in a {housing}. {marital_status}, with {education}. I own a car ({car}) and a house ({house}). Age: {age}, Children: {children}.",
    "I make {income} per year and live in a {housing}. I am {marital_status} with {education}, {age} years old, and have {children} children. Car ownership: {car}, Property ownership: {house}.",
    "I am {age} years old, {marital_status}, and earn {income} per year. I live in a {housing} and have {children} children. Car: {car}, House: {house}, Education: {education}.",
    "I am a {gender} who earns {income} annually. I have {children} children and am {age} years old. I live in a {housing} and own a car ({car}) and a house ({house}).",
    "{gender}, {age} years old, earns {income}, lives in a {housing}, is {marital_status}, owns a car: {car}, owns a house: {house}, has {education}, and {children} children.",
    "{gender}, owns a car: {car}, owns a house: {house}, earns {income}, is {age} years old, {education}, {marital_status}, {housing}, {children} children."
]

# Generate varied natural-language inputs
def generate_varied_inputs(row):
    # Choose a random template
    template = random.choice(input_templates)
    
    # Fill placeholders with actual values or defaults
    filled_input = template.format(
        gender="Male" if row['Gender'] == 'M' else "Female",
        car="yes" if row['Car Ownership'] == 'Y' else "no",
        house="yes" if row['Property Ownership'] == 'Y' else "no",
        income=f"{row['Annual Income']:.2f}" if pd.notna(row['Annual Income']) else "unknown",
        job_status="working" if row['Income Category'] == "Working" else "not working",
        education=row['Education Level'] if pd.notna(row['Education Level']) else "unknown education level",
        marital_status=row['Marital Status'] if pd.notna(row['Marital Status']) else "unknown marital status",
        housing=row['Housing Type'] if pd.notna(row['Housing Type']) else "unknown housing type",
        age=f"{row['Age (Years)']}" if pd.notna(row['Age (Years)']) else "unknown age",
        children=int(row['Number of Children']) if pd.notna(row['Number of Children']) else "unknown number of"
    )
    return filled_input
random_df = df.copy()
# Apply the varied input generation to each row
random_df["text"] = random_df.apply(generate_varied_inputs, axis=1)

# Keep the label (reason) structured and tagged
random_df["label"] = random_df["Reason"].apply(lambda reason: f"<label> {reason} </label>")

# Resulting dataframe with varied inputs and structured labels
random_df = random_df[["text", "label"]]
random_df.head()

Unnamed: 0,text,label
0,I have Higher education and make 427500.00 yea...,"<label> This application was approved due to,M..."
1,"My gender is Male, I own a car yes, I own a ho...",<label> This application was approved due to E...
2,I have Secondary / secondary special and make ...,<label> This application was approved due to E...
3,I am a Female earning 270000.00 per year. I am...,<label> This application was approved due to E...
4,I am 52 years old years old and make 270000.00...,<label> This application was approved due to E...


In [18]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "/opt/notebooks/Chatbot-Credit-Card/models/llama-3.2-3b-CC/"

In [19]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [20]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
from datasets import Dataset
# Convert DataFrames to HuggingFace datasets
final_dataset = Dataset.from_pandas(final_df)
random_dataset = Dataset.from_pandas(random_df)

# Split datasets into train and eval
final_split = final_dataset.train_test_split(test_size=0.2)
random_split = random_dataset.train_test_split(test_size=0.2)

final_train_dataset = final_split['train']
final_eval_dataset = final_split['test']

random_train_dataset = random_split['train']
random_eval_dataset = random_split['test']

In [22]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
print(modules)

['k_proj', 'v_proj', 'up_proj', 'o_proj', 'q_proj', 'down_proj', 'gate_proj']


In [23]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [24]:
from transformers import TrainerCallback, TrainerState, TrainerControl
# Custom Callback to save model every 5 epochs
class SaveEveryNEpochsCallback(TrainerCallback):
    def __init__(self, save_every_n_epochs, output_dir):
        self.save_every_n_epochs = save_every_n_epochs
        self.output_dir = output_dir

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.epoch % self.save_every_n_epochs == 0:
            model_save_path = os.path.join(self.output_dir, f"checkpoint-{int(state.epoch)}-epochs")
            kwargs["model"].save_pretrained(model_save_path)
            kwargs["tokenizer"].save_pretrained(model_save_path)
            print(f"Model saved at {model_save_path}")

In [25]:
run = wandb.init(
    project='Fine-tune Llama 3 on CC Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mdnicho26[0m ([33mdnicho26-university-of-north-carolina-at-charlotte[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.18.5


[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/notebooks/Chatbot-Credit-Card/backend/notebooks/wandb/run-20241113_194531-raqujeed[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.


[34m[1mwandb[0m: Syncing run [33mworldly-forest-47[0m


[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset[0m


[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset/runs/raqujeed[0m


In [26]:
from datasets import Dataset
from transformers import TrainingArguments, EarlyStoppingCallback

# Training arguments template
def get_training_arguments(output_dir, num_epochs):
    return TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=2,
        optim="paged_adamw_32bit",
        num_train_epochs=num_epochs,
        eval_strategy="steps",
        eval_steps=50,  # Adjust eval steps based on dataset size
        logging_steps=50,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=2e-4,
        fp16=False,
        bf16=False,
        group_by_length=True,
        report_to="wandb"
    )

# Training and evaluation function
def train_and_save_model(train_dataset, eval_dataset, output_suffix, num_epochs=100, save_every_n_epochs=5):
    # Save first epoch model for debugging
    print(f"Training for 1 epoch to save a debug model...")
    debug_training_args = get_training_arguments(new_model + output_suffix + "-1-epoch", num_epochs=1)

    debug_trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=debug_training_args,
        packing=False,
    )

    # Train for one epoch and save
    debug_trainer.train()
    model.save_pretrained(new_model + output_suffix + "-1-epoch")
    tokenizer.save_pretrained(new_model + output_suffix + "token-1-epoch")
    print(f"Debug model saved after 1 epoch at {new_model + output_suffix + '-1-epoch'}")

    # Continue training for the full epochs
    print(f"Continuing training for {num_epochs} epochs...")
    full_training_args = get_training_arguments(new_model + output_suffix, num_epochs)

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=full_training_args,
        packing=False,
    )

    # Add early stopping and periodic saving callbacks
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=5))
    trainer.add_callback(SaveEveryNEpochsCallback(save_every_n_epochs=save_every_n_epochs, output_dir=new_model + output_suffix))

    # Train the model
    trainer.train()

    # Save the final trained model
    model.save_pretrained(new_model + output_suffix + "-final")
    tokenizer.save_pretrained(new_model + output_suffix + "token-final")
    print(f"Final model saved at {new_model + output_suffix + '-final'}")

# Train and save models for both datasets
# Final dataset
print("Training on final dataset...")
train_and_save_model(final_train_dataset, final_eval_dataset, output_suffix="final", num_epochs=100)

# Random dataset
print("Training on random dataset...")
train_and_save_model(random_train_dataset, random_eval_dataset, output_suffix="random", num_epochs=100)

# Close WandB session
wandb.finish()

Training on final dataset...
Training for 1 epoch to save a debug model...


Map:   0%|          | 0/6759 [00:00<?, ? examples/s]

Map:   0%|          | 0/1690 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
50,0.4371,0.158843
100,0.1448,0.14493
150,0.137,0.150749
200,0.1346,0.152291
250,0.1361,0.155325
300,0.1309,0.15676
350,0.1352,0.147944
400,0.1357,0.145213
450,0.1305,0.141195
500,0.1285,0.141436


In [None]:
# Load the first epoch model for response generation
from transformers import AutoTokenizer, AutoModelForCausalLM

debug_model_path = new_model + "-final-1-epoch"
tokenizer = AutoTokenizer.from_pretrained(debug_model_path)
model = AutoModelForCausalLM.from_pretrained(debug_model_path).to("cuda")

# Example messages for testing
messages = [
    {"role": "system", "content": "You are a highly knowledgeable financial advisor specializing in credit card approvals."},
    {"role": "user", "content": "Why was my application rejected? My age is 30, income is $40,000, and credit score is 580."}
]

# Generate prompt using chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")

# Generate model outputs
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Model Response:")
print(response.split("assistant")[1])
