<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [30]</a>'.</span>

In [1]:
import os
import torch
import wandb
import numpy as np
import pandas as pd
import warnings
import random
import re

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    TrainerCallback,
    EarlyStoppingCallback,
    TrainerState,
    TrainerControl,
)
from datasets import load_dataset, Dataset
from trl import SFTTrainer, setup_chat_format
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import bitsandbytes as bnb

In [2]:
# huggingface-cli login --token key   
# wandb login --relogin key

In [3]:
warnings.filterwarnings("ignore")
#https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction
# Load the application_record dataset
data = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/application_record.csv")
# Load the credit_record dataset
record = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/credit_record.csv")
# Find the first account open month for each user
begin_month = record.loc[record.groupby("ID")["MONTHS_BALANCE"].idxmin()]
begin_month = begin_month.rename(columns={"MONTHS_BALANCE": "begin_month"})

# Merge the datasets
df = pd.merge(data, begin_month, how="left", on="ID")
print("Datasets loaded and merged successfully.")
# Define approval logic based on multiple criteria
def determine_approval(row):
    # Define custom approval logic
    if row["STATUS"] in ["0", "1", "C", "X"]:  # Good credit status
            return 1  # Approved
    return 0  # Default to denial if STATUS is bad or missing

# Apply logic to determine approval (filling missing STATUS values first)
record["STATUS"] = record["STATUS"].fillna("X")  # Handle missing values
record["Approved"] = record.apply(determine_approval, axis=1)
# Aggregate approval status for each ID (disapproval if any ID has disqualifying criteria)
approval_status = record.groupby("ID")["Approved"].min().reset_index()
# Merge approval status back into the main dataset, avoiding "_x" and "_y" columns
df = pd.merge(data, approval_status, how="left", on="ID")
df["Approved"] = df["Approved"].fillna(0).astype(int)  # Fill missing approvals as denial
print("Approval status merged successfully.")
# Preprocess the 'DAYS_BIRTH' column to convert days to years
df['DAYS_BIRTH'] = (-df['DAYS_BIRTH'] // 365).fillna(0).astype(int)
df.drop(columns=['ID'], inplace=True)
# Preprocess the 'DAYS_EMPLOYED' column to get absolute values and handle unemployment
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].apply(lambda x: abs(x) if x < 0 else 0)
# Handle missing or infinite values in numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())  # Fill NaN with median values
print("Preprocessing completed successfully.")

# Define the feature mapping dictionary
feature_mapping = {
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car Ownership',
    'FLAG_OWN_REALTY': 'Property Ownership',
    'CNT_CHILDREN': 'Number of Children',
    'AMT_INCOME_TOTAL': 'Annual Income',
    'NAME_INCOME_TYPE': 'Income Category',
    'NAME_EDUCATION_TYPE': 'Education Level',
    'NAME_FAMILY_STATUS': 'Marital Status',
    'NAME_HOUSING_TYPE': 'Housing Type',
    'DAYS_BIRTH': 'Age (Days)',
    'DAYS_EMPLOYED': 'Employment Duration (Days)',
    'FLAG_MOBIL': 'Mobile Phone',
    'FLAG_WORK_PHONE': 'Work Phone',
    'FLAG_PHONE': 'Phone',
    'FLAG_EMAIL': 'Email',
    'OCCUPATION_TYPE': 'Occupation',
    'CNT_FAM_MEMBERS': 'Family Size',
    'STATUS': 'Credit Status'
}

# Rename the columns in the DataFrame using the mapping
df.rename(columns=feature_mapping, inplace=True)

# Display the first few rows to confirm the changes
df.head()
df.rename(columns={'TARGET': 'Approved'}, inplace=True)
# Display the first few rows to confirm the change
df.head()

Datasets loaded and merged successfully.


Approval status merged successfully.
Preprocessing completed successfully.


Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [4]:
explanation_df = pd.read_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/explanations_df.csv')
# explanation_df.drop(['Approved','Explanation'], axis=1, inplace=True)
explanation_df.head()

Unnamed: 0,Prediction,Actual,Explanation
0,Approved,Approved,This application was approved due to Employmen...
1,Approved,Approved,This application was approved due to Employmen...
2,Approved,Approved,"This application was approved due to -0.77, Ma..."
3,Approved,Approved,"This application was approved due to Email, -0..."
4,Denied,Approved,"This application was denied due to -1.62, Emai..."


In [5]:
print(explanation_df['Explanation'].iloc[0])

This application was approved due to Employment Duration (Days), Housing Type, -0.77, Marital Status, Work Phone, -0.15, Number of Children, -0.81, Family Size, Annual Income.


In [6]:
# Function to remove numbers and clean the Explanation column
def clean_explanation(text):
    return re.sub(r'[-+]?\d*\.\d+|\d+', '', text).replace(', ,', ',').replace(' ,', ',').strip(", ")

# Apply the cleaning function to the Explanation column
explanation_df["Explanation"] = explanation_df["Explanation"].apply(clean_explanation)

In [7]:
# Display the DataFrame with the cleaned Explanation column
print(explanation_df['Explanation'].iloc[0])
explanation_df.head()

This application was approved due to Employment Duration (Days), Housing Type, Marital Status, Work Phone, Number of Children, Family Size, Annual Income.


Unnamed: 0,Prediction,Actual,Explanation
0,Approved,Approved,This application was approved due to Employmen...
1,Approved,Approved,This application was approved due to Employmen...
2,Approved,Approved,"This application was approved due to, Marital ..."
3,Approved,Approved,"This application was approved due to Email, Nu..."
4,Denied,Approved,"This application was denied due to, Email, Hou..."


In [8]:
# df.rename(columns={'Age (Days)': 'Age (Years)'}, inplace=True)
df.rename(columns={'Employment Duration' : 'Employment Duration (Days)'}, inplace=True)
df['Age (Days)'] = df['Age (Days)'].astype(str) + " years old"

In [9]:
# Combine explanation_df and df based on index
df['Reason'] = explanation_df['Explanation'].reset_index(drop=True)

# Display the first value in the 'Reason' column
print(df['Reason'].iloc[0])
df.head()

This application was approved due to Employment Duration (Days), Housing Type, Marital Status, Work Phone, Number of Children, Family Size, Annual Income.


Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1,"This application was approved due to, Marital ..."
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email, Nu..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was denied due to, Email, Hou..."


In [10]:
# Function to dynamically replace labels with row values
def insert_numbers_dynamically(row):
    reason = row['Reason']
    
    # Extract all parts of the reason where dynamic replacement is needed
    labels = re.findall(r'([A-Za-z\s()]+)', reason)
    
    for label in labels:
        # Match the label to the corresponding column name
        column_name = None
        for col in df.columns:
            # Normalize column names and labels for matching
            normalized_col = re.sub(r'[\s()]+', '', col).lower()
            normalized_label = re.sub(r'[\s()]+', '', label).lower()
            
            if normalized_col == normalized_label:
                column_name = col
                break
        
        if column_name and column_name in row:  # Ensure column exists in the DataFrame
            # Replace the label with the corresponding value from the row
            value = row[column_name]
            # Ensure proper replacement in the text
            reason = reason.replace(label, f"{label.strip()} {value}")
    
    return reason

In [11]:
# Apply the function to each row in the DataFrame
df['Reason'] = df.apply(insert_numbers_dynamically, axis=1)

# Display the updated DataFrame
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1,"This application was approved due to,Marital S..."
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was denied due to,Email 1.0,H..."


In [12]:
# Convert "Employment Duration (Days)" to years in-place
df["Employment Duration (Days)"] = df["Employment Duration (Days)"].apply(lambda x: round(x / 365.0, 2) if not pd.isna(x) else np.nan)
df.rename(columns={"Age (Days)": "Age (Years)", "Employment Duration (Days)": "Employment Duration (Years)"}, inplace=True)
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Years),Employment Duration (Years),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,12.44,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,12.44,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,3.11,1.0,0.0,0.0,0.0,Security staff,2.0,1,"This application was approved due to,Marital S..."
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,8.36,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,8.36,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was denied due to,Email 1.0,H..."


In [13]:
# Step 2: Update the "Reason" column to match the new labels
def update_reason(reason_text):
    reason_text = reason_text.replace("Age (Days)", "Age (Years)")
    reason_text = reason_text.replace("Employment Duration (Days)", "Employment Duration (Years)")
    return reason_text

df["Reason"] = df["Reason"].apply(update_reason)

In [14]:
# Display the first value in the 'Reason' column
print(df['Reason'].iloc[0])

This application was approved due to Employment Duration (Years),Housing Type Rented apartment,Marital Status Civil marriage,Work Phone 1.0,Number of Children 0,Family Size 2.0,Annual Income 427500.0.


In [15]:
def generate_tagged_input_and_label(row):
    # Generate the reasoning part of the label using the same tags as input
    reasoning_parts = [
        f"<gender> {'Male' if row['Gender'] == 'M' else 'Female'} </gender>" if pd.notna(row.get('Gender')) else None,
        f"<age> {row['Age (Years)']} years old </age>" if pd.notna(row.get('Age (Years)')) else None,
        f"<car_ownership> {'yes' if row['Car Ownership'] == 'Y' else 'no'} </car_ownership>" if pd.notna(row.get('Car Ownership')) else None,
        f"<property_ownership> {'yes' if row['Property Ownership'] == 'Y' else 'no'} </property_ownership>" if pd.notna(row.get('Property Ownership')) else None,
        f"<number_of_children> {int(row['Number of Children'])} </number_of_children>" if pd.notna(row.get('Number of Children')) else None,
        f"<annual_income> {row['Annual Income']} </annual_income>" if pd.notna(row.get('Annual Income')) else None,
        f"<income_category> {row['Income Category']} </income_category>" if pd.notna(row.get('Income Category')) else None,
        f"<education_level> {row['Education Level']} </education_level>" if pd.notna(row.get('Education Level')) else None,
        f"<marital_status> {row['Marital Status']} </marital_status>" if pd.notna(row.get('Marital Status')) else None,
        f"<housing_type> {row['Housing Type']} </housing_type>" if pd.notna(row.get('Housing Type')) else None,
        f"<employment_duration> {row['Employment Duration (Years)']} years </employment_duration>" if pd.notna(row.get('Employment Duration (Years)')) else None,
        f"<mobile_phone> {'yes' if row['Mobile Phone'] == 1 else 'no'} </mobile_phone>" if pd.notna(row.get('Mobile Phone')) else None,
        f"<work_phone> {'yes' if row['Work Phone'] == 1 else 'no'} </work_phone>" if pd.notna(row.get('Work Phone')) else None,
        f"<email> {'yes' if row['Email'] == 1 else 'no'} </email>" if pd.notna(row.get('Email')) else None,
        f"<family_size> {row['Family Size']} </family_size>" if pd.notna(row.get('Family Size')) else None,
    ]
    reasoning_text = " ".join([part for part in reasoning_parts if part is not None])
    label = f"<approval_status> Approved </approval_status> <reasoning> {reasoning_text} </reasoning>"

    # Generate input in the same way
    input_parts = reasoning_parts  # Reuse the reasoning parts for consistency
    input_text = " ".join([part for part in input_parts if part is not None])
    input_section = f"{input_text}"

    return input_section, label

def generate_plain_input_and_label(row):
    # Generate the reasoning part of the label
    reasoning_parts = [
        f"Gender: {'Male' if row['Gender'] == 'M' else 'Female'}" if pd.notna(row.get('Gender')) else None,
        f"Age: {row['Age (Years)']} years old" if pd.notna(row.get('Age (Years)')) else None,
        f"Car Ownership: {'Yes' if row['Car Ownership'] == 'Y' else 'No'}" if pd.notna(row.get('Car Ownership')) else None,
        f"Property Ownership: {'Yes' if row['Property Ownership'] == 'Y' else 'No'}" if pd.notna(row.get('Property Ownership')) else None,
        f"Number of Children: {int(row['Number of Children'])}" if pd.notna(row.get('Number of Children')) else None,
        f"Annual Income: {row['Annual Income']}" if pd.notna(row.get('Annual Income')) else None,
        f"Income Category: {row['Income Category']}" if pd.notna(row.get('Income Category')) else None,
        f"Education Level: {row['Education Level']}" if pd.notna(row.get('Education Level')) else None,
        f"Marital Status: {row['Marital Status']}" if pd.notna(row.get('Marital Status')) else None,
        f"Housing Type: {row['Housing Type']}" if pd.notna(row.get('Housing Type')) else None,
        f"Employment Duration: {row['Employment Duration (Years)']} years" if pd.notna(row.get('Employment Duration (Years)')) else None,
        f"Mobile Phone: {'Yes' if row['Mobile Phone'] == 1 else 'No'}" if pd.notna(row.get('Mobile Phone')) else None,
        f"Work Phone: {'Yes' if row['Work Phone'] == 1 else 'No'}" if pd.notna(row.get('Work Phone')) else None,
        f"Email: {'Yes' if row['Email'] == 1 else 'No'}" if pd.notna(row.get('Email')) else None,
        f"Family Size: {row['Family Size']}" if pd.notna(row.get('Family Size')) else None,
    ]
    reasoning_text = ". ".join([part for part in reasoning_parts if part is not None])
    label = f"Approval Status: Approved. Reasoning: {reasoning_text}."

    # Generate input in the same way
    input_text = reasoning_text  # Reuse the reasoning parts for consistency

    return input_text, label


In [16]:
# Apply the function to each row
df["text"], df["label"] = zip(*df.apply(generate_plain_input_and_label, axis=1))

# Keep only the required columns
final_df = df[["text", "label"]]

In [17]:
# Display the first value in the 'Reason' column
print("Label\n")
print(final_df['label'].iloc[0])
print("Input\n")
print(final_df['text'].iloc[0])
final_df.head()

Label

Approval Status: Approved. Reasoning: Gender: Male. Age: 32 years old years old. Car Ownership: Yes. Property Ownership: Yes. Number of Children: 0. Annual Income: 427500.0. Income Category: Working. Education Level: Higher education. Marital Status: Civil marriage. Housing Type: Rented apartment. Employment Duration: 12.44 years. Mobile Phone: Yes. Work Phone: Yes. Email: No. Family Size: 2.0.
Input

Gender: Male. Age: 32 years old years old. Car Ownership: Yes. Property Ownership: Yes. Number of Children: 0. Annual Income: 427500.0. Income Category: Working. Education Level: Higher education. Marital Status: Civil marriage. Housing Type: Rented apartment. Employment Duration: 12.44 years. Mobile Phone: Yes. Work Phone: Yes. Email: No. Family Size: 2.0


Unnamed: 0,text,label
0,Gender: Male. Age: 32 years old years old. Car...,Approval Status: Approved. Reasoning: Gender: ...
1,Gender: Male. Age: 32 years old years old. Car...,Approval Status: Approved. Reasoning: Gender: ...
2,Gender: Male. Age: 58 years old years old. Car...,Approval Status: Approved. Reasoning: Gender: ...
3,Gender: Female. Age: 52 years old years old. C...,Approval Status: Approved. Reasoning: Gender: ...
4,Gender: Female. Age: 52 years old years old. C...,Approval Status: Approved. Reasoning: Gender: ...


In [18]:
# Define templates for varied inputs
input_templates = [
    "My gender is {gender}, I own a car: {car}, I own a house: {house}, I make {income} annually, I am {job_status}, I have {education}, I am {marital_status}, I live in {housing}, I am {age} years old, I have {children} children.",
    "I am a {gender} earning {income} per year. I am {marital_status}, living in a {housing}, and I am {age} years old. I have {children} children and {education}. My job status is {job_status}.",
    "{gender}, owns a car: {car}, owns a house: {house}, earning {income}, {job_status}, {education}, {marital_status}, living in {housing}, {age} years old, {children} children.",
    "I earn {income} yearly and am {marital_status}. I live in a {housing}, I am {age} years old, and I have {children} children.",
    "My details: {gender}, Car Ownership: {car}, Property Ownership: {house}, Annual Income: {income}, Education: {education}, Marital Status: {marital_status}, Housing Type: {housing}, Age: {age}, Children: {children}.",
    "I am a {gender} who owns a car ({car}) and a house ({house}). I earn {income} per year and am {marital_status}. My education level is {education}, and I live in a {housing}. I am {age} years old and have {children} children. My job is {job_status}.",
    "I am {age} years old and make {income} annually. I am {marital_status}, living in a {housing}. I own a car: {car}. I own a house: {house}. I have {children} children and {education}.",
    "I earn {income} yearly and have {children} children. I am a {gender} who owns a car ({car}) and a house ({house}). I am {marital_status} and live in a {housing}. My education level is {education}, and I am {age} years old.",
    "I have {education} and make {income} yearly. I am {age} years old and {job_status}. I own a car ({car}) and a house ({house}). I am {marital_status} and live in a {housing}. I have {children} children.",
    "{gender} with an annual income of {income}, {job_status}, living in a {housing}. {marital_status}, with {education}. I own a car ({car}) and a house ({house}). Age: {age}, Children: {children}.",
    "I make {income} per year and live in a {housing}. I am {marital_status} with {education}, {age} years old, and have {children} children. Car ownership: {car}, Property ownership: {house}.",
    "I am {age} years old, {marital_status}, and earn {income} per year. I live in a {housing} and have {children} children. Car: {car}, House: {house}, Education: {education}.",
    "I am a {gender} who earns {income} annually. I have {children} children and am {age} years old. I live in a {housing} and own a car ({car}) and a house ({house}).",
    "{gender}, {age} years old, earns {income}, lives in a {housing}, is {marital_status}, owns a car: {car}, owns a house: {house}, has {education}, and {children} children.",
    "{gender}, owns a car: {car}, owns a house: {house}, earns {income}, is {age} years old, {education}, {marital_status}, {housing}, {children} children."
]

# Generate varied natural-language inputs
def generate_varied_inputs(row):
    # Choose a random template
    template = random.choice(input_templates)
    
    # Fill placeholders with actual values or defaults
    filled_input = template.format(
        gender="Male" if row['Gender'] == 'M' else "Female",
        car="yes" if row['Car Ownership'] == 'Y' else "no",
        house="yes" if row['Property Ownership'] == 'Y' else "no",
        income=f"{row['Annual Income']:.2f}" if pd.notna(row['Annual Income']) else "unknown income",
        job_status="working" if row['Income Category'] == "Working" else "not working" if pd.notna(row['Income Category']) else "unknown job status",
        education=row['Education Level'] if pd.notna(row['Education Level']) else "unknown education level",
        marital_status=row['Marital Status'] if pd.notna(row['Marital Status']) else "unknown marital status",
        housing=row['Housing Type'] if pd.notna(row['Housing Type']) else "unknown housing type",
        age=f"{(row['Age (Years)'])}" if pd.notna(row['Age (Years)']) else "unknown age",
        children=f"{int(row['Number of Children'])}" if pd.notna(row['Number of Children']) else "unknown number of"
    )
    return filled_input

In [19]:
random_df = df.copy()
# Apply the varied input generation to each row
random_df["text"] = random_df.apply(generate_varied_inputs, axis=1)

# Keep the label (reason) structured and tagged
random_df["label"] = random_df["Reason"].apply(lambda reason: f"<label> {reason} </label>")

# Resulting dataframe with varied inputs and structured labels
random_df = random_df[["text", "label"]]
random_df.head()

Unnamed: 0,text,label
0,I make 427500.00 per year and live in a Rented...,<label> This application was approved due to E...
1,"Male, 32 years old years old, earns 427500.00,...",<label> This application was approved due to E...
2,"My details: Male, Car Ownership: yes, Property...","<label> This application was approved due to,M..."
3,I earn 270000.00 yearly and am Single / not ma...,<label> This application was approved due to E...
4,"I am 52 years old years old, Single / not marr...","<label> This application was denied due to,Ema..."


In [20]:
base_model = "meta-llama/Llama-3.1-8B"
new_model = "/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.1-8b-CC/"

instruct_base_model = "meta-llama/Llama-3.1-8B-Instruct"
instruct_new_model = "/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.1-8b-Instruct-CC/"
epochs=30

In [21]:
torch_dtype = torch.float16
attn_implementation = "eager"

In [22]:
# Convert DataFrames to HuggingFace datasets
final_dataset = Dataset.from_pandas(final_df)
random_dataset = Dataset.from_pandas(random_df)

# Split datasets into train and eval
final_split = final_dataset.train_test_split(test_size=0.2)
random_split = random_dataset.train_test_split(test_size=0.2)

final_train_dataset = final_split['train']
final_eval_dataset = final_split['test']

random_train_dataset = random_split['train']
random_eval_dataset = random_split['test']

In [23]:
# Custom Callback to save model every 5 epochs
class SaveEveryNEpochsCallback(TrainerCallback):
    def __init__(self, save_every_n_epochs, output_dir):
        self.save_every_n_epochs = save_every_n_epochs
        self.output_dir = output_dir

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if state.epoch % self.save_every_n_epochs == 0:
            model_save_path = os.path.join(self.output_dir, f"checkpoint-{int(state.epoch)}-epochs")
            kwargs["model"].save_pretrained(model_save_path)
            kwargs["tokenizer"].save_pretrained(model_save_path)
            print(f"Model saved at {model_save_path}")

In [24]:
run = wandb.init(
    project='Fine-tune Llama 3 on CC Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mdnicho26[0m ([33mdnicho26-university-of-north-carolina-at-charlotte[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.18.5


[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/notebooks/Chatbot-Credit-Card/backend/wandb/run-20241125_231213-j5lcy65p[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.


[34m[1mwandb[0m: Syncing run [33mtoasty-glitter-91[0m


[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset[0m


[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/dnicho26-university-of-north-carolina-at-charlotte/Fine-tune%20Llama%203%20on%20CC%20Dataset/runs/j5lcy65p[0m


In [25]:
def get_training_arguments(output_dir, num_epochs):
    return TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=16,  # Larger effective batch size
        optim="adamw_bnb_8bit",  # Optimized optimizer
        num_train_epochs=num_epochs,
        eval_strategy="steps",
        eval_steps=100,  # Less frequent evaluation
        logging_steps=100,  # Align with eval_steps
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=2e-4,
        fp16=True,  # Mixed precision
        dataloader_num_workers=8,  # Speed up data loading
        group_by_length=True,
        load_best_model_at_end=True,  # Ensure this is set
        report_to="wandb"
    )

In [26]:
def log_labels_and_inputs(batch, predictions, step):
    """
    Log inputs, labels, and predictions to wandb.
    """
    inputs = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
    labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    for input_text, label_text, pred_text in zip(inputs, labels, preds):
        wandb.log({
            "input": input_text,
            "label": label_text,
            "prediction": pred_text,
            "step": step,
        })


In [27]:
# Training and evaluation function
def train_and_save_model(model, tokenizer, train_dataset, eval_dataset, peft_config, new_model_path, output_suffix, num_epochs=epochs, save_every_n_epochs=5):
    print(f"Continuing training for {num_epochs} epochs...")
    full_training_args = get_training_arguments(new_model_path + output_suffix, num_epochs)

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=full_training_args,
        packing=False,
    )

    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=5))
    trainer.add_callback(SaveEveryNEpochsCallback(save_every_n_epochs=save_every_n_epochs, output_dir=new_model_path + output_suffix))

    # Custom evaluation logging
    def evaluation_logging(trainer, eval_dataset):
        predictions = trainer.predict(eval_dataset)
        step = trainer.state.global_step
        for batch in eval_dataset:  # Assuming eval_dataset is iterable
            log_labels_and_inputs(batch, predictions, step)

    # Train the model
    trainer.train()

    # Log evaluation after training
    evaluation_logging(trainer, eval_dataset)

    # Save the final trained model
    model.save_pretrained(new_model_path + output_suffix + "-final")
    tokenizer.save_pretrained(new_model_path + output_suffix + "-final")
    print(f"Final model saved at {new_model_path + output_suffix + '-final'}")

In [28]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [29]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Modify load_model_and_tokenizer function
def load_model_and_tokenizer(base_model_path, attn_implementation="default"):
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=bnb_config,
        device_map="auto",
        attn_implementation=attn_implementation
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
    # model, tokenizer = setup_chat_format(model, tokenizer)
    # model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer
    
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

# modules = find_all_linear_names(model)
# print(modules)
# Define LoRA config once since both models use it
def create_lora_config(model):
    modules = find_all_linear_names(model)
    return LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=modules
    )


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [30]:
# Load Base Model and Tokenizer
print("Loading Base Model...")
base_model, base_tokenizer = load_model_and_tokenizer(
    base_model_path=base_model, 
    attn_implementation=attn_implementation
)

# Assert tokenizer and model embedding size match
assert base_model.get_input_embeddings().weight.size(0) == len(base_tokenizer), (
    f"Model embedding size ({base_model.get_input_embeddings().weight.size(0)}) "
    f"does not match tokenizer vocabulary size ({len(base_tokenizer)})."
)
# Define LoRA Config
print("Creating LoRA Configuration...")
peft_config = create_lora_config(base_model) 

# Apply PEFT to the model
base_model = get_peft_model(base_model, peft_config)
base_tokenizer.pad_token = base_tokenizer.eos_token

# Assert tokenizer and model embedding size match
assert base_model.get_input_embeddings().weight.size(0) == len(base_tokenizer), (
    f"Model embedding size after peft ({base_model.get_input_embeddings().weight.size(0)}) "
    f"does not match tokenizer vocabulary size ({len(base_tokenizer)})."
)

# Train Base Model on Final Dataset
print("Training Base Model on Final Dataset...")
train_and_save_model(
    model=base_model,
    new_model_path=new_model,
    tokenizer=base_tokenizer,
    train_dataset=final_train_dataset,
    eval_dataset=final_eval_dataset,
    peft_config=peft_config,
    output_suffix="base",
    num_epochs=epochs
)

Loading Base Model...


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Creating LoRA Configuration...


Training Base Model on Final Dataset...
Continuing training for 30 epochs...


Map:   0%|          | 0/6759 [00:00<?, ? examples/s]

Map:   0%|          | 0/1690 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 194.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 99.12 MiB is free. Process 79308 has 5.99 GiB memory in use. Process 82924 has 9.80 GiB memory in use. Of the allocated memory 9.34 GiB is allocated by PyTorch, and 174.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def test_model(model, tokenizer, input_text):
    # Construct the prompt using the same format as in training
    prompt = f"Input:\n{input_text}\n\nLabel:\n"

    # Tokenize and generate response
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to("cuda")
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        max_new_tokens=150,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.8,
        top_p=0.9,
        repetition_penalty=1.2,
    )

    # Decode response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the generated Label section
    response_content = response.split("Label:\n")[-1].strip()
    return response_content

In [None]:
# Define test prompts
# Create an input text
input_text = (
    "Gender: Male. "
    "Age: 32 years old. "
    "Car Ownership: Yes. "
    "Property Ownership: Yes. "
    "Number of Children: 0. "
    "Annual Income: 427500.0. "
    "Income Category: Working. "
    "Education Level: Higher education. "
    "Marital Status: Civil marriage. "
    "Housing Type: Rented apartment. "
    "Employment Duration: 12.44 years. "
    "Mobile Phone: Yes. "
    "Work Phone: Yes. "
    "Email: No. "
    "Family Size: 2.0."
)

In [None]:
# Test the models
print("Testing base model...")
base_response = test_model(base_model, base_tokenizer, messages)
print("Base Model Response:")
print(base_response)

In [None]:
# # Load Instruct Model
# print("Loading Instruct Model...")
# instruct_model, instruct_tokenizer = load_model_and_tokenizer(
#     base_model_path="meta-llama/Llama-3.1-8B-Instruct",
#     attn_implementation=attn_implementation

# )
# instruct_tokenizer.pad_token = instruct_tokenizer.eos_token

# # Assert tokenizer and model embedding size match
# assert instruct_model.get_input_embeddings().weight.size(0) == len(instruct_tokenizer), (
#     f"Model embedding size after peft ({instruct_model.get_input_embeddings().weight.size(0)}) "
#     f"does not match tokenizer vocabulary size ({len(instruct_tokenizer)})."
# )


# peft_config = create_lora_config(instruct_model)

# # Apply PEFT to the model
# instruct_model = get_peft_model(instruct_model, peft_config)

# # Train Instruct Model on random Dataset
# print("Training Instruct Model on Final Dataset...")
# train_and_save_model(
#     model=instruct_model,
#     new_model_path=instruct_new_model,
#     tokenizer=instruct_tokenizer,
#     train_dataset=random_train_dataset,
#     eval_dataset=random_eval_dataset,
#     peft_config=peft_config,
#     output_suffix="instruct",
#     num_epochs=30
# )

In [None]:
# print("Testing instruct model...")
# instruct_response = test_model(base_model, base_tokenizer, messages)
# print("Instruct Model Response:")
# print(instruct_response)

In [None]:
# Close WandB session
wandb.finish()