In [1]:
import os
import torch
import wandb
import numpy as np
import pandas as pd
import warnings
import random
import re

from accelerate import infer_auto_device_map
from peft import prepare_model_for_kbit_training

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    TrainerCallback,
    EarlyStoppingCallback,
    TrainerState,
    TrainerControl,
)
from datasets import load_dataset, Dataset
from trl import SFTTrainer, setup_chat_format
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import bitsandbytes as bnb
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)

In [2]:
# huggingface-cli login --token key   
# wandb login --relogin key

In [3]:
warnings.filterwarnings("ignore")
#https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction
# Load the application_record dataset
data = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/application_record.csv")
# Load the credit_record dataset
record = pd.read_csv("/opt/notebooks/Chatbot-Credit-Card/backend/dataset/credit-card-approval/credit_record.csv")
# Find the first account open month for each user
begin_month = record.loc[record.groupby("ID")["MONTHS_BALANCE"].idxmin()]
begin_month = begin_month.rename(columns={"MONTHS_BALANCE": "begin_month"})

# Merge the datasets
df = pd.merge(data, begin_month, how="left", on="ID")
print("Datasets loaded and merged successfully.")
# Define approval logic based on multiple criteria
def determine_approval(row):
    # Define custom approval logic
    if row["STATUS"] in ["0", "1", "C", "X"]:  # Good credit status
            return 1  # Approved
    return 0  # Default to denial if STATUS is bad or missing

# Apply logic to determine approval (filling missing STATUS values first)
record["STATUS"] = record["STATUS"].fillna("X")  # Handle missing values
record["Approved"] = record.apply(determine_approval, axis=1)
# Aggregate approval status for each ID (disapproval if any ID has disqualifying criteria)
approval_status = record.groupby("ID")["Approved"].min().reset_index()
# Merge approval status back into the main dataset, avoiding "_x" and "_y" columns
df = pd.merge(data, approval_status, how="left", on="ID")
df["Approved"] = df["Approved"].fillna(0).astype(int)  # Fill missing approvals as denial
print("Approval status merged successfully.")
# Preprocess the 'DAYS_BIRTH' column to convert days to years
df['DAYS_BIRTH'] = (-df['DAYS_BIRTH'] // 365).fillna(0).astype(int)
df.drop(columns=['ID'], inplace=True)
# Preprocess the 'DAYS_EMPLOYED' column to get absolute values and handle unemployment
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].apply(lambda x: abs(x) if x < 0 else 0)
# Handle missing or infinite values in numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())  # Fill NaN with median values
print("Preprocessing completed successfully.")

# Define the feature mapping dictionary
feature_mapping = {
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car Ownership',
    'FLAG_OWN_REALTY': 'Property Ownership',
    'CNT_CHILDREN': 'Number of Children',
    'AMT_INCOME_TOTAL': 'Annual Income',
    'NAME_INCOME_TYPE': 'Income Category',
    'NAME_EDUCATION_TYPE': 'Education Level',
    'NAME_FAMILY_STATUS': 'Marital Status',
    'NAME_HOUSING_TYPE': 'Housing Type',
    'DAYS_BIRTH': 'Age (Days)',
    'DAYS_EMPLOYED': 'Employment Duration (Days)',
    'FLAG_MOBIL': 'Mobile Phone',
    'FLAG_WORK_PHONE': 'Work Phone',
    'FLAG_PHONE': 'Phone',
    'FLAG_EMAIL': 'Email',
    'OCCUPATION_TYPE': 'Occupation',
    'CNT_FAM_MEMBERS': 'Family Size',
    'STATUS': 'Credit Status'
}

# Rename the columns in the DataFrame using the mapping
df.rename(columns=feature_mapping, inplace=True)

# Display the first few rows to confirm the changes
df.head()
df.rename(columns={'TARGET': 'Approved'}, inplace=True)
# Display the first few rows to confirm the change
df.head()

Datasets loaded and merged successfully.
Approval status merged successfully.
Preprocessing completed successfully.


Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32,4542.0,1.0,1.0,0.0,0.0,,2.0,1
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1


In [4]:
explanation_df = pd.read_csv('/opt/notebooks/Chatbot-Credit-Card/backend/dataset/explanations_df.csv')
# explanation_df.drop(['Approved','Explanation'], axis=1, inplace=True)
explanation_df.head()

Unnamed: 0,Prediction,Actual,Explanation
0,Approved,Approved,This application was approved due to Employmen...
1,Approved,Approved,This application was approved due to Employmen...
2,Approved,Approved,"This application was approved due to -0.77, Ma..."
3,Approved,Approved,"This application was approved due to Email, -0..."
4,Denied,Approved,"This application was denied due to -1.62, Emai..."


In [5]:
print(explanation_df['Explanation'].iloc[0])

This application was approved due to Employment Duration (Days), Housing Type, -0.77, Marital Status, Work Phone, -0.15, Number of Children, -0.81, Family Size, Annual Income.


In [6]:
# Function to remove numbers and clean the Explanation column
def clean_explanation(text):
    return re.sub(r'[-+]?\d*\.\d+|\d+', '', text).replace(', ,', ',').replace(' ,', ',').strip(", ")

# Apply the cleaning function to the Explanation column
explanation_df["Explanation"] = explanation_df["Explanation"].apply(clean_explanation)

In [7]:
# Display the DataFrame with the cleaned Explanation column
print(explanation_df['Explanation'].iloc[0])
explanation_df.head()

This application was approved due to Employment Duration (Days), Housing Type, Marital Status, Work Phone, Number of Children, Family Size, Annual Income.


Unnamed: 0,Prediction,Actual,Explanation
0,Approved,Approved,This application was approved due to Employmen...
1,Approved,Approved,This application was approved due to Employmen...
2,Approved,Approved,"This application was approved due to, Marital ..."
3,Approved,Approved,"This application was approved due to Email, Nu..."
4,Denied,Approved,"This application was denied due to, Email, Hou..."


In [8]:
# df.rename(columns={'Age (Days)': 'Age (Years)'}, inplace=True)
df.rename(columns={'Employment Duration' : 'Employment Duration (Days)'}, inplace=True)
df['Age (Days)'] = df['Age (Days)'].astype(str) + " years old"

In [9]:
# Combine explanation_df and df based on index
df['Reason'] = explanation_df['Explanation'].reset_index(drop=True)

# Display the first value in the 'Reason' column
print(df['Reason'].iloc[0])
df.head()

This application was approved due to Employment Duration (Days), Housing Type, Marital Status, Work Phone, Number of Children, Family Size, Annual Income.


Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1,"This application was approved due to, Marital ..."
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email, Nu..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was denied due to, Email, Hou..."


In [10]:
# Function to dynamically replace labels with row values
def insert_numbers_dynamically(row):
    reason = row['Reason']
    
    # Extract all parts of the reason where dynamic replacement is needed
    labels = re.findall(r'([A-Za-z\s()]+)', reason)
    
    for label in labels:
        # Match the label to the corresponding column name
        column_name = None
        for col in df.columns:
            # Normalize column names and labels for matching
            normalized_col = re.sub(r'[\s()]+', '', col).lower()
            normalized_label = re.sub(r'[\s()]+', '', label).lower()
            
            if normalized_col == normalized_label:
                column_name = col
                break
        
        if column_name and column_name in row:  # Ensure column exists in the DataFrame
            # Replace the label with the corresponding value from the row
            value = row[column_name]
            # Ensure proper replacement in the text
            reason = reason.replace(label, f"{label.strip()} {value}")
    
    return reason

In [11]:
# Apply the function to each row in the DataFrame
df['Reason'] = df.apply(insert_numbers_dynamically, axis=1)

# Display the updated DataFrame
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Days),Employment Duration (Days),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,4542.0,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,1134.0,1.0,0.0,0.0,0.0,Security staff,2.0,1,"This application was approved due to,Marital S..."
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,3051.0,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was denied due to,Email 1.0,H..."


In [12]:
# Convert "Employment Duration (Days)" to years in-place
df["Employment Duration (Days)"] = df["Employment Duration (Days)"].apply(lambda x: round(x / 365.0, 2) if not pd.isna(x) else np.nan)
df.rename(columns={"Age (Days)": "Age (Years)", "Employment Duration (Days)": "Employment Duration (Years)"}, inplace=True)
df.head()

Unnamed: 0,Gender,Car Ownership,Property Ownership,Number of Children,Annual Income,Income Category,Education Level,Marital Status,Housing Type,Age (Years),Employment Duration (Years),Mobile Phone,Work Phone,Phone,Email,Occupation,Family Size,Approved,Reason
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,12.44,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,32 years old,12.44,1.0,1.0,0.0,0.0,,2.0,1,This application was approved due to Employmen...
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,58 years old,3.11,1.0,0.0,0.0,0.0,Security staff,2.0,1,"This application was approved due to,Marital S..."
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,8.36,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was approved due to Email,Num..."
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,52 years old,8.36,1.0,0.0,1.0,1.0,Sales staff,1.0,1,"This application was denied due to,Email 1.0,H..."


In [13]:
# Step 2: Update the "Reason" column to match the new labels
def update_reason(reason_text):
    reason_text = reason_text.replace("Age (Days)", "Age (Years)")
    reason_text = reason_text.replace("Employment Duration (Days)", "Employment Duration (Years)")
    return reason_text

df["Reason"] = df["Reason"].apply(update_reason)

In [14]:
# Display the first value in the 'Reason' column
print(df['Reason'].iloc[0])

This application was approved due to Employment Duration (Years),Housing Type Rented apartment,Marital Status Civil marriage,Work Phone 1.0,Number of Children 0,Family Size 2.0,Annual Income 427500.0.


In [15]:
def generate_plain_input_and_label(row):
    # Generate the reasoning part of the label
    reasoning_parts = [
        f"Gender: {'Male' if row['Gender'] == 'M' else 'Female'}" if pd.notna(row.get('Gender')) else None,
        f"Age: {row['Age (Years)']} years old" if pd.notna(row.get('Age (Years)')) else None,
        f"Car Ownership: {'Yes' if row['Car Ownership'] == 'Y' else 'No'}" if pd.notna(row.get('Car Ownership')) else None,
        f"Property Ownership: {'Yes' if row['Property Ownership'] == 'Y' else 'No'}" if pd.notna(row.get('Property Ownership')) else None,
        f"Number of Children: {int(row['Number of Children'])}" if pd.notna(row.get('Number of Children')) else None,
        f"Annual Income: {row['Annual Income']}" if pd.notna(row.get('Annual Income')) else None,
        f"Income Category: {row['Income Category']}" if pd.notna(row.get('Income Category')) else None,
        f"Education Level: {row['Education Level']}" if pd.notna(row.get('Education Level')) else None,
        f"Marital Status: {row['Marital Status']}" if pd.notna(row.get('Marital Status')) else None,
        f"Housing Type: {row['Housing Type']}" if pd.notna(row.get('Housing Type')) else None,
        f"Employment Duration: {row['Employment Duration (Years)']} years" if pd.notna(row.get('Employment Duration (Years)')) else None,
        f"Mobile Phone: {'Yes' if row['Mobile Phone'] == 1 else 'No'}" if pd.notna(row.get('Mobile Phone')) else None,
        f"Work Phone: {'Yes' if row['Work Phone'] == 1 else 'No'}" if pd.notna(row.get('Work Phone')) else None,
        f"Email: {'Yes' if row['Email'] == 1 else 'No'}" if pd.notna(row.get('Email')) else None,
        f"Family Size: {row['Family Size']}" if pd.notna(row.get('Family Size')) else None,
    ]
    reasoning_text = ". ".join([part for part in reasoning_parts if part is not None])
    label = f"Approval Status: Approved. Reasoning: {reasoning_text}."

    # Generate input in the same way
    input_text = reasoning_text  # Reuse the reasoning parts for consistency

    return input_text, label

In [16]:
# Apply the function to each row
df["text"], df["label"] = zip(*df.apply(generate_plain_input_and_label, axis=1))

# Keep only the required columns
final_df = df[["text", "label"]]

In [17]:
# Display the first value in the 'Reason' column
print("Label\n")
print(final_df['label'].iloc[0])
print("Input\n")
print(final_df['text'].iloc[0])

Label

Approval Status: Approved. Reasoning: Gender: Male. Age: 32 years old years old. Car Ownership: Yes. Property Ownership: Yes. Number of Children: 0. Annual Income: 427500.0. Income Category: Working. Education Level: Higher education. Marital Status: Civil marriage. Housing Type: Rented apartment. Employment Duration: 12.44 years. Mobile Phone: Yes. Work Phone: Yes. Email: No. Family Size: 2.0.
Input

Gender: Male. Age: 32 years old years old. Car Ownership: Yes. Property Ownership: Yes. Number of Children: 0. Annual Income: 427500.0. Income Category: Working. Education Level: Higher education. Marital Status: Civil marriage. Housing Type: Rented apartment. Employment Duration: 12.44 years. Mobile Phone: Yes. Work Phone: Yes. Email: No. Family Size: 2.0


In [18]:
# Clean the 'text' and 'label' columns
final_df['text'] = final_df['text'].str.replace('years old years old', 'years old', regex=False)

# Sample the dataset to 3000 rows for training efficiency
df = df.sample(frac=1, random_state=42).reset_index(drop=True).head(3000)

In [19]:
# Split ratios
train_size = 0.8
eval_size = 0.1

# Calculate split indices
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]

In [20]:
# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
Predict the credit card application status and provide reasoning.
text: {data_point["text"]}
label: {data_point["label"]}
""".strip()

def generate_test_prompt(data_point):
    return f"""
Predict the credit card application status and provide reasoning.
text: {data_point["text"]}
label: 
""".strip()

# Apply to training and evaluation datasets
X_train['text'] = X_train.apply(generate_prompt, axis=1)
X_eval['text'] = X_eval.apply(generate_prompt, axis=1)

# Apply to test dataset
X_test_prompts = X_test.apply(generate_test_prompt, axis=1)
y_true = X_test['label']
X_test = pd.DataFrame(X_test_prompts, columns=["text"])

In [21]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [22]:
# Display lengths of each entry in train_data
train_lengths = train_data.map(lambda x: {"length": len(x["text"])})
print("Train data lengths:")
print(train_lengths)

# Display lengths of each entry in eval_data
eval_lengths = eval_data.map(lambda x: {"length": len(x["text"])})
print("Eval data lengths:")
print(eval_lengths)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Train data lengths:
Dataset({
    features: ['text', 'length'],
    num_rows: 2400
})


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Eval data lengths:
Dataset({
    features: ['text', 'length'],
    num_rows: 300
})


In [23]:
train_data['text'][3]

'Predict the credit card application status and provide reasoning.\ntext: Gender: Female. Age: 32 years old years old. Car Ownership: Yes. Property Ownership: No. Number of Children: 0. Annual Income: 126000.0. Income Category: State servant. Education Level: Incomplete higher. Marital Status: Single / not married. Housing Type: House / apartment. Employment Duration: 14.26 years. Mobile Phone: Yes. Work Phone: Yes. Email: No. Family Size: 1.0\nlabel: Approval Status: Approved. Reasoning: Gender: Female. Age: 32 years old years old. Car Ownership: Yes. Property Ownership: No. Number of Children: 0. Annual Income: 126000.0. Income Category: State servant. Education Level: Incomplete higher. Marital Status: Single / not married. Housing Type: House / apartment. Employment Duration: 14.26 years. Mobile Phone: Yes. Work Phone: Yes. Email: No. Family Size: 1.0.'

In [24]:
base_model = "/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.1-8b-Instruct/"

tokenizer = AutoTokenizer.from_pretrained(base_model)

In [1]:
base_model_name = base_model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
    max_memory={0: "16GiB", "cpu": "30GiB"},

)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

NameError: name 'base_model' is not defined

In [26]:
from tqdm import tqdm
from transformers import pipeline

def predict(test, model, tokenizer):
    y_pred = []
    statuses = ["Approved", "Denied"]  # The statuses for prediction
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        
        # Initialize the text-generation pipeline
        pipe = pipeline(
            task="text-generation", 
            model=model, 
            tokenizer=tokenizer, 
            max_new_tokens=50,  # Increase to ensure reasoning is generated
            temperature=0.1
        )
        
        # Generate prediction
        result = pipe(prompt)
        generated_text = result[0]['generated_text']
        
        # Parse the output to extract status and reason
        try:
            # Extract 'label:' portion and split to get the prediction
            answer = generated_text.split("label:")[-1].strip()
            
            # Extract status (Approved/Denied)
            status = None
            for s in statuses:
                if s.lower() in answer.lower():
                    status = s
                    break
            
            # Append the status and reasoning
            if status:
                y_pred.append({
                    "status": status,
                    "reason": answer  # Include full reasoning text
                })
            else:
                y_pred.append({
                    "status": "Unknown",
                    "reason": answer
                })
        except Exception as e:
            # Handle unexpected output format
            y_pred.append({
                "status": "Error",
                "reason": f"Parsing failed: {e}"
            })
    
    return y_pred

# # Example Usage
# # Assuming `X_test` is your test DataFrame and `model`, `tokenizer` are already loaded
# y_pred = predict(X_test, model, tokenizer)

# # Print a few predictions to verify
# for prediction in y_pred[:5]:
#     print(prediction)


In [27]:
# from collections import Counter

# # Extract statuses from y_pred
# statuses = [pred.get("status", "Unknown") for pred in y_pred]

# # Count occurrences of each status
# status_counts = Counter(statuses)

# # Display the counts
# print(f"Total counts by status:")
# print(f"Approved: {status_counts['Approved']}")
# print(f"Denied: {status_counts['Denied']}")
# print(f"Unknown: {status_counts['Unknown']}")

In [28]:
def extract_label_from_text(data_point):
    try:
        text = data_point["text"]
        if "Approval Status:" in text:
            label_section = text.split("Approval Status:")[1].strip()
            return label_section.split(".")[0].strip()
        else:
            print(f"Missing 'Approval Status:' in text: {text}")
            return "Unknown"
    except Exception as e:
        print(f"Error parsing text: {text}, Error: {e}")
        return "Unknown"

# Add the 'status' column to train and evaluation datasets
train_data = train_data.map(lambda x: {"status": extract_label_from_text(x)})
eval_data = eval_data.map(lambda x: {"status": extract_label_from_text(x)})


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def evaluate(y_true, y_pred):
    # Extract statuses from predictions
    y_pred_statuses = []
    for pred in y_pred:
        if "status" in pred and pred["status"] in ["Approved", "Denied", "Unknown"]:
            y_pred_statuses.append(pred["status"])
        else:
            y_pred_statuses.append("Unknown")  # Default for malformed predictions

    # Define label mapping (including "Unknown")
    labels = ["Approved", "Denied", "Unknown"]
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found

    # Ensure lengths of y_true and y_pred match
    if len(y_true) != len(y_pred_statuses):
        raise ValueError("Length mismatch between y_true and y_pred.")

    # Map true and predicted labels
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred_statuses)

    # Check for unmapped values
    if -1 in y_true_mapped:
        raise ValueError("Unmapped labels found in y_true. Ensure all labels are 'Approved', 'Denied', or 'Unknown'.")
    if -1 in y_pred_mapped:
        raise ValueError("Unmapped labels found in y_pred. Ensure all predictions are 'Approved', 'Denied', or 'Unknown'.")

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate classification report
    class_report = classification_report(
        y_true=y_true_mapped, 
        y_pred=y_pred_mapped, 
        target_names=labels, 
        labels=list(range(len(labels)))
    )
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(
        y_true=y_true_mapped, 
        y_pred=y_pred_mapped, 
        labels=list(range(len(labels)))
    )
    print('\nConfusion Matrix:')
    print(conf_matrix)


# # Assertions to ensure valid inputs
# assert all(label in ["Approved", "Denied", "Unknown"] for label in y_true), "y_true contains unexpected labels!"
# assert all("status" in pred for pred in y_pred), "Some predictions are missing the 'status' key!"
# assert len(y_true) == len(y_pred), "Mismatch in lengths of y_true and y_pred!"

# # Example usage with y_true and y_pred
# y_true = X_test["status"].tolist()
# evaluate(y_true, y_pred)

In [30]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
modules

['up_proj', 'v_proj', 'down_proj', 'o_proj', 'gate_proj', 'q_proj', 'k_proj']

In [31]:
output_dir="/opt/notebooks/Chatbot-Credit-Card/backend/models/llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="wandb",                  # report metrics to w&b
    eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512,
    packing=False,
    dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
    }
)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [32]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdnicho26[0m ([33mdnicho26-university-of-north-carolina-at-charlotte[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113046211054705, max=1.0…

OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 385.12 MiB is free. Process 3491537 has 6.14 GiB memory in use. Process 3585902 has 9.36 GiB memory in use. Of the allocated memory 8.66 GiB is allocated by PyTorch, and 425.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
wandb.finish()
model.config.use_cache = True

In [None]:
# Save trained model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)