In [1]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.3.17-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.14 (from unsloth)
  Downloading unsloth_zoo-2025.3.15-py3-none-any.whl.metadata (17 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.17-py3-none-any.whl.metadata (9.5 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Do

In [1]:
!pip install datasets transformers trl torch



In [12]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.3.17: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora=True,
    loftq_config=None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.17 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [50]:
import torch
import json
import os
import numpy as np
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Clear GPU memory at the start
torch.cuda.empty_cache()

# Load the dataset with improved error handling
def load_dataset(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
            print(f"Successfully loaded {len(data)} records from {file_path}")
            return data
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
        return []

# Path configurations
TRAIN_DATA_PATH = "/content/conversation_dataset.json"
TEST_DATA_PATH = "/content/test_dataset.json"
OUTPUT_DIR = "/content/outputs"
FINAL_MODEL_DIR = "/content/fine_tuned_fraud_model"

# Create output directories if they don't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

# Load datasets
train_dataset = load_dataset(TRAIN_DATA_PATH)
test_dataset = load_dataset(TEST_DATA_PATH)

if not train_dataset or not test_dataset:
    raise ValueError("One or both datasets failed to load.")

# Display sample data
print("Sample training data:")
print(json.dumps(train_dataset[:2], indent=2))

# Create formatted ShareGPT-style conversations directly
def prepare_sharegpt_conversations(dataset):
    conversations = []

    i = 0
    while i < len(dataset):
        # Look for a user message
        if dataset[i]['role'] == 'user':
            # Check if the next message is from an assistant
            if i + 1 < len(dataset) and dataset[i + 1]['role'] == 'assistant':
                # Create a conversation with this pair
                conversation = {
                    "conversations": [
                        {"role": dataset[i]['role'], "content": dataset[i]['content']},
                        {"role": dataset[i + 1]['role'], "content": dataset[i + 1]['content']}
                    ]
                }
                conversations.append(conversation)
                i += 2  # Skip both messages
            else:
                i += 1  # Skip this user message if no assistant response
        else:
            i += 1  # Skip non-user messages

    return Dataset.from_list(conversations)

# Convert datasets to ShareGPT format directly
train_data = prepare_sharegpt_conversations(train_dataset)
print(f"Created {len(train_data)} training conversations")

# Display sample to verify structure
if len(train_data) > 0:
    print("\nSample training conversation:")
    print(json.dumps(train_data[0], indent=2))

# Model configuration
MODEL_NAME = "unsloth/DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit"
MAX_SEQ_LENGTH = 2048
LORA_RANK = 32
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Load the model
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=None,
        load_in_4bit=True,
    )
    print(f"Successfully loaded {MODEL_NAME}")

    # Set pad token to be different from eos_token
    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
        # Add a new [PAD] token
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # Resize the token embeddings since we added a new token
        model.resize_token_embeddings(len(tokenizer))
        print("Added [PAD] token to tokenizer and resized model embeddings")

except Exception as e:
    print(f"Error loading model: {str(e)}")
    raise

# Configure the model for training
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora=True,
    loftq_config=None,
)

# Convert the ShareGPT conversations to the format expected by the model
def formatting_func(example):
    if "conversations" not in example:
        return {"text": "", "attention_mask": []}

    # Apply the chat template to the conversation
    messages = example["conversations"]
    try:
        formatted_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # Tokenize and get attention mask
        encoded = tokenizer(formatted_text, return_attention_mask=True, truncation=True, max_length=MAX_SEQ_LENGTH)
        return {
            "text": formatted_text,
            "attention_mask": encoded["attention_mask"]
        }
    except Exception as e:
        print(f"Error formatting conversation: {str(e)}")
        return {"text": "", "attention_mask": []}

# Apply formatting to the dataset
train_data = train_data.map(formatting_func)

# Remove empty examples
train_data = train_data.filter(lambda example: example["text"] != "")
print(f"Training on {len(train_data)} formatted conversations after filtering")

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    warmup_steps=10,
    max_steps=200,
    learning_rate=1e-4,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    save_strategy="steps",
    save_steps=50,
    report_to="tensorboard",
)

# Initialize trainer with Dataset object
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
)

# Train the model with improved error handling
try:
    print("Starting model training...")
    trainer.train()
    print("Model training completed successfully")
except Exception as e:
    print(f"Error during training: {str(e)}")
    raise

# Extract test messages and labels for evaluation
def extract_test_data(test_dataset):
    test_inputs = []
    test_labels = []

    i = 0
    while i < len(test_dataset):
        # Look for user message
        if test_dataset[i]['role'] == 'user':
            user_content = test_dataset[i]['content']

            # Check if next message is from assistant
            if i + 1 < len(test_dataset) and test_dataset[i + 1]['role'] == 'assistant':
                assistant_content = test_dataset[i + 1]['content'].lower()

                # Determine label
                is_fraud = 1 if "fraud" in assistant_content and "not fraud" not in assistant_content else 0

                # Add to test data
                test_inputs.append(user_content)
                test_labels.append(is_fraud)

                i += 2  # Skip both messages
            else:
                i += 1  # Skip this user message
        else:
            i += 1  # Skip non-user message

    print(f"Extracted {len(test_inputs)} test examples")
    return test_inputs, test_labels

# Improved inference function with explicit attention mask
def improved_inference(model, tokenizer, input_text, max_new_tokens=50):
    try:
        # Format as a user message
        message = [{"role": "user", "content": input_text}]

        # Apply chat template
        formatted_text = tokenizer.apply_chat_template(message, return_tensors=None, add_generation_prompt=True)

        # Tokenize with attention mask
        encoded = tokenizer(formatted_text, return_tensors="pt", padding=True, truncation=True,
                           max_length=MAX_SEQ_LENGTH, return_attention_mask=True)

        # Move to appropriate device
        input_ids = encoded["input_ids"].to("cuda" if torch.cuda.is_available() else "cpu")
        attention_mask = encoded["attention_mask"].to("cuda" if torch.cuda.is_available() else "cpu")

        # Generate with explicit attention mask
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                attention_mask=attention_mask,  # Explicitly pass attention mask
                max_new_tokens=max_new_tokens,
                do_sample=False,  # Greedy decoding
                num_beams=1,      # No beam search
                pad_token_id=tokenizer.pad_token_id,
            )

        # Decode
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return output_text
    except Exception as e:
        print(f"Inference error: {str(e)}")
        return ""

# Evaluate model with improved inference
def evaluate_model(test_inputs, test_labels):
    if not test_inputs or not test_labels:
        print("No test data available for evaluation")
        return 0, 0, 0, 0

    predictions = []
    model.eval()

    # Process each test input
    for i, input_text in enumerate(test_inputs):
        if i % 10 == 0:
            print(f"Processing test example {i}/{len(test_inputs)}")

        # Get model prediction with improved inference
        output_text = improved_inference(model, tokenizer, input_text)

        # Determine label from output
        if output_text:
            pred_label = 1 if "fraud" in output_text.lower() and "not fraud" not in output_text.lower() else 0
        else:
            # Default to "not fraud" if generation failed
            pred_label = 0

        predictions.append(pred_label)

        # Clear GPU memory periodically
        if i % 50 == 0:
            torch.cuda.empty_cache()

    # Ensure we have predictions
    if not predictions:
        print("No predictions were generated")
        return 0, 0, 0, 0

    # Calculate metrics
    accuracy = accuracy_score(test_labels, predictions)

    # Handle potential division by zero in metrics
    try:
        precision, recall, f1, _ = precision_recall_fscore_support(
            test_labels, predictions, average='weighted', zero_division=0
        )
    except Exception as e:
        print(f"Error calculating metrics: {str(e)}")
        precision, recall, f1 = 0, 0, 0

    # Print metrics
    print(f"Model Evaluation Results")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Print classification report with try/except
    try:
        unique_labels = np.unique(test_labels + predictions)
        if len(unique_labels) <= 1:
            print("Only one class present in the combined true and predicted labels")
        else:
            # Create target names for the report
            target_names = ['Not Fraud', 'Fraud']
            # Ensure target_names matches the number of classes
            target_names = target_names[:len(unique_labels)]

            print("\nClassification Report:")
            print(classification_report(
                test_labels, predictions,
                target_names=target_names,
                zero_division=0
            ))
    except Exception as e:
        print(f"Could not generate classification report: {str(e)}")

    # Print confusion matrix with try/except
    try:
        print("\nConfusion Matrix:")
        print(confusion_matrix(test_labels, predictions))
    except Exception as e:
        print(f"Could not generate confusion matrix: {str(e)}")

    return accuracy, precision, recall, f1

# Extract test data
test_inputs, test_labels = extract_test_data(test_dataset)

# Evaluate model
print("\nEvaluating model performance...")
metrics = evaluate_model(test_inputs, test_labels)

# Save fine-tuned model
model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

# Save evaluation results
with open(os.path.join(FINAL_MODEL_DIR, "evaluation_results.json"), "w") as f:
    json.dump(
        {
            "accuracy": float(metrics[0]),
            "precision": float(metrics[1]),
            "recall": float(metrics[2]),
            "f1_score": float(metrics[3]),
            "training_config": {
                "model_name": MODEL_NAME,
                "lora_rank": LORA_RANK,
                "lora_alpha": LORA_ALPHA,
                "lora_dropout": LORA_DROPOUT,
                "max_seq_length": MAX_SEQ_LENGTH,
            }
        },
        f,
        indent=2
    )

print(f"Model fine-tuning complete and saved to '{FINAL_MODEL_DIR}'")
print(f"Evaluation results saved to '{FINAL_MODEL_DIR}/evaluation_results.json'")

Successfully loaded 1968 records from /content/conversation_dataset.json
Successfully loaded 3936 records from /content/test_dataset.json
Sample training data:
[
  {
    "content": "Detect if the transaction is fraud or not based on the following features:\nV1: 0.9799876052447956,V2: 0.7736108816346513,V3: 0.8272829963669474,V4: 0.2909341549115423,V5: 0.7660261539744361,V10: 0.4944353671795013,V11: 0.28893827145700024,V12: 0.683667528565137,V14: 0.6074635728786055,V16: 0.46803214239167035,V18: 0.6780358407193249,V20: 0.579005681819562,V21: 0.5588535832041219,V22: 0.48911808543806545",
    "role": "user"
  },
  {
    "content": "Result: Not Fraud.\nRisk Level: Low risk level based on its deviation from normal patterns.The transaction does not show significant anomalies and aligns with normal patterns.",
    "role": "assistant"
  }
]
Created 984 training conversations

Sample training conversation:
{
  "conversations": [
    {
      "content": "Detect if the transaction is fraud or not b

Map:   0%|          | 0/984 [00:00<?, ? examples/s]

Filter:   0%|          | 0/984 [00:00<?, ? examples/s]

Training on 984 formatted conversations after filtering
Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/984 [00:00<?, ? examples/s]

Starting model training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 984 | Num Epochs = 7 | Total steps = 200
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 83,886,080/8,000,000,000 (1.05% trained)


Step,Training Loss
1,4.2093
2,4.1838
3,4.0028
4,3.7439
5,3.4144
6,3.135
7,2.917
8,2.7247
9,2.652
10,2.5659


Model training completed successfully
Extracted 1968 test examples

Evaluating model performance...
Processing test example 0/1968
Inference error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Inference error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Inference error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Inference error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Inference error: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).