In [3]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 2: Load dataset
df = pd.read_csv("transactions_dataset_500_rich.csv")
print("Original Data:")
print(df.head())

# Step 3: Split features and labels
X = df.drop(columns=["txn_id", "label"])   # Features
y = df["label"]                            # Target

# Step 4: Encode categorical features
le = LabelEncoder()
for col in X.select_dtypes(include=["object"]).columns:
    X[col] = le.fit_transform(X[col])

# Step 5: Scale numerical values
scaler = StandardScaler()
X["amount"] = scaler.fit_transform(X[["amount"]])

# Step 6: Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

# Step 7: Train Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 8: Evaluate model
y_pred = model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Original Data:
   txn_id  amount currency sender_country receiver_country payment_method  \
0       1   10000      USD             IR               CN    credit_card   
1       2      50      USD             RU               XY     debit_card   
2       3     500      INR             US               CN     debit_card   
3       4   50000      EUR             XY               GB         crypto   
4       5   10000      EUR             IR               CN     debit_card   

  merchant_category     ip_address           description       label  
0            travel     10.0.0.219           school fees       Legal  
1       electronics  203.0.113.145       grocery payment       Legal  
2            pharma   192.168.1.18       urgent transfer  Suspicious  
3         education     10.0.0.246  monthly subscription       Legal  
4            travel   203.0.113.53       grocery payment       Legal  
Shapes:
X_train: (400, 8)
X_test: (100, 8)

Accuracy: 0.79

Confusion Matrix:
 [[ 6  1  8]
 [ 0 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
df = pd.read_csv("transactions_dataset_500_rich.csv")
print(df.head())


   txn_id  amount currency sender_country receiver_country payment_method  \
0       1   10000      USD             IR               CN    credit_card   
1       2      50      USD             RU               XY     debit_card   
2       3     500      INR             US               CN     debit_card   
3       4   50000      EUR             XY               GB         crypto   
4       5   10000      EUR             IR               CN     debit_card   

  merchant_category     ip_address           description       label  
0            travel     10.0.0.219           school fees       Legal  
1       electronics  203.0.113.145       grocery payment       Legal  
2            pharma   192.168.1.18       urgent transfer  Suspicious  
3         education     10.0.0.246  monthly subscription       Legal  
4            travel   203.0.113.53       grocery payment       Legal  


In [8]:
X = df.drop(columns=["txn_id", "label"])
y = df["label"]

le = LabelEncoder()
for col in X.select_dtypes(include=["object"]).columns:
    X[col] = le.fit_transform(X[col])

scaler = StandardScaler()
X["amount"] = scaler.fit_transform(X[["amount"]])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [9]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.79
Classification Report:
               precision    recall  f1-score   support

     Illegal       0.86      0.40      0.55        15
       Legal       0.83      1.00      0.90        57
  Suspicious       0.67      0.57      0.62        28

    accuracy                           0.79       100
   macro avg       0.78      0.66      0.69       100
weighted avg       0.79      0.79      0.77       100



In [10]:
df_text = df[["description", "label"]]
dataset = Dataset.from_pandas(df_text)

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["description"], padding="max_length", truncation=True)

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset = dataset.class_encode_column("label")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map:   0%|          | 0/500 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 500/500 [00:00<00:00, 7335.24 examples/s]
Flattening the indices: 100%|██████████| 400/400 [00:00<00:00, 29227.58 examples/s]
Casting to class labels: 100%|██████████| 400/400 [00:00<00:00, 10247.76 examples/s]
Flattening the indices: 100%|██████████| 100/100 [00:00<00:00, 20065.56 examples/s]
Casting to class labels: 100%|██████████| 100/100 [00:00<00:00, 11086.36 examples/s]


In [11]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [1]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
)


NameError: name 'TrainingArguments' is not defined

In [1]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


NameError: name 'Trainer' is not defined

In [2]:
metrics = trainer.evaluate()
print(metrics)


NameError: name 'trainer' is not defined

In [3]:
import transformers
print(transformers.__version__)


  from .autonotebook import tqdm as notebook_tqdm


4.56.2


In [4]:
import sys
print(sys.executable)


c:\Users\Goppinath\AppData\Local\Programs\Python\Python39\python.exe


In [15]:
!C:/Users/Goppinath/AppData/Local/Programs/Python/Python39/python.exe -m pip install --upgrade transformers




You should consider upgrading via the 'C:\Users\Goppinath\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [5]:
pip install evaluate 

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Goppinath\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [2]:
# Step 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np

# Step 2: Load dataset (replace with your CSV file if saved)
df = pd.read_csv("transactions_dataset_500_rich.csv")

# Step 3: Map labels to numbers
label2id = {"Legal": 0, "Illegal": 1, "Suspicious": 2}
id2label = {v: k for k, v in label2id.items()}
df["label"] = df["label"].map(label2id)

# Step 4: Split train/test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[["description", "label"]])
test_dataset = Dataset.from_pandas(test_df[["description", "label"]])

# Step 5: Load FinBERT tokenizer & model
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["description"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Step 6: Load FinBERT model (3 labels for Legal/Illegal/Suspicious)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=3, 
    id2label=id2label, 
    label2id=label2id
)

# Step 7: Define metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels, average="weighted")["precision"],
        "recall": recall.compute(predictions=preds, references=labels, average="weighted")["recall"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

# Step 8: Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=50
)

# Step 9: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 10: Train
trainer.train()

# Step 11: Evaluate
metrics = trainer.evaluate()
print(metrics)


Map:   0%|          | 0/400 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 400/400 [00:00<00:00, 3175.59 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 7662.79 examples/s]


TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'

In [3]:
!pip install --upgrade transformers
!pip install --upgrade datasets evaluate




You should consider upgrading via the 'c:\users\goppinath\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\goppinath\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.
