In [2]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Running on device:", device)

Running on device: cpu


In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

class DataPreparator:
    """
    Prepares labelled text records for a transformer classifier: splits them
    into train/validation DataFrames and tokenizes lists of texts with the
    tokenizer that belongs to `model_name`.
    """

    def __init__(self, model_name: str, max_length: int = 256):
        """
        :param model_name: Checkpoint name handed to `AutoTokenizer.from_pretrained`.
        :param max_length: Maximum number of tokens `tokenize` keeps per text.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length

    def split_data(self, data, label_col='label', text_col='text', test_size=0.2, random_state=42):
        """
        Splits data into train and validation sets, stratifying by label when possible.

        :param data: Anything `pd.DataFrame(...)` accepts (e.g. a list of dicts).
        :param label_col: Name of the column holding the class label.
        :param text_col: Name of the text column (accepted for API symmetry;
                         not used by the split itself).
        :param test_size: Forwarded to `train_test_split`.
        :param random_state: Forwarded to `train_test_split` for reproducibility.
        :return: `(train_df, val_df)` as two DataFrames.
        :raises ValueError: If even an unstratified split is impossible
                            (e.g. empty input).
        """
        df = pd.DataFrame(data)
        split_kwargs = {"test_size": test_size, "random_state": random_state}

        # A class with a single member can never be stratified, so skip the attempt.
        if df[label_col].value_counts().min() > 1:
            try:
                train_df, val_df = train_test_split(df, stratify=df[label_col], **split_kwargs)
                return train_df, val_df
            except ValueError:
                # scikit-learn also refuses to stratify when the train or test
                # partition would contain fewer rows than there are classes
                # (small datasets). Fall through to a plain random split; if the
                # error had another cause, the call below raises it again.
                pass

        train_df, val_df = train_test_split(df, **split_kwargs)
        return train_df, val_df

    def tokenize(self, texts):
        """
        Applies the tokenizer to a list of texts.

        Sequences are padded to the longest one in the batch and truncated to
        `self.max_length` tokens. Returns the tokenizer's dict-like output of
        PyTorch tensors, including 'input_ids' and 'attention_mask' (further
        keys such as 'token_type_ids' depend on the tokenizer).
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

In [4]:
# Tiny hand-written corpus used to smoke-test DataPreparator.
# "Financial" occurs only once, so split_data cannot stratify this set.
test_data = [
    {"text": sample_text, "label": sample_label}
    for sample_text, sample_label in (
        ("Doc 1 about technology", "Technology"),
        ("Doc 2 about finance", "Financial"),
        ("Doc 3 about technology again", "Technology"),
        ("Doc 4 about marketing", "Marketing"),
        ("Doc 5 about marketing again", "Marketing"),  # Additional sample
    )
]

prep = DataPreparator(model_name="ProsusAI/finbert", max_length=32)
train_df, val_df = prep.split_data(test_data, label_col='label')

# Show both partitions, each under its own heading.
print("Train DataFrame:", train_df, sep="\n")
print("\nValidation DataFrame:", val_df, sep="\n")

# Tokenize the training texts and inspect what the tokenizer hands back.
tokenized = prep.tokenize(train_df['text'].tolist())
print("\nKeys in tokenized output:", tokenized.keys())
print("Shape of input_ids:", tokenized['input_ids'].shape)

Train DataFrame:
                           text       label
4   Doc 5 about marketing again   Marketing
2  Doc 3 about technology again  Technology
0        Doc 1 about technology  Technology
3         Doc 4 about marketing   Marketing

Validation DataFrame:
                  text      label
1  Doc 2 about finance  Financial

Keys in tokenized output: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Shape of input_ids: torch.Size([4, 7])


In [5]:
class BertClassifier(nn.Module):
    """
    Pretrained BERT-style encoder followed by dropout and a single linear
    layer that turns the pooled sequence representation into class logits.
    """

    def __init__(self, model_name: str, num_labels: int):
        """
        :param model_name: Pretrained checkpoint to load (FinBERT, LegalBERT, etc.).
        :param num_labels: Size of the output layer, i.e. the number of classes.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """
        Run the encoder and the classification head.

        :return: Dict with "logits" of shape (batch_size, num_labels); when
                 `labels` is given it also holds "loss", the cross-entropy
                 between the logits and those labels.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output has shape (batch_size, hidden_size)
        logits = self.classifier(self.dropout(encoder_out.pooler_output))

        if labels is None:
            return {"logits": logits}

        criterion = nn.CrossEntropyLoss()
        return {"logits": logits, "loss": criterion(logits, labels)}

In [6]:
# Smoke test: push one random batch through the classifier and check
# which keys come back and what shapes they have.
model = BertClassifier(model_name="ProsusAI/finbert", num_labels=3)

# Dummy batch: random token ids, an all-ones attention mask, arbitrary label ids
batch_size, seq_length = 2, 8
dummy_input_ids = torch.randint(0, 1000, size=(batch_size, seq_length))
dummy_attention_mask = torch.ones(batch_size, seq_length)
dummy_labels = torch.tensor([0, 1])

outputs = model(
    dummy_input_ids,
    attention_mask=dummy_attention_mask,
    labels=dummy_labels,
)
print("Output keys:", outputs.keys())
print("Logits shape:", outputs["logits"].shape)
print("Loss shape:", outputs["loss"].shape)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Output keys: dict_keys(['logits', 'loss'])
Logits shape: torch.Size([2, 3])
Loss shape: torch.Size([])


In [10]:
class DocumentDataset(Dataset):
    """
    Map-style PyTorch Dataset that pairs tokenizer output with integer labels.

    `encodings` is a dict-like object whose values can be indexed by sample
    position (e.g. 'input_ids', 'attention_mask'); `labels` is a sequence of
    integer class ids of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label list defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # Take row `idx` of every encoding field, then attach the label as a long tensor.
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def train_model(model, train_dataset, val_dataset, epochs=2, batch_size=4, lr=1e-5):
    """
    Custom training loop for the classification model.

    Each epoch runs one pass over `train_dataset` (shuffled) with AdamW and a
    linear learning-rate decay without warm-up, then evaluates on
    `val_dataset` and prints the average loss and accuracy.

    :param model: Module whose forward returns a dict with "loss" and "logits"
                  when called as `model(input_ids, attention_mask, labels=...)`.
    :param train_dataset: Dataset yielding 'input_ids', 'attention_mask', 'labels'.
    :param val_dataset: Dataset with the same item layout, used for validation.
    :param epochs: Number of passes over the training data.
    :param batch_size: Batch size for both loaders.
    :param lr: Learning rate for AdamW.
    :return: The trained model (the same object, moved to the module-level `device`).
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The scheduler is stepped once per batch, so it needs the total batch count.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, labels=labels)
            loss = outputs["loss"]
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Training loss: {avg_train_loss:.4f}")

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask, labels=labels)
                val_loss += outputs["loss"].item()

                logits = outputs["logits"]
                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        avg_val_loss = val_loss / len(val_loader)
        accuracy = correct / total
        print(f"Validation loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy:.4f}")

    # Return the model itself. The previous `return model,` (trailing comma)
    # produced the 1-tuple `(model,)`, which breaks callers that go on to call
    # `.state_dict()` or `.eval()` on the result.
    return model

In [11]:
# TODO (original note): replace this hand-written labelling with a csv file

# Folder that holds the .txt documents listed below.
txt_folder = "data"

# File name (relative to txt_folder) -> topic label.
file_to_label = {
    # Anti Money Laundering
    "acip-best-practices-for-the-management-of-ml-tf-and-pf-risks-from-customer-relationships-with-a-nexus-to-digital-assets_100723-(publish).txt": "Anti Money Laundering",
    "BCBS Guidelines for AMLCFT June 2017.txt": "Anti Money Laundering",
    "best-practices-for-countering-trade-based-money-laundering.txt": "Anti Money Laundering",
    "Circular - Non-Face-to-Face Customer Due Diligence Measures-1.txt": "Anti Money Laundering",
    # Consumer Finance
    "MAS 1106A 2023_04_cleaned.txt": "Consumer Finance",
    "MAS Notice 1115_TDSR_290922_cleaned.txt": "Consumer Finance",
    "Notice 1107 Bridging Loans for the Purchase of Immovable Properties_1 Jul 2021_cleaned.txt": "Consumer Finance",
    "Notice 1109 Unsecured Credit Facilities to Individuals_1 Jul 2021_cleaned.txt": "Consumer Finance",
    # Risk Management
    "TRM Guidelines 18 January 2021_cleaned.txt": "Risk Management",
    "Response to Consultation Paper on Management of Outsourced Relevant Services_cleaned.txt": "Risk Management",
    "Outsourcing Guidelines_Jul 2016 revised on 5 Oct 2018_cleaned.txt": "Risk Management",
    "MAS Notices 644 655 644A 655A 1114 1118 Cancellation 2024_cleaned.txt": "Risk Management",
}

# Read every labelled file from disk and keep its text next to its known label.
data_entries = []
for filename, label in file_to_label.items():
    file_path = os.path.join(txt_folder, filename)
    with open(file_path, encoding="utf-8") as f:
        text_content = f.read()
    data_entries.append(dict(text=text_content, label=label))

# ----------------------------------------------------
# data_entries is now a list of {"text": ..., "label": ...} records.
# Give each distinct label an integer id, numbered in sorted order.
unique_labels = sorted(set(file_to_label.values()))
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

print("Unique labels found:", unique_labels)
print("Label2ID map:", label2id)

# ----------------------------------------------------
# Split, tokenize the texts, encode the labels as ids, and build the Datasets.
model_name = "ProsusAI/finbert"
prep = DataPreparator(model_name=model_name, max_length=128)

train_df, val_df = prep.split_data(data_entries, label_col="label", test_size=0.2)

train_encodings = prep.tokenize(train_df["text"].tolist())
train_labels = train_df["label"].map(label2id).tolist()
train_dataset = DocumentDataset(train_encodings, train_labels)

val_encodings = prep.tokenize(val_df["text"].tolist())
val_labels = val_df["label"].map(label2id).tolist()
val_dataset = DocumentDataset(val_encodings, val_labels)

# ----------------------------------------------------
# Build the classifier with one output per label and fine-tune it.
num_labels = len(unique_labels)
model = BertClassifier(model_name, num_labels=num_labels)

trained_model = train_model(model, train_dataset, val_dataset, epochs=2, batch_size=2, lr=2e-5)

# Persist the fine-tuned weights (optional)
torch.save(trained_model.state_dict(), "finbert_classifier.pt")
print("Model training complete and saved.")

Unique labels found: ['Anti Money Laundering', 'Consumer Finance', 'Risk Management']
Label2ID map: {'Anti Money Laundering': 0, 'Consumer Finance': 1, 'Risk Management': 2}


Epoch 1/2: 100%|██████████████████████████████████| 5/5 [00:03<00:00,  1.28it/s]


Epoch 1, Training loss: 1.3175
Validation loss: 1.1174, Validation Accuracy: 0.3333


Epoch 2/2: 100%|██████████████████████████████████| 5/5 [00:03<00:00,  1.41it/s]


Epoch 2, Training loss: 1.0808
Validation loss: 1.0756, Validation Accuracy: 0.6667
Model training complete and saved.


In [14]:
def predict_topic(trained_model, text, tokenizer, label2id_dict, max_length=128):
    """
    Predict the topic for a single text using the trained model.

    :param trained_model: Module returning a dict with "logits" when called as
                          `trained_model(input_ids, attention_mask=...)`.
                          It is switched to eval mode and left that way.
    :param text: The raw document text to classify.
    :param tokenizer: Callable tokenizer returning 'input_ids' and
                      'attention_mask' tensors.
    :param label2id_dict: Mapping from label string to the integer id used in training.
    :param max_length: Truncation length in tokens; should match the value
                       used when the model was trained (default 128).
    :return: The predicted label string.
    :raises KeyError: If the predicted id has no entry in `label2id_dict`.
    """
    trained_model.eval()
    inputs = tokenizer([text], padding=True, truncation=True, max_length=max_length, return_tensors='pt')

    # Send the inputs to wherever the model's weights live instead of relying
    # on a module-level `device`, so a model on a different device still works.
    first_param = next(trained_model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = inputs['input_ids'].to(model_device)
    attention_mask = inputs['attention_mask'].to(model_device)

    with torch.no_grad():
        outputs = trained_model(input_ids, attention_mask=attention_mask)
        logits = outputs['logits']
        preds = torch.argmax(logits, dim=1).cpu().item()

    # Reverse lookup: find label string from predicted ID
    id2label_local = {v: k for k, v in label2id_dict.items()}
    return id2label_local[preds]


# Example: classify one document that was not part of the labelled set above.
unlabeled_file = "data/Guidelines on Outsourcing Banks_cleaned.txt"
with open(unlabeled_file, encoding="utf-8") as f:
    text_content = f.read()

predicted_label = predict_topic(
    trained_model,
    text_content,
    prep.tokenizer,
    label2id,
)
print(f"Predicted label for '{unlabeled_file}': {predicted_label}")

Predicted label for 'data/Guidelines on Outsourcing Banks_cleaned.txt': Risk Management


In [15]:
# Classify the most recently loaded document again; check the printed topic
# against the label you already know for that file.
predicted = predict_topic(
    trained_model=trained_model,
    text=text_content,
    tokenizer=prep.tokenizer,
    label2id_dict=label2id,
)
print("Predicted topic:", predicted)

Predicted topic: Risk Management


## Evaluation

In [7]:
class DocumentDataset(Dataset):
    """
    Map-style PyTorch Dataset over tokenized documents and their class ids.

    `encodings` maps field names (e.g. 'input_ids', 'attention_mask') to
    values indexable by sample position; `labels` holds one integer per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice row `idx` out of every encoding field, then add the label tensor.
        item = dict((key, val[idx]) for key, val in self.encodings.items())
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def train_model(model, train_dataset, val_dataset, epochs=2, batch_size=4, lr=1e-5):
    """
    Fine-tune `model` with a hand-written loop and report validation metrics.

    Every epoch makes one shuffled pass over `train_dataset` (AdamW with a
    linear decay schedule, no warm-up) and then scores `val_dataset`, printing
    average loss and accuracy.

    :return: Tuple `(model, val_loader)` - the trained model and the
             validation DataLoader built here, so callers can reuse it.
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=lr)
    # One scheduler step per training batch over all epochs.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )

    model.to(device)

    def run_batch(batch):
        """Move one batch to `device`, run the model, return (outputs, labels)."""
        labels = batch['labels'].to(device)
        outputs = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            labels=labels,
        )
        return outputs, labels

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            outputs, _ = run_batch(batch)
            loss = outputs["loss"]
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Training loss: {avg_train_loss:.4f}")

        # ---- validation pass ----
        model.eval()
        val_loss, correct, total = 0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                outputs, labels = run_batch(batch)
                val_loss += outputs["loss"].item()

                preds = outputs["logits"].argmax(dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        avg_val_loss = val_loss / len(val_loader)
        accuracy = correct / total
        print(f"Validation loss: {avg_val_loss:.4f}, Validation Accuracy: {accuracy:.4f}")

    return model, val_loader

In [12]:
# TODO (original note): replace this hand-written labelling with a csv file

# Folder that holds the cleaned .txt documents listed below.
txt_folder = "cleaned_data_for_testing"
# File name (relative to txt_folder) -> topic label.
file_to_label = {
    "acip-best-practices-for-the-management-of-ml-tf-and-pf-risks-from-customer-relationships-with-a-nexus-to-digital-assets_100723-(publish)_cleaned.txt": "Anti Money Laundering",
    "BCBS Guidelines for AMLCFT June 2017_cleaned.txt": "Anti Money Laundering",
    "best-practices-for-countering-trade-based-money-laundering_cleaned.txt": "Anti Money Laundering",
    "Circular on MyInfo and CDD on NFTF business relations_cleaned.txt":"Anti Money Laundering",
    "Effective Practices to Detect and Mitigate the Risk from Misuse of Legal Persons June 2019_cleaned.txt":"Anti Money Laundering",
    "Guidelines for Financial Institutions to Safeguard the Integrity of Singapores Financial System  Sep 2011_cleaned.txt":"Anti Money Laundering",
    "Guidelines to MAS Notice 1014  November 2015_cleaned.txt":"Anti Money Laundering",
    "Guidelines to PSN01 dated 2 April 2024_cleaned.txt":"Anti Money Laundering",
    "Industry Perspectives  Adopting Data Analytics Methods for AMLCFT_cleaned.txt":"Anti Money Laundering",
    "Infographic on Effective AMLCFT Transaction Monitoring Controls_cleaned.txt":"Anti Money Laundering",
    "MAS Notice 1014 last revised on 1 March 2022_cleaned.txt":"Anti Money Laundering",
    "MAS Notice PSN10 - Exempt Payment Service Providers_cleaned.txt":"Anti Money Laundering",
    "Notice PSM-N01 dated 1 March 2022-1_cleaned.txt":"Anti Money Laundering",


    "2021-06-28 MAS Notice 1106_cleaned.txt":"Consumer Finance",
    "2021-06-28 Notice 1106B_COVID_LTV_cleaned.txt":"Consumer Finance",
    "2021-06-28 Notice 1115A_COVID_TDSR_cleaned.txt":"Consumer Finance",
    # NOTE(review): the next key is repeated further down with the label
    # "Risk Management". In a dict literal the later value wins, so this
    # "Consumer Finance" label is silently discarded and the dict ends up with
    # 48 entries, not 49. Decide which label is correct and delete the other line.
    "Compliance Toolkit for Merchant Banks Last Revised 12 September 2024_cleaned.txt":"Consumer Finance",
    "MAS 1106A 2023_04_cleaned.txt":"Consumer Finance",
    "MAS Notice 1113 Motor Vehicle Loans - Merchant Bank_1 July 2021_cleaned.txt":"Consumer Finance",
    "MAS Notice 1115_TDSR_290922_cleaned.txt":"Consumer Finance",
    "Notice 1107 Bridging Loans for the Purchase of Immovable Properties_1 Jul 2021_cleaned.txt":"Consumer Finance",
    "Notice 1109 Unsecured Credit Facilities to Individuals_1 Jul 2021_cleaned.txt":"Consumer Finance",

    "7 Credit Facilities to Directors Related Corporations etc 01 Nov 1985_cleaned.txt":"Risk Management",
    "8 Credit Facilities and Limits 01 Nov 1985_cleaned.txt":"Risk Management",
    "BCM-Guidelines-June-2022_cleaned.txt":"Risk Management",
    "blue_book_wholesale_cleaned.txt":"Risk Management",
    "Board-and-Senior-Mgmt_01 Jul 2021_cleaned.txt":"Risk Management",
    # NOTE(review): duplicate of the key listed under "Consumer Finance" above;
    # this is the value that actually takes effect.
    "Compliance Toolkit for Merchant Banks Last Revised 12 September 2024_cleaned.txt":"Risk Management",
    "Consultation Paper on Guidelines on Transition Planning Banks_cleaned.txt":"Risk Management",
    "Directive 5_cleaned.txt":"Risk Management",
    "FAQ - Notice on Cyber Hygiene_cleaned.txt":"Risk Management",
    "FAQ - Notice on Technology Risk Management_cleaned.txt":"Risk Management",
    "FAQ for Notice 658 and 1121_11Dec2024_cleaned.txt":"Risk Management",
    "Guidelines on Definition of a Deposit_01 Jul 2021_cleaned.txt":"Risk Management",
    "Guidelines on Environmental Risk Management for Banks_cleaned.txt":"Risk Management",
    "Guidelines on Outsourcing Banks_cleaned.txt":"Risk Management",
    "Guidelines on Risk Management Practices  Internal Controls July 2024_cleaned.txt":"Risk Management",
    "Information Paper on Environmental Risk Management Banks_cleaned.txt":"Risk Management",
    "MAS NOTICE 1004_11062021_cleaned.txt":"Risk Management",
    "MAS Notice 1005_29 Jun 2021_cleaned.txt":"Risk Management",
    "MAS Notice 1005A_30 Jun 2021_cleaned.txt":"Risk Management",
    "MAS Notice 1015 effective 01 July 2024_cleaned.txt":"Risk Management",
    "MAS Notice 1108 Cancellation_cleaned.txt":"Risk Management",
    "MAS Notice 1121 - 4_cleaned.txt":"Risk Management",
    "MAS Notice FSM-N11_cleaned.txt":"Risk Management",
    "MAS Notices 644 655 644A 655A 1114 1118 Cancellation 2024_cleaned.txt":"Risk Management",
    "Outsourcing Guidelines_Jul 2016 revised on 5 Oct 2018_cleaned.txt":"Risk Management",
    "Response to Consultation Paper on Management of Outsourced Relevant Services_cleaned.txt":"Risk Management",
    "TRM Guidelines 18 January 2021_cleaned.txt":"Risk Management", 
    
    
}

# Load each labelled file and store its text together with its known label.
data_entries = []
for filename, label in file_to_label.items():
    file_path = os.path.join(txt_folder, filename)
    with open(file_path, encoding="utf-8") as f:
        text_content = f.read()
    data_entries.append(dict(text=text_content, label=label))

# ----------------------------------------------------
# data_entries is now a list of {"text": ..., "label": ...} records.
# Number the distinct labels in sorted order to get integer class ids.
unique_labels = sorted(set(file_to_label.values()))
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

print("Unique labels found:", unique_labels)
print("Label2ID map:", label2id)

# ----------------------------------------------------
# Split, tokenize the texts, encode the labels as ids, and build the Datasets.
model_name = "ProsusAI/finbert"
prep = DataPreparator(model_name=model_name, max_length=128)

train_df, val_df = prep.split_data(data_entries, label_col="label", test_size=0.2)

train_encodings = prep.tokenize(train_df["text"].tolist())
train_labels = train_df["label"].map(label2id).tolist()
train_dataset = DocumentDataset(train_encodings, train_labels)

val_encodings = prep.tokenize(val_df["text"].tolist())
val_labels = val_df["label"].map(label2id).tolist()
val_dataset = DocumentDataset(val_encodings, val_labels)

# ----------------------------------------------------
# Build the classifier with one output per label and fine-tune it.
# train_model also hands back the validation loader for later evaluation.
num_labels = len(unique_labels)
model = BertClassifier(model_name, num_labels=num_labels)

trained_model, val_loader = train_model(
    model, train_dataset, val_dataset, epochs=2, batch_size=2, lr=2e-5
)

# Persist the fine-tuned weights (optional)
torch.save(trained_model.state_dict(), "finbert_classifier.pt")
print("Model training complete and saved.")

Unique labels found: ['Anti Money Laundering', 'Consumer Finance', 'Risk Management']
Label2ID map: {'Anti Money Laundering': 0, 'Consumer Finance': 1, 'Risk Management': 2}


Epoch 1/2: 100%|██████████| 19/19 [00:08<00:00,  2.13it/s]


Epoch 1, Training loss: 1.0273
Validation loss: 0.9999, Validation Accuracy: 0.5000


Epoch 2/2: 100%|██████████| 19/19 [00:07<00:00,  2.41it/s]


Epoch 2, Training loss: 0.9252
Validation loss: 0.9727, Validation Accuracy: 0.5000
Model training complete and saved.


In [14]:
# Evaluate the model on the validation set
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

trained_model.eval()
predictions, true_labels = [], []

# Inference only: disable gradient tracking once for the whole pass.
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        outputs = trained_model(input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]

        # Collect plain Python ints so sklearn receives simple label lists.
        predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
# zero_division=0 scores a class with no predicted samples as 0 (the same
# value sklearn falls back to by default) without emitting
# UndefinedMetricWarning. A low weighted precision can therefore mean the
# model never predicts some class.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predictions, average='weighted', zero_division=0
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.5
Precision: 0.25
Recall: 0.5
F1 Score: 0.3333333333333333


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
