## **Disclaimer**: This notebook is to be run in Google Colab with GPU.
In order to run this notebook you will need to have the file account_names.txt in your current directory.

The goal of this notebook is to cluster financial account names, given that we are in the presence of a unique amount of account names.

The notebook will output a csv file containing the cluster number for each account name. You will have to bring that .csv file output to the directory of your project and use it as input.

In [6]:
import re
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import BertModel, BertTokenizer
import torch
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

In [7]:
with open('account_names.txt', 'r') as file:
    account_names = file.read().splitlines()

# Normalize account names by adding spaces between camel case words
def normalize_account_name(name):
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', name).strip().lower()

normalized_names = [normalize_account_name(name) for name in account_names]

In [8]:
# Set the multiprocessing start method to 'spawn'
multiprocessing.set_start_method('spawn', force=True)

# Load FinBERT model and tokenizer
model_name = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Check if a GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to get embeddings from FinBERT
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten()

# Process embeddings without parallelizing the GPU operation
def process_embeddings(batch):
    return [get_embedding(text) for text in batch]

# Batch size for processing embeddings
batch_size = 100

# Split normalized names into batches
batches = [normalized_names[i:i + batch_size] for i in range(0, len(normalized_names), batch_size)]

# Process embeddings in batches
embeddings = []
for batch in batches:
    embeddings.extend(process_embeddings(batch))

embeddings = np.array(embeddings)

In [9]:
embeddings

array([[-0.06058354, -0.53952146, -0.9968411 , ...,  0.13485558,
         0.81674385,  0.30366534],
       [ 0.29919016, -0.73995787, -0.08689369, ..., -0.02804742,
         0.75621545,  0.62906927],
       [-0.5456445 , -0.36958534, -0.44062454, ...,  0.06302413,
         0.55505574, -0.5086833 ],
       ...,
       [-0.01857014, -0.3749677 , -0.52470124, ...,  0.9990641 ,
        -0.4990212 ,  0.46320498],
       [ 0.3328819 , -0.4053446 , -0.7610476 , ..., -0.1243699 ,
         0.6367438 , -0.01542942],
       [-1.1692545 , -0.39621714,  0.1303403 , ...,  0.13932249,
         0.09161877, -1.8154769 ]], dtype=float32)

In [10]:
# Determine the optimal number of clusters using silhouette score
sil_scores = []
range_n_clusters = list(range(2, 21))  # Test between 2 and 20 clusters

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings)
    sil_score = silhouette_score(embeddings, cluster_labels)
    sil_scores.append(sil_score)

optimal_clusters = range_n_clusters[sil_scores.index(max(sil_scores))]
print(f'Optimal number of clusters: {optimal_clusters}')

# Apply K-Means clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Create a DataFrame to see the clusters
clustered_df = pd.DataFrame({'account_name': account_names, 'cluster': labels})



Optimal number of clusters: 18




In [11]:
clustered_df

Unnamed: 0,account_name,cluster
0,AcceleratedShareRepurchaseProgramAdjustment,6
1,AcceleratedShareRepurchasesFinalPricePaidPerShare,10
2,AcceleratedShareRepurchasesSettlementPaymentOr...,10
3,AccountsAndNotesReceivableNet,15
4,AccountsAndOtherReceivablesNetCurrent,15
...,...,...
4407,WeightedAverageNumberOfSharesOutstandingBasic,10
4408,WeightedAverageNumberOfSharesRestrictedStock,10
4409,WithdrawalFromContractHoldersFunds,0
4410,WorkersCompensationLiabilityCurrent,15


In [12]:
# Save the clustered DataFrame to a CSV file
output_file_path = 'clustered_account_names.csv'
clustered_df.to_csv(output_file_path, index=False)

print(f'Clustered account names have been saved to {output_file_path}')

Clustered account names have been saved to clustered_account_names.csv
