<a href="https://colab.research.google.com/github/Anirookie/Model_Exp/blob/main/Model_Finc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
from transformers import BertTokenizer, BertModel,BertConfig

In [None]:
# Load the transaction dataset from a CSV file in the working directory.
# The cells below read its Description, Amount, Category and Subcategory columns.
df = pd.read_csv('dome.csv')

In [None]:
df.head()

Unnamed: 0,Transaction ID,Date,Description,Amount,Category,Subcategory
0,1,2024-01-01,Salary Payment,3000.0,Income,Salary
1,2,2024-01-02,Coffee Shop,-5.5,Expenditure,Food & Beverage
2,3,2024-01-03,Stock Purchase,-1500.0,Investments,Stocks
3,4,2024-01-04,Rent Payment,-1200.0,Expenditure,Housing
4,5,2024-01-05,Freelance Work,500.0,Income,Freelance


In [None]:
# Encode the labels
# Each target column gets its own LabelEncoder so that predictions can later be
# mapped back to the original strings with inverse_transform.
# NOTE(review): the string columns are overwritten in place, so re-running this
# cell fits the encoders on the already-encoded integers.
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()
df['Category'] = category_encoder.fit_transform(df['Category'])
df['Subcategory'] = subcategory_encoder.fit_transform(df['Subcategory'])

In [None]:
# Standardise Amount; the fitted scaler is reused by preprocess_input at
# prediction time.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test rows contribute to the scaling statistics. The column is
# also overwritten in place, so re-running this cell re-fits on scaled values.
scaler = StandardScaler()
df[['Amount']] = scaler.fit_transform(df[['Amount']])


In [None]:
df["Amount"].describe()

count    5.000000e+01
mean     2.942091e-17
std      1.010153e+00
min     -5.330449e+00
25%      1.841705e-01
50%      2.037791e-01
75%      2.285782e-01
max      3.444994e-01
Name: Amount, dtype: float64

In [None]:
df.duplicated().sum()

0

In [None]:
# # Check the values in the 'Amount' column
# print(df['Amount'].describe())
# print(df['Amount'].value_counts())


count        50.000000
mean      -4466.750000
std       21894.252892
min     -120000.000000
25%        -475.000000
50%         -50.000000
75%         487.500000
max        3000.000000
Name: Amount, dtype: float64
Amount
 3000.0      3
-300.0       2
-2000.0      2
-50.0        2
-500.0       2
-100.0       2
-1200.0      2
-200.0       2
 500.0       2
-1500.0      2
 100.0       1
-3000.0      1
 600.0       1
-600.0       1
-5000.0      1
 200.0       1
-7.0         1
 900.0       1
 550.0       1
-25.0        1
 150.0       1
-120000.0    1
-55.0        1
-400.0       1
 800.0       1
 700.0       1
-40.0        1
-150.0       1
 2000.0      1
-60.0        1
-5.5         1
 450.0       1
-250.0       1
 650.0       1
 750.0       1
-100000.0    1
-20.0        1
 50.0        1
 75.0        1
Name: count, dtype: int64


In [None]:
# # Ensure 'Amount' has variance
# if df['Amount'].std() == 0:
#     print("Warning: 'Amount' column has zero variance. Skipping StandardScaler.")
# else:
#     scaler = StandardScaler()
#     df[['Amount']] = scaler.fit_transform(df[['Amount']])

In [None]:
from transformers import BertTokenizer, BertModel

# Load the tokenizer and model
# Both come from the 'bert-base-uncased' checkpoint. bert_model is only queried
# inside torch.no_grad() in this notebook, i.e. it acts as a fixed feature extractor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize descriptions
# Every row of df is tokenized in a single batch and returned as PyTorch tensors.
train_descriptions = tokenizer(list(df['Description']), padding=True, truncation=True, return_tensors='pt')


In [None]:
# Extract BERT features
# no_grad() skips autograd bookkeeping: BERT is not trained here, only queried.
# Index 0 along the sequence axis selects the first ([CLS]) token of each row.
with torch.no_grad():
    bert_features = bert_model(**train_descriptions).last_hidden_state[:, 0, :]  # CLS token features


In [None]:
bert_features

tensor([[-0.3497,  0.2390, -0.2440,  ..., -0.1196, -0.3912,  0.6157],
        [-0.0448,  0.1907, -0.3153,  ..., -0.2591,  0.2497,  0.0919],
        [-0.5401, -0.1125, -0.3305,  ..., -0.0009,  0.1551,  0.3311],
        ...,
        [-0.0539,  0.1881, -0.3656,  ...,  0.0103, -0.1854,  0.2190],
        [-0.2713,  0.0684, -0.1924,  ...,  0.0146, -0.0506,  0.3271],
        [-0.1231,  0.2576, -0.2130,  ..., -0.0556, -0.0860,  0.2884]])

In [None]:
# Convert the numerical data to tensors
# df[['Amount']] (double brackets) keeps the data 2-D: one column, one row per transaction.
amount_tensor = torch.tensor(df[['Amount']].values, dtype=torch.float32)

# Concatenate BERT features with numerical features
# The amount comes first, so column 0 of combined_features is the scaled amount
# and the remaining columns are the CLS features.
combined_features = torch.cat((amount_tensor, bert_features), dim=1)


In [None]:
# Split the data into training and testing sets
# Features and both label columns are split in one call so rows stay aligned;
# random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(combined_features, df[['Category', 'Subcategory']], test_size=0.2, random_state=42)

# Convert the label frames to PyTorch tensors
# Column 0 holds the Category id and column 1 the Subcategory id.
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)


In [None]:
import torch.nn as nn
from transformers import PreTrainedModel, BertModel

class CombinedModel(PreTrainedModel):
    """BERT encoder plus one scalar amount, feeding two classification heads.

    The [CLS] vector of the encoder is concatenated with the transaction amount
    and passed through a shared 256-unit layer; two linear heads then produce
    category and subcategory logits.

    Args:
        config: Model configuration; ``config.hidden_size`` sets the encoder width.
        num_categories: Number of category classes.
        num_subcategories: Number of subcategory classes.
    """

    def __init__(self, config, num_categories, num_subcategories):
        super().__init__(config)
        # Encoder built from the configuration object passed in.
        self.bert = BertModel(config)
        # +1 input feature for the amount that is appended to the CLS vector.
        self.fc1 = nn.Linear(config.hidden_size + 1, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.category_classifier = nn.Linear(256, num_categories)
        self.subcategory_classifier = nn.Linear(256, num_subcategories)

    def forward(self, bert_input_ids, attention_mask, amount_tensor):
        """Return ``(category_logits, subcategory_logits)`` for a batch.

        Args:
            bert_input_ids: Token ids for the encoder.
            attention_mask: Attention mask matching ``bert_input_ids``.
            amount_tensor: One amount per batch row; any layout holding exactly
                one value per row (``(B,)``, ``(B, 1)``, ``(B, 1, 1)``) is accepted.
        """
        bert_features = self.bert(input_ids=bert_input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]  # CLS token features
        # torch.cat along dim=1 needs a 2-D (batch, 1) amount. The previous
        # squeeze(-1) collapsed a (batch, 1) input to 1-D and made the cat fail.
        amount_tensor = amount_tensor.reshape(bert_features.size(0), -1)
        combined_features = torch.cat((amount_tensor, bert_features), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        category_outputs = self.category_classifier(x)
        subcategory_outputs = self.subcategory_classifier(x)
        return category_outputs, subcategory_outputs




# # Saving the model
# model.save_pretrained("C:\\Users\\anirudh.nandakumar\\Desktop",from_pt=True)

In [None]:
# Load pretrained BERT configuration
# Only the configuration object is fetched here.
config = BertConfig.from_pretrained('bert-base-uncased')  # Use the appropriate BERT variant

# Calculate number of categories and subcategories
num_categories = len(df['Category'].unique())
num_subcategories = len(df['Subcategory'].unique())

# Instantiate the combined model
# NOTE(review): both `CombinedModel` and `model` are redefined by the next cell,
# so this instance is not the one trained below.
model = CombinedModel(config, num_categories, num_subcategories)



In [None]:
class CombinedModel(torch.nn.Module):
    """Two-headed feed-forward classifier over a precomputed feature vector.

    A shared 128 -> 64 ReLU trunk feeds one linear head per target, so a single
    forward pass yields both category and subcategory logits.
    """

    def __init__(self, input_size, num_categories, num_subcategories):
        super().__init__()
        # Shared trunk.
        self.fc1 = torch.nn.Linear(input_size, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        # One output layer per prediction target.
        self.fc3_category = torch.nn.Linear(64, num_categories)
        self.fc3_subcategory = torch.nn.Linear(64, num_subcategories)

    def forward(self, x):
        """Return ``(category_logits, subcategory_logits)`` for the batch ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3_category(hidden), self.fc3_subcategory(hidden)

# Input width = number of columns of combined_features (scaled amount + CLS features).
input_size = combined_features.shape[1]
# One output logit per distinct encoded label.
num_categories = len(df['Category'].unique())
num_subcategories = len(df['Subcategory'].unique())
# This rebinds `model`; the training cells below operate on this instance.
model = CombinedModel(input_size, num_categories, num_subcategories)





In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Define loss and optimizer
# The same cross-entropy criterion is applied to each head's logits.
# NOTE(review): the optimizer is bound to the parameters of the current `model`;
# if `model` is re-created, this cell has to be re-run too.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Create DataLoaders
# Each dataset item pairs one feature row with its [category, subcategory] label row.
train_dataset = TensorDataset(X_train, y_train_tensor)
test_dataset = TensorDataset(X_test, y_test_tensor)

# Only the training batches are shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
!pip install wandb --q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.1/300.1 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import wandb

# Initialize wandb
# Starts the run that the wandb.config.update / wandb.log calls in the
# training cell write to.
wandb.init(project='dome_train', name='training_run')

In [None]:
import time
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# Set hyperparameters and configurations
# NOTE(review): this rebinds `config`, which an earlier cell used for the BertConfig.
config = {
    'num_epochs': 18,
    'learning_rate': optimizer.param_groups[0]['lr'],
    'batch_size': train_dataloader.batch_size,
    # Add any other relevant hyperparameters
}
wandb.config.update(config)

num_epochs = config['num_epochs']

for epoch in range(num_epochs):
    model.train()
    # Per-epoch accumulators, reset at the start of every epoch.
    epoch_loss = 0
    correct_category, total_category = 0, 0
    correct_subcategory, total_subcategory = 0, 0
    start_time = time.time()

    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        category_outputs, subcategory_outputs = model(inputs)
        # labels[:, 0] is the category id, labels[:, 1] the subcategory id.
        loss_category = criterion(category_outputs, labels[:, 0])
        loss_subcategory = criterion(subcategory_outputs, labels[:, 1])
        # Both heads are optimised jointly through the unweighted sum of their losses.
        loss = loss_category + loss_subcategory
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        _, predicted_category = torch.max(category_outputs.data, 1)
        _, predicted_subcategory = torch.max(subcategory_outputs.data, 1)

        total_category += labels[:, 0].size(0)
        correct_category += (predicted_category == labels[:, 0]).sum().item()
        total_subcategory += labels[:, 1].size(0)
        correct_subcategory += (predicted_subcategory == labels[:, 1]).sum().item()

    # These are training-set accuracies, measured on the batches as they were
    # trained on (model in train mode), not a held-out evaluation.
    epoch_accuracy_category = correct_category / total_category
    epoch_accuracy_subcategory = correct_subcategory / total_subcategory
    epoch_duration = time.time() - start_time

    # Log metrics for this epoch to wandb
    # 'loss' is the mean of the per-batch losses for the epoch.
    wandb.log({
        'epoch': epoch + 1,
        'loss': epoch_loss / len(train_dataloader),
        'accuracy_category': epoch_accuracy_category,
        'accuracy_subcategory': epoch_accuracy_subcategory,
        'epoch_duration': epoch_duration,
        'learning_rate': optimizer.param_groups[0]['lr']
    })
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / len(train_dataloader):.4f}, '
          f'Accuracy Category: {epoch_accuracy_category:.4f}, '
          f'Accuracy Subcategory: {epoch_accuracy_subcategory:.4f}, '
          f'Epoch Duration: {epoch_duration:.2f}s')



Epoch [1/18], Loss: 4.6837, Accuracy Category: 0.1250, Accuracy Subcategory: 0.0500, Epoch Duration: 0.01s
Epoch [2/18], Loss: 4.5524, Accuracy Category: 0.3750, Accuracy Subcategory: 0.1250, Epoch Duration: 0.01s
Epoch [3/18], Loss: 4.4452, Accuracy Category: 0.3750, Accuracy Subcategory: 0.1500, Epoch Duration: 0.01s
Epoch [4/18], Loss: 4.4141, Accuracy Category: 0.4250, Accuracy Subcategory: 0.2000, Epoch Duration: 0.00s
Epoch [5/18], Loss: 4.2230, Accuracy Category: 0.5000, Accuracy Subcategory: 0.2000, Epoch Duration: 0.01s
Epoch [6/18], Loss: 4.1363, Accuracy Category: 0.5750, Accuracy Subcategory: 0.2000, Epoch Duration: 0.01s
Epoch [7/18], Loss: 4.1067, Accuracy Category: 0.6500, Accuracy Subcategory: 0.2000, Epoch Duration: 0.01s
Epoch [8/18], Loss: 3.8855, Accuracy Category: 0.7250, Accuracy Subcategory: 0.1750, Epoch Duration: 0.01s
Epoch [9/18], Loss: 3.7636, Accuracy Category: 0.7250, Accuracy Subcategory: 0.1750, Epoch Duration: 0.01s
Epoch [10/18], Loss: 3.6011, Accuracy

In [None]:
# Evaluate the model
def evaluate_model(model, dataloader):
    """Score both classification heads of ``model`` on ``dataloader``.

    Args:
        model: Module whose forward returns ``(category_logits, subcategory_logits)``.
        dataloader: Iterable of ``(inputs, labels)`` batches, where ``labels[:, 0]``
            is the category id and ``labels[:, 1]`` the subcategory id.

    Returns:
        dict with ``accuracy_*``, ``precision_*``, ``recall_*`` and ``f1_*`` entries
        for both ``category`` and ``subcategory``. Precision, recall and F1 are
        support-weighted averages over the classes.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_category, correct_category = 0, 0
    total_subcategory, correct_subcategory = 0, 0
    all_labels = []
    all_predictions_category = []
    all_predictions_subcategory = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            category_outputs, subcategory_outputs = model(inputs)
            # Predicted class = index of the largest logit in each row.
            _, predicted_category = torch.max(category_outputs.data, 1)
            _, predicted_subcategory = torch.max(subcategory_outputs.data, 1)

            total_category += labels[:, 0].size(0)
            correct_category += (predicted_category == labels[:, 0]).sum().item()
            total_subcategory += labels[:, 1].size(0)
            correct_subcategory += (predicted_subcategory == labels[:, 1]).sum().item()

            all_labels.append(labels)
            all_predictions_category.append(predicted_category)
            all_predictions_subcategory.append(predicted_subcategory)

    accuracy_category = correct_category / total_category
    accuracy_subcategory = correct_subcategory / total_subcategory

    # scikit-learn works on host data, so move everything to the CPU once.
    all_labels = torch.cat(all_labels).cpu()
    all_predictions_category = torch.cat(all_predictions_category).cpu()
    all_predictions_subcategory = torch.cat(all_predictions_subcategory).cpu()

    def weighted_scores(targets, predictions):
        """Return (precision, recall, f1) as support-weighted averages."""
        # With few test rows some classes are never predicted (or never occur),
        # which makes their per-class score undefined. zero_division=0 scores
        # them as 0 explicitly instead of emitting UndefinedMetricWarning.
        options = {'average': 'weighted', 'zero_division': 0}
        return (
            precision_score(targets, predictions, **options),
            recall_score(targets, predictions, **options),
            f1_score(targets, predictions, **options),
        )

    # Compute additional metrics
    precision_category, recall_category, f1_category = weighted_scores(
        all_labels[:, 0], all_predictions_category)
    precision_subcategory, recall_subcategory, f1_subcategory = weighted_scores(
        all_labels[:, 1], all_predictions_subcategory)

    return {
        'accuracy_category': accuracy_category,
        'accuracy_subcategory': accuracy_subcategory,
        'precision_category': precision_category,
        'recall_category': recall_category,
        'f1_category': f1_category,
        'precision_subcategory': precision_subcategory,
        'recall_subcategory': recall_subcategory,
        'f1_subcategory': f1_subcategory
    }

# Score the held-out split with the model trained above.
test_metrics = evaluate_model(model, test_dataloader)
print(f'Test Accuracy for Category: {test_metrics["accuracy_category"]:.4f}')
print(f'Test Accuracy for Subcategory: {test_metrics["accuracy_subcategory"]:.4f}')

# Log the test metrics to wandb
wandb.log(test_metrics)

# Finish the wandb run
# NOTE(review): this closes the run; later cells that call wandb.config.update /
# wandb.log presumably need a new wandb.init() first — confirm the intended order.
wandb.finish()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test Accuracy for Category: 0.7000
Test Accuracy for Subcategory: 0.2000


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy_category,▁▃▃▄▅▅▆▇▇▆▆▆▆▆▇███▇
accuracy_subcategory,▁▂▂▃▃▃▃▃▃▃▃▃▆▇█▇██▃
epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
epoch_duration,▄▂▂▁▂▁▅▇█▇▆▅▆▆▆▇▆▃
f1_category,▁
f1_subcategory,▁
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▇▇▇▆▆▆▅▄▄▅▅▃▃▄▂▂▁
precision_category,▁
precision_subcategory,▁

0,1
accuracy_category,0.7
accuracy_subcategory,0.2
epoch,18.0
epoch_duration,0.00612
f1_category,0.69286
f1_subcategory,0.07333
learning_rate,0.001
loss,2.90847
precision_category,0.73333
precision_subcategory,0.045


In [None]:


# The evaluation cell above ends with wandb.finish(), so on a top-to-bottom run
# there is no active run at this point. Start one before touching wandb.config /
# wandb.log; an already-active run is reused as is.
if wandb.run is None:
    wandb.init(project='dome_train', name='training_run')

# Set hyperparameters and configurations
config = {
    'num_epochs': 80,
    'learning_rate': optimizer.param_groups[0]['lr'],
    'batch_size': train_dataloader.batch_size,
    # Add any other relevant hyperparameters
}
wandb.config.update(config)

num_epochs = config['num_epochs']

# `model` and `optimizer` are not re-created here, so these epochs continue
# from the weights and optimizer state left by the previous training cell.
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        category_outputs, subcategory_outputs = model(inputs)
        # labels[:, 0] is the category id, labels[:, 1] the subcategory id.
        loss_category = criterion(category_outputs, labels[:, 0])
        loss_subcategory = criterion(subcategory_outputs, labels[:, 1])
        # Joint objective: unweighted sum of the two heads' losses.
        loss = loss_category + loss_subcategory
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Log the average loss for this epoch to wandb
    wandb.log({'epoch': epoch + 1, 'loss': epoch_loss / len(train_dataloader)})
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / len(train_dataloader):.4f}')

# Evaluate the model
def evaluate_model(model, dataloader):
    """Return ``(category_accuracy, subcategory_accuracy)`` of ``model`` on ``dataloader``.

    ``dataloader`` yields ``(inputs, labels)`` batches; label column 0 is the
    category id and column 1 the subcategory id. The model is switched to eval
    mode and run without gradient tracking.
    """
    model.eval()
    hits = [0, 0]   # correct predictions per head: [category, subcategory]
    seen = [0, 0]   # samples counted per head
    with torch.no_grad():
        for inputs, labels in dataloader:
            category_outputs, subcategory_outputs = model(inputs)
            for head, logits in enumerate((category_outputs, subcategory_outputs)):
                targets = labels[:, head]
                # Predicted class = index of the largest logit in each row.
                predicted = torch.max(logits.data, 1)[1]
                seen[head] += targets.size(0)
                hits[head] += (predicted == targets).sum().item()
    return hits[0] / seen[0], hits[1] / seen[1]

# evaluate_model here is the two-value version defined just above in this cell;
# it replaces the earlier dict-returning definition of the same name.
test_accuracy_category, test_accuracy_subcategory = evaluate_model(model, test_dataloader)
print(f'Test Accuracy for Category: {test_accuracy_category:.4f}')
print(f'Test Accuracy for Subcategory: {test_accuracy_subcategory:.4f}')

# Log the test accuracies to wandb
wandb.log({
    'test_accuracy_category': test_accuracy_category,
    'test_accuracy_subcategory': test_accuracy_subcategory
})

# Finish the wandb run
wandb.finish()

Epoch [1/80], Loss: 2.8446
Epoch [2/80], Loss: 2.9872
Epoch [3/80], Loss: 2.3665
Epoch [4/80], Loss: 2.6478
Epoch [5/80], Loss: 2.1596
Epoch [6/80], Loss: 2.0790
Epoch [7/80], Loss: 2.3346
Epoch [8/80], Loss: 2.1222
Epoch [9/80], Loss: 1.8897
Epoch [10/80], Loss: 2.0427
Epoch [11/80], Loss: 1.8352
Epoch [12/80], Loss: 1.5911
Epoch [13/80], Loss: 1.8396
Epoch [14/80], Loss: 1.7217
Epoch [15/80], Loss: 1.5613
Epoch [16/80], Loss: 1.5984
Epoch [17/80], Loss: 1.2384
Epoch [18/80], Loss: 1.3714
Epoch [19/80], Loss: 1.3010
Epoch [20/80], Loss: 1.3961
Epoch [21/80], Loss: 1.1916
Epoch [22/80], Loss: 1.2193
Epoch [23/80], Loss: 0.8033
Epoch [24/80], Loss: 0.9925
Epoch [25/80], Loss: 0.9293
Epoch [26/80], Loss: 0.9754
Epoch [27/80], Loss: 1.0226
Epoch [28/80], Loss: 0.7578
Epoch [29/80], Loss: 0.8191
Epoch [30/80], Loss: 0.7580
Epoch [31/80], Loss: 0.8116
Epoch [32/80], Loss: 0.6853
Epoch [33/80], Loss: 0.6305
Epoch [34/80], Loss: 0.6296
Epoch [35/80], Loss: 0.5123
Epoch [36/80], Loss: 0.5961
E

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,█▇▆▇▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
test_accuracy_category,▁
test_accuracy_subcategory,▁

0,1
epoch,80.0
loss,0.07873
test_accuracy_category,0.8
test_accuracy_subcategory,0.6


In [None]:
def preprocess_input(description, amount):
    """Build the classifier input row for one transaction.

    The description is embedded with the notebook-level ``tokenizer`` and
    ``bert_model`` (CLS vector), the amount is scaled with the fitted ``scaler``,
    and the two are concatenated with the amount in column 0, matching the
    layout used for the training features.
    """
    # Text branch: CLS embedding, computed without gradient tracking.
    encoded = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        cls_embedding = bert_model(**encoded).last_hidden_state[:, 0, :]

    # Numeric branch: the scaler expects a 2-D input, hence the nested list.
    scaled_amount = torch.tensor(scaler.transform([[amount]]), dtype=torch.float32)

    return torch.cat((scaled_amount, cls_embedding), dim=1)

In [None]:
def predict_category_and_subcategory(model, description, amount):
    """Predict the category and subcategory labels for a single transaction.

    Returns a ``(category_label, subcategory_label)`` pair decoded through the
    notebook-level ``category_encoder`` and ``subcategory_encoder``.
    """
    features = preprocess_input(description, amount)

    model.eval()
    with torch.no_grad():
        category_outputs, subcategory_outputs = model(features)

    # Index of the largest logit per head (one row, so one index each).
    category_index = torch.max(category_outputs.data, 1)[1]
    subcategory_index = torch.max(subcategory_outputs.data, 1)[1]

    # Map the integer class ids back to the original label values.
    category_label = category_encoder.inverse_transform(category_index.numpy())[0]
    subcategory_label = subcategory_encoder.inverse_transform(subcategory_index.numpy())[0]
    return category_label, subcategory_label

# Example usage of the prediction function
# NOTE(review): the two heads are predicted independently, so nothing forces the
# subcategory to belong to the predicted category (the recorded output pairs
# "Expenditure" with "Salary").
description = "Loan"
amount = 50000.0

predicted_category, predicted_subcategory = predict_category_and_subcategory(model, description, amount)
print(f"Prediction: Category - {predicted_category}, Subcategory - {predicted_subcategory}")

Prediction: Category - Expenditure, Subcategory - Salary




In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
!pip install transformers



In [None]:
!pip install huggingface_hub




In [None]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
from huggingface_hub import HfApi

# Assuming your model is defined as CombinedModel as shown earlier
class CombinedModel(BertPreTrainedModel):
    """BERT encoder plus a scalar amount with category / subcategory heads.

    Subclassing ``BertPreTrainedModel`` is what provides ``save_pretrained`` and
    ``push_to_hub``. ``BertPreTrainedModel``, ``torch`` and ``nn`` are imported
    in this cell so that it also runs on a fresh kernel.

    Args:
        config: A ``BertConfig`` that additionally carries
            ``num_labels_category`` and ``num_labels_subcategory``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        # +1 input feature for the amount appended to the CLS vector.
        self.fc1 = nn.Linear(config.hidden_size + 1, 128)
        self.fc2 = nn.Linear(128, 64)
        self.category_classifier = nn.Linear(64, config.num_labels_category)
        self.subcategory_classifier = nn.Linear(64, config.num_labels_subcategory)

    def forward(self, input_ids, attention_mask, token_type_ids, amount):
        """Return ``(category_logits, subcategory_logits)``.

        ``amount`` is unsqueezed on dim 1 before concatenation, so it is
        expected to hold one value per batch row as a 1-D tensor.
        """
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        cls_token_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token output
        # Here the amount is appended after the CLS features (last column).
        combined_input = torch.cat((cls_token_output, amount.unsqueeze(1)), dim=1)
        x = torch.relu(self.fc1(combined_input))
        x = torch.relu(self.fc2(x))
        category_output = self.category_classifier(x)
        subcategory_output = self.subcategory_classifier(x)
        return category_output, subcategory_output

# Create a config with the necessary number of labels
# The head sizes are stored as extra attributes on the BertConfig.
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels_category = len(category_encoder.classes_)
config.num_labels_subcategory = len(subcategory_encoder.classes_)

# Initialize the model with the config
# NOTE(review): this rebinds `model` to a newly constructed CombinedModel. No
# weights are copied from the classifier trained earlier, and the encoder is
# built with BertModel(config) rather than from_pretrained, so what gets saved
# and pushed below appears to be an untrained model — confirm this is intended.
model = CombinedModel(config)

# Save the model and tokenizer to local directories
model_name = "Category"
model_dir = f"./{model_name}"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Push the model to the hub
model.push_to_hub("Ani8Face/Category")
tokenizer.push_to_hub("Ani8Face/Category")


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ani8Face/Category/commit/76ea05f1da321d4b95a93d661761af7e8cfd1776', commit_message='Upload tokenizer', commit_description='', oid='76ea05f1da321d4b95a93d661761af7e8cfd1776', pr_url=None, pr_revision=None, pr_num=None)

### Streamlit application

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os
# Define the CombinedModel class
class CombinedModel(nn.Module):
    """Sequence-classification BERT wrapper with an extra amount feature.

    The wrapped model is loaded from ``config._name_or_path``; only its inner
    ``.bert`` encoder is used in ``forward``. The CLS vector and the amount go
    through a 128 -> 64 ReLU trunk and two linear heads.
    """

    def __init__(self, config, num_labels_category, num_labels_subcategory):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(config._name_or_path, config=config)
        # +1 input feature for the amount appended to the CLS vector.
        self.fc1 = nn.Linear(config.hidden_size + 1, 128)
        self.fc2 = nn.Linear(128, 64)
        self.category_classifier = nn.Linear(64, num_labels_category)
        self.subcategory_classifier = nn.Linear(64, num_labels_subcategory)

    def forward(self, input_ids, attention_mask, token_type_ids, amount):
        """Return ``(category_logits, subcategory_logits)`` for a batch."""
        encoder_out = self.bert.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        cls_vector = encoder_out.last_hidden_state[:, 0, :]  # CLS token output
        features = torch.cat((cls_vector, amount.unsqueeze(1)), dim=1)
        hidden = torch.relu(self.fc2(torch.relu(self.fc1(features))))
        return self.category_classifier(hidden), self.subcategory_classifier(hidden)

# Load the tokenizer from the Hugging Face Hub (the model is built locally below)
model_name = "Ani8Face/Category"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a config with the necessary number of labels
config = AutoConfig.from_pretrained('bert-base-uncased')
num_labels_category = 5  # Update with your actual number of categories
num_labels_subcategory = 5  # Update with your actual number of subcategories
config.num_labels_category = num_labels_category
config.num_labels_subcategory = num_labels_subcategory

# model_name doubles as a local directory name here.
os.makedirs(model_name, exist_ok=True)
# Save the model
# torch.save(model.state_dict(), f"{model_name}/pytorch_model.bin")

model = CombinedModel(config=config, num_labels_category=num_labels_category, num_labels_subcategory=num_labels_subcategory)
# Load the model weights
# NOTE(review): this is a path on the local disk, inside the directory created
# just above; nothing visible in this script writes or downloads the file, so
# the else branch is presumably what runs on a fresh machine.
model_weights_path = f"{model_name}/pytorch_model.bin"
if os.path.exists(model_weights_path):
    model.load_state_dict(torch.load(model_weights_path))
else:
    st.error("Model weights not found. Please ensure the correct path to the model weights.")


# Initialize label encoders and scaler
# NOTE(review): all three are created unfitted and never fitted in this script,
# yet call_gemini_api calls scaler.transform.
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()
scaler = StandardScaler()


##################################
import streamlit as st
# Define functions to interact with the Gemini API
def call_gemini_api(description, amount):
    """Send one transaction to the remote prediction endpoint.

    Args:
        description: Transaction description text.
        amount: Transaction amount (scaled with the module-level ``scaler``).

    Returns:
        ``(predicted_category, predicted_subcategory)`` from the JSON response,
        or ``(None, None)`` after reporting the problem with ``st.error`` when
        the amount cannot be scaled, the request fails, or the status is not 200.
    """
    # Preprocess the input
    inputs = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    try:
        amount_scaled = scaler.transform([[amount]])
    except ValueError as exc:
        # scikit-learn's NotFittedError derives from ValueError, so this also
        # covers a scaler that was never fitted, as well as non-numeric amounts.
        st.error(f"Could not scale the amount: {exc}")
        return None, None

    # Call the Gemini API (example URL and parameters)
    # NOTE(review): "GOOGLE_API_KEY" is a placeholder, not a URL; the request
    # cannot succeed until a real endpoint is configured here.
    api_url = "GOOGLE_API_KEY"
    payload = {
        "model": model_name,
        "inputs": {
            "description": inputs.input_ids.tolist(),
            "amount": amount_scaled.tolist()
        }
    }
    try:
        # A timeout keeps the Streamlit script from hanging on a dead endpoint.
        response = requests.post(api_url, json=payload, timeout=30)
    except requests.RequestException as exc:
        # Covers invalid URLs, connection failures and timeouts.
        st.error(f"Error calling Gemini API: {exc}")
        return None, None

    if response.status_code == 200:
        result = response.json()
        return result['predicted_category'], result['predicted_subcategory']
    else:
        st.error("Error calling Gemini API")
        return None, None

# Streamlit app interface
st.title("Financial Transaction Classifier")

# File upload
uploaded_file = st.file_uploader("Upload a CSV or Excel file", type=["csv", "xlsx"])
if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        data = pd.read_csv(uploaded_file)
    else:
        data = pd.read_excel(uploaded_file)

    st.write("Uploaded Data:")
    st.write(data)

    # Assuming 'Description' and 'Amount' columns are present
    if 'Description' in data.columns and 'Amount' in data.columns:
        descriptions = data['Description'].tolist()
        amounts = data['Amount'].tolist()

        # Get predictions for the entire dataset
        predictions = [call_gemini_api(desc, amt) for desc, amt in zip(descriptions, amounts)]
        # Split the (category, subcategory) pairs column-wise. Unlike
        # `zip(*predictions)`, this also works when the file has no rows.
        categories = [category for category, _ in predictions]
        subcategories = [subcategory for _, subcategory in predictions]

        data['Predicted Category'] = categories
        data['Predicted Subcategory'] = subcategories

        st.write("Data with Predictions:")
        st.write(data)

        # Balance sheet aggregation: total amount per predicted (category, subcategory)
        balance_sheet = data.groupby(['Predicted Category', 'Predicted Subcategory']).agg({'Amount': 'sum'}).reset_index()
        st.write("Balance Sheet:")
        st.write(balance_sheet)

# User input prompt
description_input = st.text_input("Enter a transaction description")
amount_input = st.number_input("Enter the transaction amount", min_value=0.0, format="%.2f")

if st.button("Predict Category and Subcategory"):
    # An empty description or an amount of 0.0 is falsy and skips the prediction.
    if description_input and amount_input:
        category, subcategory = call_gemini_api(description_input, amount_input)
        st.write(f"Predicted Category: {category}")
        st.write(f"Predicted Subcategory: {subcategory}")
    else:
        # Tell the user why nothing happened instead of silently ignoring the click.
        st.warning("Enter a description and a non-zero amount first.")



Writing app.py


In [None]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.196.117.255:8501[0m
[0m
[34m  Stopping...[0m
[34m  Stopping...[0m


# Working version of the app

In [None]:
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from streamlit.web.cli import main as st_main
from dotenv import load_dotenv
import os
import google.generativeai as genai

# Load environment variables
# The API key is read from the GOOGLE_API_KEY environment variable after
# load_dotenv() has run, rather than being written into the source.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Load Hugging Face model and tokenizer
model_name = "Ani8Face/Category"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
# NOTE(review): this global `model` is read inside CombinedModel.__init__ below,
# and the name is then rebound to the CombinedModel instance.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Define the model class
class CombinedModel(nn.Module):
    """Pre-loaded sequence-classification model plus an amount feature.

    The encoder is taken from the module-level ``model`` that exists when the
    instance is constructed. Its CLS vector is concatenated with the amount and
    passed through a 128 -> 64 ReLU trunk and two linear heads.
    """

    def __init__(self, config, num_labels_category, num_labels_subcategory):
        super().__init__()
        # Captures the global `model` as it is bound at construction time.
        self.bert = model  # Load the pre-trained model
        # +1 input feature for the amount appended to the CLS vector.
        self.fc1 = nn.Linear(config.hidden_size + 1, 128)
        self.fc2 = nn.Linear(128, 64)
        self.category_classifier = nn.Linear(64, num_labels_category)
        self.subcategory_classifier = nn.Linear(64, num_labels_subcategory)

    def forward(self, input_ids, attention_mask, token_type_ids, amount):
        """Return ``(category_logits, subcategory_logits)`` for a batch."""
        # self.bert was loaded with AutoModelForSequenceClassification, whose
        # own output carries classification logits and no `last_hidden_state`.
        # `base_model` is the bare encoder inside that wrapper, and its output
        # does expose the hidden states needed below.
        encoder = self.bert.base_model
        outputs = encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        cls_token_output = outputs.last_hidden_state[:, 0, :]  # CLS token output
        combined_input = torch.cat((cls_token_output, amount.unsqueeze(1)), dim=1)
        x = torch.relu(self.fc1(combined_input))
        x = torch.relu(self.fc2(x))
        category_output = self.category_classifier(x)
        subcategory_output = self.subcategory_classifier(x)
        return category_output, subcategory_output

num_labels_category = 5  # Update with your actual number of categories
num_labels_subcategory = 5  # Update with your actual number of subcategories

# Initialize the model
# NOTE(review): nothing later in this cell calls `model`; the predictions shown
# in the UI come from call_gemini_api.
model = CombinedModel(config=config, num_labels_category=num_labels_category, num_labels_subcategory=num_labels_subcategory)

# Initialize label encoders (created unfitted; not referenced again in this cell)
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()

# Define function to interact with the Google Generative AI API
def call_gemini_api(description, amount, all_amounts):
    """Ask the generative text model for a "category, subcategory" answer.

    Args:
        description: Transaction description text.
        amount: Transaction amount, sent to the model unchanged.
        all_amounts: Accepted for backward compatibility and no longer used.
            It previously fed a StandardScaler fitted on every call. The
            resulting z-score depends on the other rows of the upload, can have
            the opposite sign of the real amount, and is always 0.0 when only
            one amount is supplied.

    Returns:
        The generated text, expected to look like ``"Category, Subcategory"``.
        ``"Unknown, Unknown"`` is returned when the call fails (after
        ``st.error``) or when the API returns no text.
    """
    try:
        # The tokenizer output was never used by the request, so it is no longer computed.
        prompt = f"Transaction description: {description}, Amount: {amount}. Provide the category (e.g., Assets, Expenditure, Liabilities) and subcategory."

        # Generative AI API call
        # NOTE(review): confirm that "models/text-bison-001" is still served by
        # the installed google.generativeai version.
        response = genai.generate_text(prompt=prompt, model="models/text-bison-001")

        # Extract the generated text from the response
        generated_text = response.result

        # An empty/None result would break the caller's `.split(',')`, so it is
        # mapped to the same sentinel as a failed call.
        if not generated_text:
            return "Unknown, Unknown"
        return generated_text
    except Exception as e:
        # Deliberately broad: a failed API call should not take the app down.
        st.error(f"Error in calling Gemini API: {e}")
        return "Unknown, Unknown"

def _split_prediction(pred):
    """Split a "category, subcategory" reply into a stripped 2-tuple.

    Returns None when ``pred`` is not a string or does not consist of exactly
    two comma-separated parts.
    """
    if not isinstance(pred, str):
        return None
    parts = pred.split(',')
    if len(parts) != 2:
        return None
    return parts[0].strip(), parts[1].strip()


# Streamlit app interface
st.title("Financial Transaction Classifier")

# File upload
uploaded_file = st.file_uploader("Upload a CSV or Excel file", type=["csv", "xlsx"])
if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        data = pd.read_csv(uploaded_file)
    else:
        data = pd.read_excel(uploaded_file)

    st.write("Uploaded Data:")
    st.write(data)

    # Only proceed when both required columns exist
    if 'Description' in data.columns and 'Amount' in data.columns:
        descriptions = data['Description'].tolist()
        amounts = data['Amount'].tolist()

        # Get predictions for the entire dataset
        predictions = [call_gemini_api(desc, amt, amounts) for desc, amt in zip(descriptions, amounts)]

        # Debugging output for predictions
        st.write("Predictions:")
        st.write(predictions)

        # Handle unexpected responses: anything unparseable becomes Unknown / Unknown
        parsed = [_split_prediction(pred) or ("Unknown", "Unknown") for pred in predictions]
        categories = [category for category, _ in parsed]
        subcategories = [subcategory for _, subcategory in parsed]

        data['Predicted Category'] = categories
        data['Predicted Subcategory'] = subcategories

        st.write("Data with Predictions:")
        st.write(data)

        # Balance sheet aggregation: total amount per predicted (category, subcategory)
        balance_sheet = data.groupby(['Predicted Category', 'Predicted Subcategory']).agg({'Amount': 'sum'}).reset_index()

        # Formatting the balance sheet
        st.write("Balance Sheet:")
        for category in balance_sheet['Predicted Category'].unique():
            st.write(f"### {category}")
            category_data = balance_sheet[balance_sheet['Predicted Category'] == category]
            for _, row in category_data.iterrows():
                st.write(f"{row['Predicted Subcategory']}: ${row['Amount']:,.2f}")
            total = category_data['Amount'].sum()
            st.write(f"**Total {category}: ${total:,.2f}**")

# User input prompt
description_input = st.text_input("Enter a transaction description")
# NOTE(review): min_value=0.0 means negative amounts cannot be entered here —
# confirm that is intended.
amount_input = st.number_input("Enter the transaction amount", min_value=0.0, format="%.2f")

if st.button("Predict Category and Subcategory"):
    # An empty description or an amount of 0.0 is falsy and skips the prediction.
    if description_input and amount_input:
        pred = call_gemini_api(description_input, amount_input, [amount_input])
        parsed = _split_prediction(pred)
        if parsed is None:
            st.write("Prediction error: Unable to parse the response")
        else:
            category, subcategory = parsed
            st.write(f"Predicted Category: {category}")
            st.write(f"Predicted Subcategory: {subcategory}")
    else:
        # Tell the user why nothing happened instead of silently ignoring the click.
        st.warning("Enter a description and a non-zero amount first.")

# NOTE(review): st_main is the Streamlit CLI entry point; confirm it is meant to
# be invoked from inside the app script itself.
if __name__ == '__main__':
    st_main()