In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import os

In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device




device(type='cpu')

In [None]:

# csv_path = r"C:\Users\final\FinancialTrackerApp\model_code\expenses_data.csv"
BASE_DIR = os.path.join(os.getcwd(), "model_code")
csv_path = os.path.join(BASE_DIR, "expenses_data.csv")
df = pd.read_csv(csv_path)
texts = df['text']       
labels = df['category']  

In [30]:
#converting text data into numerical vectors
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
X


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 180 stored elements and shape (76, 111)>

In [31]:
# Encoding the labels
encoder = LabelEncoder()
y = encoder.fit_transform(labels)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
#Train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert sparse matrices to dense arrays to avoid type errors
X_train = X_train.toarray()
X_test = X_test.toarray()


X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)


In [33]:
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)


In [34]:
#Define model
class ExpenseClassifier(nn.Module):
    def __init__(self, input_size,hidden_size, num_classes):
        super(ExpenseClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self,x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [35]:
input_size = X_train.shape[1]
hidden_size = 64
num_classes = len(set(y))

model = ExpenseClassifier(input_size, hidden_size, num_classes)
model.to(device)

ExpenseClassifier(
  (fc1): Linear(in_features=111, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=4, bias=True)
)

In [36]:
#Loss and optimizer
criterion = nn.CrossEntropyLoss() # compares predicted catgeory vs actual
optimizer = optim.Adam(model.parameters(), lr=0.01) # updates model weights efficiently


In [37]:
#Training loop
epochs = 200
for epoch in range(epochs):
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if(epoch+1)%10==0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [10/200], Loss: 0.8718
Epoch [20/200], Loss: 0.3412
Epoch [30/200], Loss: 0.0509
Epoch [40/200], Loss: 0.0076
Epoch [50/200], Loss: 0.0023
Epoch [60/200], Loss: 0.0012
Epoch [70/200], Loss: 0.0009
Epoch [80/200], Loss: 0.0007
Epoch [90/200], Loss: 0.0006
Epoch [100/200], Loss: 0.0005
Epoch [110/200], Loss: 0.0005
Epoch [120/200], Loss: 0.0005
Epoch [130/200], Loss: 0.0004
Epoch [140/200], Loss: 0.0004
Epoch [150/200], Loss: 0.0004
Epoch [160/200], Loss: 0.0003
Epoch [170/200], Loss: 0.0003
Epoch [180/200], Loss: 0.0003
Epoch [190/200], Loss: 0.0003
Epoch [200/200], Loss: 0.0002


In [38]:
#Evaluation
with torch.inference_mode():
    y_pred = model(X_test) #forward pass
    y_pred_classes = torch.argmax(y_pred, dim=1 ) #pick highest probability class
    acc = (y_pred_classes == y_test).float().mean() #accuracy
    print(f'Accuracy: {acc.item():.4f}')

Accuracy: 0.7500


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
totals = {
    "food": 0.0,
    "medical": 0.0,
    "transport": 0.0,
    "others": 0.0
}

def predict(text):
    # Vectorize and move to correct device
    vec = vectorizer.transform([text]).toarray()
    vec = torch.tensor(vec, dtype=torch.float32).to(device)

    # Predict category
    output = model(vec)
    pred = torch.argmax(output, 1).item()
    pred = encoder.inverse_transform([pred])[0]

    # Extract all amounts
    doc = nlp(text)
    amount = 0.0
    for ent in doc.ents:
        if ent.label_ in ["MONEY", "CARDINAL"]:
            try:
                amount += float(ent.text)  # sum all numbers in the sentence
            except:
                pass

    totals[pred] += amount
    return totals
# testing:
# predict("Bought apples for 80 rs")   
# predict("Hospital bill as 750")   
# predict("Taxi fare as 300")       
# predict("Netflix subscription as 500") 
# predict("Train from Velachery as 150") 
# predict("Spent 500 at hotel")

print(totals)


{'food': 580.0, 'medical': 750.0, 'transport': 450.0, 'others': 500.0}


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("I spent Rs. 700 and 300 on food on 26th July")
for ent in doc.ents:
    print(ent.text, ent.label_)


700 CARDINAL
300 CARDINAL
26th July DATE
