In [31]:
import tensorflow as tf
import numpy as np
import pandas as pd

print(tf.__version__)
print(np.__version__)

2.4.0
1.19.5


# Data Cleaning

In [3]:
# pull down data from the Expense Tagging repo
df = pd.read_csv("https://github.com/EZ-Walk/Expense-Tagging/blob/f6b58fbcc1270d6bcdab82873673ee34c6d1e84a/Data/raw/AccountHistory%20(1).csv?raw=true")
print(df.shape)
df.head()

(3509, 8)


Unnamed: 0,Account Number,Post Date,Check,Description,Debit,Credit,Status,Balance
0,443091309,4/27/2024,,"GITHUB, INC. +18774484820 CAUS",10.0,,Pending,
1,443091309,4/25/2024,,LIME*RIDE +18885463345 CAUS,4.44,,Pending,
2,443091309,4/27/2024,,Deposit weekly allowance,,120.0,Posted,116.43
3,443091309,4/27/2024,,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...,2.7,,Posted,-3.57
4,443091309,4/27/2024,,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,19.24,,Posted,-0.87


In [4]:
# keep only rows with Debit!=NaN, Status=Posted, and Account Number == 4430913
df = df[df['Debit'].notnull()]
df = df[df['Status'] == 'Posted']
df = df[df['Account Number'] == 443091309]
print('Data shape:', df.shape)
df.head()

Data shape: (2726, 8)


Unnamed: 0,Account Number,Post Date,Check,Description,Debit,Credit,Status,Balance
3,443091309,4/27/2024,,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...,2.7,,Posted,-3.57
4,443091309,4/27/2024,,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,19.24,,Posted,-0.87
5,443091309,4/27/2024,,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,2.75,,Posted,18.37
6,443091309,4/26/2024,,Point Of Sale Withdrawal TST* FAMOUS ORI 713 E...,10.8,,Posted,21.12
7,443091309,4/26/2024,,External Withdrawal PAYPAL INSTANT TRANSFER - ...,60.0,,Posted,31.92


In [5]:
# keep only the desired columns
data = df[['Post Date', 'Debit', 'Description']]
print('Data shape:', data.shape)
data.head()

Data shape: (2726, 3)


Unnamed: 0,Post Date,Debit,Description
3,4/27/2024,2.7,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...
4,4/27/2024,19.24,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...
5,4/27/2024,2.75,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...
6,4/26/2024,10.8,Point Of Sale Withdrawal TST* FAMOUS ORI 713 E...
7,4/26/2024,60.0,External Withdrawal PAYPAL INSTANT TRANSFER - ...


In [6]:
# convert the 'Post Date' column to datetime
data['Date'] = data['Post Date'].astype('datetime64')
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Date'] = data['Post Date'].astype('datetime64')


Unnamed: 0,Post Date,Debit,Description,Date
3,4/27/2024,2.70,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...,2024-04-27
4,4/27/2024,19.24,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,2024-04-27
5,4/27/2024,2.75,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,2024-04-27
6,4/26/2024,10.80,Point Of Sale Withdrawal TST* FAMOUS ORI 713 E...,2024-04-26
7,4/26/2024,60.00,External Withdrawal PAYPAL INSTANT TRANSFER - ...,2024-04-26
...,...,...,...,...
3502,5/24/2018,7.35,Point Of Sale Withdrawal DEBIT PURCHASE CHIC...,2018-05-24
3503,5/22/2018,45.25,Point Of Sale Withdrawal POS PURCHASE CORNER...,2018-05-22
3504,5/17/2018,45.66,External Withdrawal ANYTIME FIT ABC/CLUB FEES/...,2018-05-17
3505,5/8/2018,9.47,Point Of Sale Withdrawal DEBIT PURCHASE CHIP...,2018-05-08


In [7]:
# fill in dates without expenses as 0
data = data.set_index('Date').resample('D').sum().fillna(0)

In [8]:
# Print the shape one last time and the date range encomapssed by the data as a Month, Day, Year
print('Data shape:', data.shape)
date_range = data.index[[0, -1]]
print('Date range:', date_range[0].strftime('%B %d, %Y'), 'to', date_range[1].strftime('%B %d, %Y'))

Data shape: (2183, 1)
Date range: May 07, 2018 to April 27, 2024


In [9]:
data.to_csv('data/transactions.csv', index=False)

# Pre Processing

In [10]:
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv('data/transactions.csv')
print(df.shape)
df.head()



(2183, 1)


Unnamed: 0,Debit
0,10.0
1,9.47
2,0.0
3,0.0
4,0.0


In [53]:
# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
df['norm_Debit'] = scaler.fit_transform(df['Debit'].values.reshape(-1,1))
df.head()

Unnamed: 0,Debit,norm_Debit
0,10.0,0.000294
1,9.47,0.000278
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [54]:
# Transform the data into sequences
sequence_length = 32
result = []
for index in range(len(df) - sequence_length+1):
    result.append(df['norm_Debit'].values[index: index + sequence_length])

result = np.array(result)

In [55]:
# Split the data into training and testing sets
train_size = round(0.9 * result.shape[0])  # 90% for training
x_train = result[:int(train_size), :-1]
y_train = result[:int(train_size), -1]
x_test = result[int(train_size):, :-1]
y_test = result[int(train_size):, -1]
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1937, 31) (1937,) (215, 31) (215,)


In [56]:
# Save the training data
np.save('data/x_train.npy', x_train)
np.save('data/y_train.npy', y_train)

# Save the testing data
np.save('data/x_test.npy', x_test)
np.save('data/y_test.npy', y_test)

# Model Building

In [57]:
# Define the model architecture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [None]:


model = Sequential()
model.add(LSTM(100, activation='tanh', return_sequences=True, input_shape=(378, 1)))
model.add(Dropout(0.2))
model.add(LSTM(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='tanh'))

In [72]:
# Define a simple model
model = Sequential()
model.add(LSTM(100, activation='tanh', return_sequences=True, input_shape=(32, 1)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='tanh'))


In [59]:
# Compile the model
from tensorflow.keras.optimizers import Adam, SGD
adam = Adam(learning_rate=0.001)
sgd = SGD(learning_rate=1.0)

model.compile(optimizer=sgd, loss='mean_absolute_error')

In [60]:
# Print the model summary
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 32, 30)            3840      
_________________________________________________________________
dropout_6 (Dropout)          (None, 32, 30)            0         
_________________________________________________________________
dense_4 (Dense)              (None, 32, 1)             31        
Total params: 3,871
Trainable params: 3,871
Non-trainable params: 0
_________________________________________________________________


# Model training

In [61]:
# Load the training and testing data
x_train = np.load('data/x_train.npy')
x_train = np.expand_dims(x_train, axis=2)
y_train = np.load('data/y_train.npy')
y_train = np.expand_dims(y_train, axis=1)

# Load the testing data
x_test = np.load('data/x_test.npy')
y_test = np.load('data/y_test.npy')
x_test = np.expand_dims(x_test, axis=2)
y_test = np.expand_dims(y_test, axis=1)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1937, 31, 1) (1937, 1) (215, 31, 1) (215, 1)


In [62]:
# Reshape input to be 3D [samples, timesteps, features]
# x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))

# Fit the model
history = model.fit(
    x_train, y_train,
    epochs=10,  # adjust this value based on your requirements
    batch_size=64,  # adjust this value based on your requirements
    # validation_split=0.1,  # 10% of the data will be used for validation
    shuffle=False,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
import altair as alt
# Prepare the data for plotting
# Prepare the data for plotting
data = pd.DataFrame({
    'Epoch': list(range(1, len(history.history['loss']) + 1)) * 2,
    'Loss': history.history['loss'] + history.history['val_loss'],
    'Type': ['Training Loss'] * len(history.history['loss']) + ['Validation Loss'] * len(history.history['val_loss'])
})

# Plot the loss
alt.Chart(data).mark_line().encode(
    x='Epoch',
    y='Loss',
    color='Type'
).interactive()

In [71]:
# predict the next 30 days of expenses
predictions = model.predict(x_test)
predictions.shape

(215, 31, 1)

In [68]:
daily_expenses = scaler.inverse_transform(predictions)

ValueError: Found array with dim 3. Estimator expected <= 2.

In [69]:
daily_expenses

array([[240.96193, 236.17885, 234.8663 ],
       [242.8327 , 237.788  , 236.63513],
       [242.82092, 238.23582, 237.01141],
       [241.92282, 237.9731 , 236.61967],
       [240.44524, 237.17888, 235.68271],
       [257.52563, 247.2765 , 248.33508],
       [265.4116 , 253.35785, 255.34375],
       [267.01282, 255.85667, 257.83545],
       [267.29526, 257.10712, 259.06323],
       [264.51462, 256.13367, 257.80264],
       [262.2694 , 254.98935, 256.60733],
       [258.4059 , 252.63959, 254.05357],
       [257.11545, 251.55542, 253.1687 ],
       [254.12115, 249.52213, 251.03955],
       [250.38853, 246.96274, 248.23195],
       [247.74956, 244.93825, 246.07341],
       [245.20152, 242.98914, 243.93372],
       [243.75539, 241.71698, 242.55417],
       [241.63885, 240.14633, 240.7026 ],
       [240.12012, 238.93079, 239.23958],
       [239.92497, 238.53995, 238.74579],
       [238.80798, 237.72623, 237.64944],
       [238.46701, 237.36523, 237.10002],
       [238.03806, 236.99817, 236.