In [10]:
import pandas as pd
import numpy as np

In [21]:
# pull down data from the Expense Tagging repo
df = pd.read_csv("https://github.com/EZ-Walk/Expense-Tagging/blob/f6b58fbcc1270d6bcdab82873673ee34c6d1e84a/Data/raw/AccountHistory%20(1).csv?raw=true")
print(df.shape)
df.head()

(3509, 8)


Unnamed: 0,Account Number,Post Date,Check,Description,Debit,Credit,Status,Balance
0,443091309,4/27/2024,,"GITHUB, INC. +18774484820 CAUS",10.0,,Pending,
1,443091309,4/25/2024,,LIME*RIDE +18885463345 CAUS,4.44,,Pending,
2,443091309,4/27/2024,,Deposit weekly allowance,,120.0,Posted,116.43
3,443091309,4/27/2024,,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...,2.7,,Posted,-3.57
4,443091309,4/27/2024,,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,19.24,,Posted,-0.87


In [22]:
# keep only rows with Debit!=NaN, Status=Posted, and Account Number == 4430913
df = df[df['Debit'].notnull()]
df = df[df['Status'] == 'Posted']
df = df[df['Account Number'] == 443091309]
print('Data shape:', df.shape)
df.head()

Data shape: (2726, 8)


Unnamed: 0,Account Number,Post Date,Check,Description,Debit,Credit,Status,Balance
3,443091309,4/27/2024,,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...,2.7,,Posted,-3.57
4,443091309,4/27/2024,,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,19.24,,Posted,-0.87
5,443091309,4/27/2024,,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,2.75,,Posted,18.37
6,443091309,4/26/2024,,Point Of Sale Withdrawal TST* FAMOUS ORI 713 E...,10.8,,Posted,21.12
7,443091309,4/26/2024,,External Withdrawal PAYPAL INSTANT TRANSFER - ...,60.0,,Posted,31.92


In [26]:
# keep only the desired columns
data = df[['Post Date', 'Debit']]
print('Data shape:', data.shape)
data.head()

Data shape: (2726, 2)


Unnamed: 0,Post Date,Debit
3,4/27/2024,2.7
4,4/27/2024,19.24
5,4/27/2024,2.75
6,4/26/2024,10.8
7,4/26/2024,60.0


In [28]:
# convert the 'Post Date' column to datetime
data['Post Date'] = data['Post Date'].astype('datetime64')
data.rename(columns={'Post Date': 'Date'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Post Date'] = data['Post Date'].astype('datetime64')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [33]:
# fill in dates without expenses as 0
data = data.set_index('Date').resample('D').sum().fillna(0)

In [36]:
# Print the shape one last time and the date range encomapssed by the data as a Month, Day, Year
print('Data shape:', data.shape)
date_range = data.index[[0, -1]]
print('Date range:', date_range[0].strftime('%B %d, %Y'), 'to', date_range[1].strftime('%B %d, %Y'))

Data shape: (2183, 1)
Date range: May 07, 2018 to April 27, 2024


In [37]:
data.to_csv('data/transactions.csv', index=False)


# Pre-processing

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import numpy as np

In [38]:
df = pd.read_csv('data/transactions.csv')
print(df.shape)
df.head()

(2183, 1)


Unnamed: 0,Debit
0,10.0
1,9.47
2,0.0
3,0.0
4,0.0


In [40]:
# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
df['Debit'] = scaler.fit_transform(df['Debit'].values.reshape(-1,1))
df.head()

Unnamed: 0,Debit
0,0.000294
1,0.000278
2,0.0
3,0.0
4,0.0


In [41]:
# Transform the data into sequences
sequence_length = 30
result = []
for index in range(len(df) - sequence_length):
    result.append(df['Debit'].values[index: index + sequence_length])

result = np.array(result)

In [42]:
# Split the data into training and testing sets
train_size = round(0.9 * result.shape[0])  # 90% for training
x_train = result[:int(train_size), :-1]
y_train = result[:int(train_size), -1]
x_test = result[int(train_size):, :-1]
y_test = result[int(train_size):, -1]

In [46]:
train_data = pd.DataFrame({'x': list(x_train), 'y': list(y_train)})
train_data.to_csv('data/train.csv', index=False)
test_data = pd.DataFrame({'x': list(x_test), 'y': list(y_test)})
test_data.to_csv('data/test.csv', index=False)