In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from gensim.models import Word2Vec
import numpy as np

In [20]:
df = pd.read_csv('data/clean/transactions.csv')
print(df.shape)
df.head()

(2726, 3)


Unnamed: 0,Date,Description,Amount
0,2024-04-27,Point Of Sale Withdrawal MASABI_RTD 1600 Blake...,2.7
1,2024-04-27,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,19.24
2,2024-04-27,Point Of Sale Withdrawal SQ *CURTIS PARK 2532 ...,2.75
3,2024-04-26,Point Of Sale Withdrawal TST* FAMOUS ORI 713 E...,10.8
4,2024-04-26,External Withdrawal PAYPAL INSTANT TRANSFER - ...,60.0


# Vectorizing Text Features (Vendor Descriptions):
Vendor descriptions are typically short text snippets. There are a few ways to create features from them:
A. Bag-of-Words (BoW) Representation:
    1. Tokenize the vendor descriptions into individual words (or n-grams).
    2. Create a vocabulary of unique words (or n-grams) across all descriptions.
    3. Represent each description as a vector where each element corresponds to the count or presence of a word in the vocabulary.

B. TF-IDF (Term Frequency-Inverse Document Frequency):
    1. Calculate the term frequency (TF) for each word in each description.
    2. Compute the inverse document frequency (IDF) for each word across all descriptions.
    3. Multiply the TF by the IDF to get the final TF-IDF representation.

C. Word Embeddings (Advanced):
    1. Use pre-trained language models (e.g., Word2Vec, GloVe, BERT) to generate dense vector representations for each description[1].

In [7]:
# Bag-of-Words Representation
vectorizer = CountVectorizer(ngram_range=(1, 1))
X_bow = vectorizer.fit_transform(df['Description'])
X_bow = X_bow.toarray()
X_bow.shape


(2726, 2864)

In [29]:
# TF-IDF
transformer = TfidfTransformer(smooth_idf=False)
X_tfidf = transformer.fit_transform(X_bow)
X_tfidf = X_tfidf.toarray()
X_tfidf.shape

(2726, 2864)

In [30]:
# Word Embeddings
sentences = [desc.split() for desc in df['Description']]
model = Word2Vec(sentences, min_count=1)
X_w2v = np.array([model.wv[desc] for desc in sentences], dtype=object)
# print(X_w2v[0][0])
X_w2v.shape

(2726,)

In [38]:
# Date Features
df['Date'] = pd.to_datetime(df['Date'])
df['Day of Week'] = df['Date'].dt.dayofweek
df['Month'] = df['Date'].dt.month

# Encode day of week and month as numerical values
le = LabelEncoder()
df['Day of Week'] = le.fit_transform(df['Day of Week'])
df['Month'] = le.fit_transform(df['Month'])
print(df.shape)
df[['Day of Week', 'Month']].head()

(2726, 6)


Unnamed: 0,Day of Week,Month
0,5,3
1,5,3
2,5,3
3,4,3
4,4,3


In [31]:
# Amount Features
scaler = StandardScaler()
df['Normalized Amount'] = scaler.fit_transform(df[['Amount']])
print(df['Amount'].shape)
df['Normalized Amount'].head()

(2726,)


0   -0.131137
1   -0.120165
2   -0.131103
3   -0.125764
4   -0.093128
Name: Normalized Amount, dtype: float64

In [42]:
# Combined Feature Vector
# Here we're using the TF-IDF features as an example, but you could replace this with BoW or Word2Vec features
# Convert X_tfidf to a DataFrame
df_tfidf = pd.DataFrame(list(X_tfidf))

# Create a new DataFrame with the other features
df_other_features = df[['Day of Week', 'Month', 'Normalized Amount']]

# Reset the index of both DataFrames to ensure they align correctly
df_tfidf.reset_index(drop=True, inplace=True)
df_other_features.reset_index(drop=True, inplace=True)

# Concatenate the DataFrames along the columns axis
features = pd.concat([df_tfidf, df_other_features], axis=1)

In [44]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2857,2858,2859,2860,2861,2862,2863,Day of Week,Month,Normalized Amount
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,3,-0.131137
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,3,-0.120165
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,3,-0.131103
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,3,-0.125764
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,3,-0.093128
