In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [12]:
data = pd.read_parquet("../data/df_cleaned.parquet")
data

Unnamed: 0,transaction_date,amount,memo,new_category,Cleaned Text
8,2018-03-06,15.25,POS CASINO BAR @ SPOTL - MEMO=PURCHASE 03/02 C...,Entertainment,pos casino bar spotl memopurchase 0302 coache...
39,2018-09-29,16.04,BEST BUY GRAND REGENCY BRANDON F,General Merchandise,best buy grand regency brandon f
45,2018-10-17,10.36,CORNER STORE ARLINGTON TX 10/17 Purchase $5.3...,Automotive,corner store arlington tx 1017 purchase $536 ...
52,2017-07-11,4.63,SPEEDWAY IN BEDFORD IN 07/10 DEBIT_CARD,Automotive,speedway bedford 0710 debitcard
55,2018-09-10,223.00,PAYMENT FOR AMZ STORECARD WEB ID: ACH_DEBIT,General Merchandise,payment amz storecard web id achdebit
...,...,...,...,...,...
1999981,2021-02-19,24.24,DBT/WDR CASEYS GEN STORE FENNIMORE WI,Automotive,dbtwdr caseys gen store fennimore wi
1999982,2021-01-19,120.60,POS Debit - DDA KOHLS CLIVE IA #,General Merchandise,pos debit dda kohls clive ia
1999983,2022-02-25,6.47,STARBUCKS STORE MAPLE VALLEY WA 02/24,Food and Beverages,starbucks store maple valley wa 0224
1999991,2021-01-11,44.98,POS Debit - Visa Check Card - APPLE.COM/BILL ...,General Merchandise,pos debit visa check card applecombill ca


# Part 1: TF-IDF Model

##### Koosha's TF-IDF model + Nathan's fine-tuning

In [13]:
# define the feature columns
feature_columns = "Cleaned Text"

# define the target column
target_column = "new_category"

# Create a LabelEncoder object
le = LabelEncoder()

# Fit and transform the target column
y = le.fit_transform(data[target_column])

# create the feature matrix and target vector
X = data[feature_columns]

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
# create the TfidfVectorizer
vectorizer = TfidfVectorizer()

# create the pipeline
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("classifier", LogisticRegression())
])

# fit the model on the training data
pipeline.fit(X_train, y_train)

In [15]:
# test the model on the test data
test_data = "autozone"
predicted_category = pipeline.predict([test_data])

print("Predicted category:", le.inverse_transform(predicted_category))

Predicted category: ['Automotive']


In [16]:
# evaluate the model on the test data
score = pipeline.score(X_test, y_test)
print("Test score:", score)

Test score: 0.8560599582198224


### Parameter Tuning

In [17]:
def trainTestTFIDF(gram='word', ngram_range=(1,1)):
        # create the TfidfVectorizer
    vectorizer = TfidfVectorizer(analyzer=gram, ngram_range=ngram_range)

    # create the pipeline
    pipeline = Pipeline([
        ("vectorizer", vectorizer),
        ("classifier", LogisticRegression())
    ])

    # fit the model on the training data
    pipeline.fit(X_train, y_train)
    
    # evaluate the model on the test data
    score = pipeline.score(X_test, y_test)
    print("Test score:", score)
    return pipeline

In [18]:
# trainTestTFIDF('word')
# trainTestTFIDF('word', ngram_range=(2,2))
# trainTestTFIDF('word', ngram_range=(3,3))
# trainTestTFIDF('char')
# trainTestTFIDF('char', ngram_range=(2,2))
# trainTestTFIDF('char', ngram_range=(3,3))
# trainTestTFIDF('char', ngram_range=(4,4))
best_tfidf = trainTestTFIDF('char', ngram_range=(5,5)) # BEST
# trainTestTFIDF('char', ngram_range=(6,6))
# trainTestTFIDF('char', ngram_range=(7,7))
# trainTestTFIDF('char', ngram_range=(8,8))

tfidf_proba = best_tfidf.predict_proba(X_test)

Test score: 0.8746594671194888


# Part 2: Non-Text Model
##### Kyle's Non-Text Model

In [19]:
data['dt'] = pd.to_datetime(data['transaction_date'])
data = data[["amount", "dt", "new_category"]]
data

Unnamed: 0,amount,dt,new_category
8,15.25,2018-03-06,Entertainment
39,16.04,2018-09-29,General Merchandise
45,10.36,2018-10-17,Automotive
52,4.63,2017-07-11,Automotive
55,223.00,2018-09-10,General Merchandise
...,...,...,...
1999981,24.24,2021-02-19,Automotive
1999982,120.60,2021-01-19,General Merchandise
1999983,6.47,2022-02-25,Food and Beverages
1999991,44.98,2021-01-11,General Merchandise


In [20]:
# feature engineering using dt column: adding year, month, day, etc.
data['year'] = data['dt'].dt.year # ranges 2010 - 2022
data['month'] = data['dt'].dt.month # ranges 1 - 12
data['is_holiday'] = data['month'].apply(lambda x: 1 if x == 11 or x == 12 else 0) # 0 or 1
data['day'] = data['dt'].dt.dayofweek # to be discarded
data['is_weekend'] = data['day'].apply(lambda x: 1 if x == 5 or x == 6 else 0) # 0 or 1

# feature engineering using amount: encoding whole numbers
data['is_whole_number'] = data['amount'].apply(lambda x: 1 if str(x).split(".")[1] == "0" else 0)
data

# X y split
X = data[["amount", "is_whole_number", "year", "month", "day", "is_holiday", "is_weekend"]]
y = data[["new_category"]]

# standard scale: amount
scaler = StandardScaler()
scaler_df = pd.DataFrame(scaler.fit_transform(X[['amount']]), index = X.index)

# one hot encode: year, month, day
ohe = OneHotEncoder(handle_unknown='ignore')
encoder_df = pd.DataFrame(ohe.fit_transform(X[['year']]).toarray(), columns = pd.Series(X['year'].unique()).sort_values().values, index = X.index)

ohe2 = OneHotEncoder(handle_unknown='ignore')
encoder_df2 = pd.DataFrame(ohe2.fit_transform(X[['month']]).toarray(), columns = pd.Series(X['month'].unique()).sort_values().values, index = X.index)

# monday = 0, sunday = 6
def convert_date_to_day(date):
    if date == 0:
        return "monday"
    if date == 1:
        return "tuesday"
    if date == 2:
        return "wednesday"
    if date == 3:
        return "thursday"
    if date == 4:
        return "friday"
    if date == 5:
        return "saturday"
    if date == 6:
        return "sunday"

X['day_word'] = X['day'].apply(convert_date_to_day)
ohe3 = OneHotEncoder(handle_unknown='ignore')
encoder_df3 = pd.DataFrame(ohe3.fit_transform(X[['day_word']]).toarray(), columns = pd.Series(X['day_word'].unique()).sort_values().values, index = X.index)

# combine features and remove duplicate features
X = pd.concat([X, scaler_df, encoder_df, encoder_df2, encoder_df3], axis=1)
X = X[[0, 'is_whole_number', 'is_holiday', 'is_weekend', 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]]
X.rename({0: "amount (standardized)"}, axis=1, inplace=True)

# train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# train model
lr = LogisticRegression(multi_class = "multinomial").fit(X_train, y_train)
sgd = SGDClassifier(loss="log").fit(X_train, y_train)

# predict and store predictions
lr_preds = lr.predict(X_test)
sgd_preds = sgd.predict(X_test)

X

Unnamed: 0,amount (standardized),is_whole_number,is_holiday,is_weekend,2010,2011,2012,2013,2014,2015,...,10,11,12,monday,tuesday,wednesday,thursday,friday,saturday,sunday
8,-0.012686,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
39,-0.012246,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
45,-0.015409,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
52,-0.018601,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
55,0.103030,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999981,-0.007678,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1999982,0.045994,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1999983,-0.017576,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1999991,0.003874,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
y_test = np.array(y_test['new_category'])
lr_accuracy = (lr_preds == y_test).mean()
sgd_accuracy = (sgd_preds == y_test).mean()
print("Logistic Regression accuracy: {}, \nSGDClassifier accuracy: {}".format(lr_accuracy, sgd_accuracy))


Logistic Regression accuracy: 0.3368007623049953, 
SGDClassifier accuracy: 0.33819954310565986


# Part 3: Composite Model
##### Kyle using Logistic Regression to combine the probabilities from each class from both models

In [33]:
tfidf_proba

array([[9.57847627e-01, 9.74671946e-03, 1.55101825e-02, ...,
        8.12890111e-04, 4.46890408e-04, 5.75781960e-03],
       [4.14820158e-03, 5.43304648e-03, 2.93838118e-02, ...,
        2.29459494e-03, 6.09062113e-04, 9.46416586e-01],
       [1.06142951e-03, 1.05985341e-03, 9.92737279e-01, ...,
        5.80393520e-04, 1.34653817e-04, 6.52147583e-04],
       ...,
       [9.28650504e-03, 4.26548105e-03, 9.67001563e-01, ...,
        7.38354105e-04, 4.49596467e-04, 3.43314236e-03],
       [9.86281380e-01, 8.81278471e-04, 1.63881433e-03, ...,
        6.88686207e-04, 4.51327206e-04, 1.94686289e-03],
       [7.71357777e-02, 5.85802094e-02, 3.23670034e-01, ...,
        2.09363968e-02, 9.74992757e-03, 2.40521711e-01]])

In [None]:
nontext_proba = lr.predict_proba(X_test)
# final_model = LogisticRegression().fit()
for weight in range(0,20): # learn weight
    weighted_average = nontext_proba + weight*tfidf_proba
    final_df = pd.DataFrame(weighted_average)
# todo: do a sanity check on this model. look at a specific transaction and trace it
    final_model = LogisticRegression().fit(final_df, y_test)
    
    preds = final_model.predict(final_df)
    print(weight, np.mean(preds == y_test))

In [32]:
weighted_average

array([[1.83233299e+01, 2.48707107e-01, 6.28060491e-01, ...,
        2.59940246e-02, 1.31098017e-02, 1.68582798e-01],
       [1.99033152e-01, 1.58272487e-01, 8.93864295e-01, ...,
        6.38714140e-02, 1.76827721e-02, 1.80166949e+01],
       [1.47366471e-01, 9.98012138e-02, 1.91974203e+01, ...,
        1.99214296e-02, 6.98435138e-03, 6.83711139e-02],
       ...,
       [3.14367977e-01, 1.48900939e-01, 1.87165137e+01, ...,
        2.20580534e-02, 1.28721928e-02, 1.22867169e-01],
       [1.88704750e+01, 8.22597323e-02, 3.73098079e-01, ...,
        2.00768545e-02, 1.44781521e-02, 1.00545578e-01],
       [1.59549694e+00, 1.17185875e+00, 6.49828548e+00, ...,
        4.04490917e-01, 1.90392223e-01, 4.62794176e+00]])

In [23]:
nontext_proba = lr.predict_proba(X_test)
weight = 5
weighted_average = nontext_proba + weight*tfidf_proba
final_df = pd.DataFrame(weighted_average)
final_df
pred = final_df.idxmax(axis=1)
print(np.mean(pred == y_test))
pred, y_test

0.0


(0         0
 1         7
 2         2
 3         2
 4         2
          ..
 163709    3
 163710    2
 163711    2
 163712    0
 163713    2
 Length: 163714, dtype: int64,
 array(['Automotive', 'Travel', 'Food and Beverages', ...,
        'Food and Beverages', 'Automotive', 'Travel'], dtype=object))

In [24]:
pred.value_counts()

2    58351
3    47141
0    20055
4    19568
1     8935
7     7448
5     1629
6      587
dtype: int64