In [1]:
import numpy as np
import pandas as pd
# from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the transaction records. Column labels are supplied explicitly through
# `names=`, so pandas treats every line of the file (including the first) as data.
data = pd.read_csv("data_test1.csv", names=["date", "cost", "description", "category"])


In [3]:
# Plain-text dump of the loaded frame, used as a quick check that the four
# columns (date, cost, description, category) were parsed as expected.
print(data)

           date     cost                               description  \
0    2014-08-11   184.22                               Ski tickets   
1    2011-09-21  4034.86                               Edfinancial   
2    2010-05-27   283.24                                  GME YOLO   
3    2019-02-21  9012.58                                  Pet food   
4    2015-01-09   850.34                                  GME YOLO   
..          ...      ...                                       ...   
245  2012-01-20  5812.81    Deposit in savings at Bank of the West   
246  2018-01-10  3449.14  Blue Cross Bleu Shield of North Carolina   
247  2017-07-22  5059.13                               Capital One   
248  2010-05-01  1023.06                                ExxonMobil   
249  2015-10-17   233.35                                  Vacation   

               category  
0        Non-Essentials  
1             Education  
2                 Other  
3             Education  
4                 Other  
.. 

In [4]:
# Two-way encoding between category labels and integer indices.
# Indices follow the order in which categories first appear in `data`.
# i_to_cat_enc: index -> category label
i_to_cat_enc = dict(enumerate(data["category"].unique()))
# cat_to_i_enc: category label -> index (exact inverse of the mapping above)
cat_to_i_enc = {label: position for position, label in i_to_cat_enc.items()}

In [5]:
# Load the word list used as a stop-list when tokenizing descriptions.
# The single column is labelled "words"; every line of the file is read as data.
prepositions = pd.read_csv("prepositions.csv", names=["words"])

In [6]:
# Tokenize the descriptions and remove any prepositions.
#
# tokens:           row position -> whitespace-split words of the description
# processed_tokens: row position -> the same words lower-cased, with every word
#                   that appears in the `prepositions` list dropped
#
# The stop-list is materialised once as a set so each membership test is a hash
# lookup rather than a scan over the whole `prepositions.values` array.
# NOTE(review): words are lower-cased before the lookup, so entries in
# prepositions.csv are presumably lower-case already — confirm.
preposition_set = set(prepositions["words"])
tokens = {index: item.split() for index, item in enumerate(data["description"])}
processed_tokens = {
    index: [
        word
        for word in (raw_word.lower() for raw_word in words)
        if word not in preposition_set
    ]
    for index, words in tokens.items()
}

In [7]:
# Count the occurrences of each unigram per category.
# Result shape: {category: {word: number of times the word appears in
# descriptions of rows labelled with that category}}.
unigram_value_counts = {cat: {} for cat in data["category"].unique()}
for index, tok_list in processed_tokens.items():
    cat = data.at[index, 'category']
    for word in tok_list:
        counts = unigram_value_counts[cat]
        # Missing words start from zero.
        counts[word] = counts.get(word, 0) + 1

In [8]:
# Count the occurrences of each bigram per category.
# Result shape: {category: {(first_word, second_word): count}}.
#
# Tokens are paired off without overlap: (t0, t1), (t2, t3), ...; a trailing
# unpaired token is ignored.
# NOTE(review): pairs that start at odd positions, e.g. (t1, t2), are never
# counted. The feature-extraction cell pairs tokens the same way, so the two
# must be changed together if sliding-window bigrams are intended.
bigram_value_counts = {cat: {} for cat in data["category"].unique()}
for index, tok_list in processed_tokens.items():
    cat = data.at[index, 'category']
    for bigram in zip(tok_list[0::2], tok_list[1::2]):
        counts = bigram_value_counts[cat]
        counts[bigram] = counts.get(bigram, 0) + 1

In [9]:
# Create the lexica from unigrams and bigrams over minimum counts.
# Both thresholds are exclusive: an n-gram is kept only when its count is
# strictly greater than the threshold.
unigram_min = 5
bigram_min = 3

# lexica: category -> set of tuples. Unigrams are stored as 1-tuples so they
# share one set with the 2-tuple bigrams.
lexica = {cat: set() for cat in data["category"].unique()}
for cat, counts in unigram_value_counts.items():
    lexica[cat].update(
        (gram,) for gram, count in counts.items() if count > unigram_min
    )
for cat, counts in bigram_value_counts.items():
    lexica[cat].update(
        gram for gram, count in counts.items() if count > bigram_min
    )

In [25]:
# Extract feature from lexica.
# lex_feature[row, k] = number of the row's n-grams found in the lexicon of the
# category whose index is k (per cat_to_i_enc). Stored as int8.
lex_feature = np.zeros((len(data), len(cat_to_i_enc)), dtype=np.int8)
for i, words in processed_tokens.items():
    unigrams = {(word,) for word in words}
    # Non-overlapping pairs (w0, w1), (w2, w3), ... — the same pairing used
    # when the bigram counts were built.
    bigrams = set(zip(words[0::2], words[1::2]))
    row_grams = unigrams | bigrams
    for cat, gram_set in lexica.items():
        # Count common items by the size of the intersection.
        lex_feature[i, cat_to_i_enc[cat]] += len(gram_set & row_grams)

In [27]:
# Wrap the feature matrix in a frame with one column per category. Column order
# follows cat_to_i_enc's key order, which matches the column indices used when
# the matrix was filled.
lex_frame = pd.DataFrame(lex_feature, columns=list(cat_to_i_enc))
# Index-aligned join: original columns first, then the per-category features.
output_frame = data.join(lex_frame)

In [28]:
# Display the transactions joined with their per-category lexicon features.
# (The bare name `output_` is never defined and raises NameError on a fresh
# kernel; the frame built in the previous cell is `output_frame`.)
output_frame

           date     cost                               description  \
0    2014-08-11   184.22                               Ski tickets   
1    2011-09-21  4034.86                               Edfinancial   
2    2010-05-27   283.24                                  GME YOLO   
3    2019-02-21  9012.58                                  Pet food   
4    2015-01-09   850.34                                  GME YOLO   
..          ...      ...                                       ...   
245  2012-01-20  5812.81    Deposit in savings at Bank of the West   
246  2018-01-10  3449.14  Blue Cross Bleu Shield of North Carolina   
247  2017-07-22  5059.13                               Capital One   
248  2010-05-01  1023.06                                ExxonMobil   
249  2015-10-17   233.35                                  Vacation   

               category  Non-Essentials  Education  Other  Debt  Savings  \
0        Non-Essentials               0          0      0     0        0   
1      