# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
class CategoricalEncoder:
    """Integer label encoder for categorical DataFrame columns.

    At construction time every distinct value of each selected column is
    assigned an integer code (0, 1, 2, ... in order of first appearance).
    ``encode`` and ``decode`` then translate a single column of another
    DataFrame in place.

    Attributes:
        dtypes: ``{column name: dtype string}`` of the DataFrame the encoder
            was built from.
        columns: names of the columns this encoder can translate.
        mapping: ``{column name: {category value: integer code}}``.
    """

    def __init__(self, df, columns=None):
        """Build the category -> code tables from ``df``.

        Args:
            df: DataFrame whose values define the known categories.
            columns: columns to encode. When ``None``, every column whose
                dtype string is ``'object'`` is used.
        """
        self.dtypes = df.dtypes.apply(str).to_dict()
        if columns is None:
            self.columns = [name for name, dtype in self.dtypes.items() if dtype == 'object']
        else:
            self.columns = columns
        self.mapping = self.__get_mapping(df, self.columns)

    def __get_mapping(self, df, cols) -> dict:
        """Return ``{column: {unique value: code}}`` for every column in ``cols``."""
        mapping = {}
        for c in cols:
            # Codes follow the order in which values first appear in the column.
            mapping[c] = {cat: code for code, cat in enumerate(df[c].unique())}
        return mapping

    def encode(self, df, col) -> None:
        """Replace the values of ``df[col]`` with their integer codes, in place.

        Values that were not seen when the encoder was built become ``-1``.

        Raises:
            AssertionError: if ``col`` is not one of ``self.columns``.
        """
        assert col in self.columns, 'Unknown column. Cannot process further'
        # Series.map yields NaN for values missing from the dict; those are
        # the unknown categories. Assigning the whole column back (instead of
        # a chained ``df[col].replace(..., inplace=True)``) guarantees that the
        # caller's DataFrame is really updated.
        codes = df[col].map(self.mapping[col])
        df[col] = codes.fillna(-1).astype('int64')

    def decode(self, df, col) -> None:
        """Replace integer codes in ``df[col]`` with the original values, in place.

        Codes that do not exist in the mapping (for example the ``-1`` written
        by ``encode``) become missing values.

        Raises:
            AssertionError: if ``col`` is not one of ``self.columns``.
        """
        assert col in self.columns, 'Unknown column. Cannot process further'
        # Invert category -> code; ``.items()`` is required to get the pairs.
        inv_mapping = {code: cat for cat, code in self.mapping[col].items()}
        df[col] = df[col].map(inv_mapping)

# Load data

In [3]:
# Read the prepared training table from disk.
df = pd.read_csv('data/ready/train.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Description,Category,Amount,Currency,00,0000,03,04,05,06,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
0,Dk-Ind29.03,0,1210.90,0,0.0,0.0,0.900321,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,B20230618X002HJX000666628738 0030014291298480 ...,1,-6.59,1,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,Dk3dsf17.08,0,826.20,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,Dk-Ind01.06,0,481.95,0,0.0,0.0,0.000000,0.00000,0.0,0.512906,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,Dk-Ind09.08,0,796.05,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7031,Dk3dsf09.08,0,160.20,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7032,"Snap, 310-399-3339",2,0.00,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.500067,0.0,0.0,0.0,0.0
7033,Dk-Ind09.07,0,1031.40,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7034,Dk-Ind05.04,0,1848.48,0,0.0,0.0,0.000000,0.88272,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [4]:
# Remove the 'Description' column before modelling. Rebinding `df` (rather
# than dropping in place) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['Description'], errors='ignore')

# Train-val split

In [5]:
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
df_train, df_val = train_test_split(df, test_size=0.1, random_state=10)

# Show the shapes and the first rows of both partitions.
display(df_train.shape, df_val.shape, df_train.head(), df_val.head())

(6332, 53)

(704, 53)

Unnamed: 0,Category,Amount,Currency,00,0000,03,04,05,06,07,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
2392,0,479.5,0,0.0,0.0,0.612135,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4304,0,1508.25,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2387,0,153.55,0,0.0,0.0,0.0,0.0,0.491421,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2416,0,274.45,0,0.0,0.0,0.0,0.0,0.0,0.531295,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
890,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Category,Amount,Currency,00,0000,03,04,05,06,07,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
3537,0,175.7,0,0.0,0.0,0.0,0.0,0.509585,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5647,3,-1332.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6844,0,104.45,0,0.0,0.0,0.0,0.88272,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6233,2,-6000.0,0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5147,0,5631.89,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train and compare models

In [6]:
# Preview the training frame without the 'Category' column. The result is only
# displayed; df_train itself is not modified.
df_train.drop(columns='Category')

Unnamed: 0,Amount,Currency,00,0000,03,04,05,06,07,08,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
2392,479.50,0,0.0,0.0,0.612135,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4304,1508.25,0,0.0,0.0,0.000000,0.000000,0.000000,1.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2387,153.55,0,0.0,0.0,0.000000,0.000000,0.491421,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2416,274.45,0,0.0,0.0,0.000000,0.000000,0.000000,0.531295,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
890,0.00,0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,567.91,0,0.0,0.0,0.000000,0.000000,0.000000,0.865342,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3441,-60.00,0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.502298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1344,322.15,0,0.0,0.0,0.000000,0.550232,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4623,1058.30,0,0.0,0.0,0.900321,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Random forest with the library's default tree settings.
rf = RandomForestClassifier(n_jobs=-1, random_state=10)

# Separate the feature columns from the 'Category' target once per partition.
X_train, y_train = df_train.drop(columns='Category'), df_train['Category']
X_val, y_val = df_val.drop(columns='Category'), df_val['Category']

rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

print('Train accuracy score: ', accuracy_score(y_train, y_train_pred))
print('Val accuracy score: ', accuracy_score(y_val, y_val_pred))

Train accuracy score:  0.9731522425773848
Val accuracy score:  0.9105113636363636


In [8]:
# Same forest, but every leaf must contain at least 3 samples.
rf = RandomForestClassifier(n_jobs=-1, random_state=10, min_samples_leaf=3)

# Separate the feature columns from the 'Category' target once per partition.
X_train, y_train = df_train.drop(columns='Category'), df_train['Category']
X_val, y_val = df_val.drop(columns='Category'), df_val['Category']

rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

print('Train accuracy score: ', accuracy_score(y_train, y_train_pred))
print('Val accuracy score: ', accuracy_score(y_val, y_val_pred))

Train accuracy score:  0.9019267214150347
Val accuracy score:  0.9019886363636364


In [9]:
# Same forest, but every leaf must contain at least 10 samples.
rf = RandomForestClassifier(n_jobs=-1, random_state=10, min_samples_leaf=10)

# Separate the feature columns from the 'Category' target once per partition.
X_train, y_train = df_train.drop(columns='Category'), df_train['Category']
X_val, y_val = df_val.drop(columns='Category'), df_val['Category']

rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

print('Train accuracy score: ', accuracy_score(y_train, y_train_pred))
print('Val accuracy score: ', accuracy_score(y_val, y_val_pred))

Train accuracy score:  0.8771320277953253
Val accuracy score:  0.8849431818181818


In [10]:
# Leaf size back at 1; each split considers half of the features.
rf = RandomForestClassifier(n_jobs=-1, random_state=10, min_samples_leaf=1, max_features=0.5)

# Separate the feature columns from the 'Category' target once per partition.
X_train, y_train = df_train.drop(columns='Category'), df_train['Category']
X_val, y_val = df_val.drop(columns='Category'), df_val['Category']

rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

print('Train accuracy score: ', accuracy_score(y_train, y_train_pred))
print('Val accuracy score: ', accuracy_score(y_val, y_val_pred))

Train accuracy score:  0.9731522425773848
Val accuracy score:  0.9105113636363636


In [11]:
# Leaf size 1; each split considers a quarter of the features.
rf = RandomForestClassifier(n_jobs=-1, random_state=10, min_samples_leaf=1, max_features=0.25)

# Separate the feature columns from the 'Category' target once per partition.
X_train, y_train = df_train.drop(columns='Category'), df_train['Category']
X_val, y_val = df_val.drop(columns='Category'), df_val['Category']

rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

print('Train accuracy score: ', accuracy_score(y_train, y_train_pred))
print('Val accuracy score: ', accuracy_score(y_val, y_val_pred))

Train accuracy score:  0.9731522425773848
Val accuracy score:  0.9090909090909091


In [12]:
# Leaf size 1; each split considers three quarters of the features.
# When the notebook is run top to bottom, this is the `rf` that the cells
# below this one operate on.
rf = RandomForestClassifier(n_jobs=-1, random_state=10, min_samples_leaf=1, max_features=0.75)

# Separate the feature columns from the 'Category' target once per partition.
X_train, y_train = df_train.drop(columns='Category'), df_train['Category']
X_val, y_val = df_val.drop(columns='Category'), df_val['Category']

rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)

print('Train accuracy score: ', accuracy_score(y_train, y_train_pred))
print('Val accuracy score: ', accuracy_score(y_val, y_val_pred))

Train accuracy score:  0.9731522425773848
Val accuracy score:  0.9105113636363636


In [13]:
# Pair every input feature name with its importance in the fitted forest.
{
    feature: importance
    for feature, importance in zip(rf.feature_names_in_, rf.feature_importances_)
}

{'Amount': 0.5795634041701546,
 'Currency': 0.020689432909558658,
 '00': 0.004321772398647373,
 '0000': 0.007429779171109463,
 '03': 0.0005696002609996017,
 '04': 0.00022720333208148862,
 '05': 0.000581856770651869,
 '06': 0.0001272100821765208,
 '07': 0.0007815418722910033,
 '08': 4.770266575606434e-07,
 '10852425': 0.00010359623386848858,
 '2023': 0.0033677971572328006,
 '253': 0.007929405890801046,
 '310': 0.01972489911262382,
 '3339': 0.026633498774816994,
 '399': 0.013754457506116547,
 '650': 0.007965429165660315,
 'ads': 0.03197748590138633,
 'ads4649893433': 0.014271102308857852,
 'advis': 0.01592776895547815,
 'aps': 0.007419337558377617,
 'company': 0.008414896153300747,
 'dk': 0.05150077010771369,
 'eur': 0.008761613204735789,
 'facebk': 0.0006214662039764023,
 'fb': 0.003143206932535788,
 'guangzhou': 0.009151069351076842,
 'ind01': 0.0,
 'ind03': 5.19403052862786e-06,
 'ind06': 0.0,
 'ind09': 0.0,
 'ind10': 0.0,
 'ind11': 0.0,
 'ind12': 0.0,
 'ind14': 0.0,
 'ind15': 0.0,
 '

# Evaluate on the test set

In [14]:
# Read the test table from disk. As the rendered output shows, it still holds
# the text 'Description' and string-valued 'Category' / 'Currency' columns.
df_test = pd.read_csv('data/processed/test.csv')
# Bare last expression so the notebook renders the frame.
df_test

Unnamed: 0,Description,Category,Amount,Currency
0,"FACEBK NN4B7FPTK2, fb.me/ads",Marketing and advertising,0.00,DKK
1,Postering,Shipping & Logistics,-105.00,DKK
2,Clearing: CLEARING-20234416133 (period Apr 202...,Sale of Goods,564.45,EUR
3,Dk-Ind03.07,Sale of Goods,648.00,DKK
4,Advertising,Marketing and advertising,0.00,DKK
...,...,...,...,...
777,Dk-Ind10.07,Sale of Goods,433.70,DKK
778,LeverandørService 53821MXNEY APS,Loans/ Debts,0.00,DKK
779,20740193MI 1030000750656248 ROLLZONE*,Inventory / Equipment,-336.26,EUR
780,Dk-Ind13.04,Sale of Goods,490.95,DKK


In [15]:
# Load the pickled categorical encoder.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# NOTE(review): the object's repr is `__main__.CategoricalEncoder`, so the
# class presumably has to be defined in this session before unpickling.
with open('models/enc.pickle', 'rb') as file:  
    enc = pickle.load(file) 

In [16]:
# Load the pickled fill values; later cells index the object by column name
# to fill missing entries.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('models/fillna.pickle', 'rb') as file:  
    fillna = pickle.load(file) 

In [17]:
# Load the pickled TF-IDF transformer; later cells call .transform() and
# .get_feature_names_out() on it.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('models/tf_idf.pickle', 'rb') as file:  
    tf_idf = pickle.load(file) 

In [18]:
# Fill missing entries of each listed column with its stored fill value.
for col, fill_value in fillna.items():
    is_missing = df_test[col].isna()
    df_test.loc[is_missing, col] = fill_value

df_test

Unnamed: 0,Description,Category,Amount,Currency
0,"FACEBK NN4B7FPTK2, fb.me/ads",Marketing and advertising,0.00,DKK
1,Postering,Shipping & Logistics,-105.00,DKK
2,Clearing: CLEARING-20234416133 (period Apr 202...,Sale of Goods,564.45,EUR
3,Dk-Ind03.07,Sale of Goods,648.00,DKK
4,Advertising,Marketing and advertising,0.00,DKK
...,...,...,...,...
777,Dk-Ind10.07,Sale of Goods,433.70,DKK
778,LeverandørService 53821MXNEY APS,Loans/ Debts,0.00,DKK
779,20740193MI 1030000750656248 ROLLZONE*,Inventory / Equipment,-336.26,EUR
780,Dk-Ind13.04,Sale of Goods,490.95,DKK


In [19]:
# Display the loaded encoder object (its default repr shows the class and address).
enc

<__main__.CategoricalEncoder at 0x7fe7a82e7040>

In [20]:
# Encode every column the loaded encoder knows about; encode() updates
# df_test in place.
for col in enc.columns:
    enc.encode(df_test, col)

df_test.head()

Unnamed: 0,Description,Category,Amount,Currency
0,"FACEBK NN4B7FPTK2, fb.me/ads",2,0.0,0
1,Postering,8,-105.0,0
2,Clearing: CLEARING-20234416133 (period Apr 202...,0,564.45,1
3,Dk-Ind03.07,0,648.0,0
4,Advertising,2,0.0,0


In [21]:
# Vectorise the test descriptions with the loaded transformer. Only
# .transform() is called here, so the transformer is not refitted on test data.
tf_idf_matr_test = tf_idf.transform(df_test['Description'])

In [22]:
# Display the matrix summary (shape, dtype, number of stored elements).
tf_idf_matr_test

<782x50 sparse matrix of type '<class 'numpy.float64'>'
	with 1623 stored elements in Compressed Sparse Row format>

In [23]:
# Turn the sparse TF-IDF matrix into a dense DataFrame with one column per term.
# The frame reuses df_test's index so that an index-aligned column-wise concat
# with df_test pairs each row with its own features, even if df_test's index
# is not the default 0..n-1 range.
df_tf_idf_test = pd.DataFrame(
    tf_idf_matr_test.toarray(),
    columns=tf_idf.get_feature_names_out(),
    index=df_test.index,
)
df_tf_idf_test

Unnamed: 0,00,0000,03,04,05,06,07,08,10852425,2023,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.502298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.00000,0.0,0.0,0.537231,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,0.0,0.0,0.000000,0.00000,0.0,0.0,0.528582,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
778,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.717804,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
779,0.0,0.0,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
780,0.0,0.0,0.000000,0.88272,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Put the TF-IDF term columns next to the original test columns.
df_concat_test = pd.concat((df_test, df_tf_idf_test), axis='columns')
df_concat_test

Unnamed: 0,Description,Category,Amount,Currency,00,0000,03,04,05,06,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
0,"FACEBK NN4B7FPTK2, fb.me/ads",2,0.00,0,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.502298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Postering,8,-105.00,0,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Clearing: CLEARING-20234416133 (period Apr 202...,0,564.45,1,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Dk-Ind03.07,0,648.00,0,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Advertising,2,0.00,0,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777,Dk-Ind10.07,0,433.70,0,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
778,LeverandørService 53821MXNEY APS,9,0.00,0,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.717804,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
779,20740193MI 1030000750656248 ROLLZONE*,3,-336.26,1,0.0,0.0,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
780,Dk-Ind13.04,0,490.95,0,0.0,0.0,0.000000,0.88272,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Remove the raw text column now that its TF-IDF features are attached.
# Rebinding the name (rather than dropping in place) together with
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError.
df_concat_test = df_concat_test.drop(columns=['Description'], errors='ignore')

In [26]:
# Predict on the test features (everything except 'Category') and report accuracy.
X_test = df_concat_test.drop(columns='Category')
y_test_pred = rf.predict(X_test)

print('Test accuracy score: ', accuracy_score(df_concat_test['Category'], y_test_pred))

Test accuracy score:  0.9104859335038363


# Save

In [27]:
# Serialise the current `rf` object to disk with pickle.
with open('models/rf.pickle', 'wb') as handle:
    pickle.dump(rf, handle)