# Imports

In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data

In [2]:
# Read the processed training split and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/processed/train.csv')
df.head(n=5)

Unnamed: 0,Description,Category,Amount,Currency
0,Dk-Ind29.03,Sale of Goods,1210.9,DKK
1,B20230618X002HJX000666628738 0030014291298480 ...,Nonbusiness costs,-6.59,EUR
2,Dk3dsf17.08,Sale of Goods,826.2,DKK
3,Dk-Ind01.06,Sale of Goods,481.95,DKK
4,Dk-Ind09.08,Sale of Goods,796.05,DKK


# Prepare

### 1) Drop duplicates

As decided in 01_eda, we won't drop duplicates because they are most likely distinct transactions.

### 2) Fillna

In [3]:
# Fill value per column: the first mode pandas returns for that column.
fillna_values = {col: df[col].mode()[0] for col in ['Description', 'Amount']}

fillna_values

{'Description': 'Snap, 310-399-3339', 'Amount': 0.0}

In [4]:
# Overwrite the missing entries of each column with its precomputed fill value.
for col, fill_value in fillna_values.items():
    missing = df[col].isna()
    df.loc[missing, col] = fill_value

df

Unnamed: 0,Description,Category,Amount,Currency
0,Dk-Ind29.03,Sale of Goods,1210.90,DKK
1,B20230618X002HJX000666628738 0030014291298480 ...,Nonbusiness costs,-6.59,EUR
2,Dk3dsf17.08,Sale of Goods,826.20,DKK
3,Dk-Ind01.06,Sale of Goods,481.95,DKK
4,Dk-Ind09.08,Sale of Goods,796.05,DKK
...,...,...,...,...
7031,Dk3dsf09.08,Sale of Goods,160.20,DKK
7032,"Snap, 310-399-3339",Marketing and advertising,0.00,DKK
7033,Dk-Ind09.07,Sale of Goods,1031.40,DKK
7034,Dk-Ind05.04,Sale of Goods,1848.48,DKK


### 3) Encode categoricals

In [5]:
class CategoricalEncoder:
    """Integer-encode categorical columns with a mapping learned from a reference frame.

    For every handled column, each distinct value found in the frame given to the
    constructor receives a code ``0..n-1`` in order of first appearance. Values
    that were not present in that frame are encoded as ``-1``.

    Attributes:
        dtypes: column name -> dtype string of the frame seen at construction.
        columns: names of the columns this encoder handles.
        mapping: column name -> ``{original value: integer code}``.
    """

    def __init__(self, df, columns=None):
        """Learn the value -> code mapping from ``df``.

        Args:
            df: frame to learn the categories from.
            columns: columns to handle; when ``None`` every column whose dtype
                string is ``'object'`` is used.
        """
        self.dtypes = df.dtypes.apply(str).to_dict()
        if columns is None:
            self.columns = [k for k in self.dtypes.keys() if self.dtypes[k] == 'object']
        else:
            self.columns = columns
        self.mapping = self.__get_mapping(df, self.columns)

    def __get_mapping(self, df, cols) -> dict:
        """Return ``{column: {value: code}}`` for every column in ``cols``."""
        return {
            c: {value: code for code, value in enumerate(df[c].unique())}
            for c in cols
        }

    def encode(self, df, col) -> None:
        """Replace the values of ``df[col]`` with their integer codes, in place.

        Values without an entry in the learned mapping become ``-1``. The
        resulting column always has dtype ``int64``.

        Raises:
            AssertionError: if ``col`` is not one of ``self.columns``.
        """
        assert col in self.columns, 'Unknown column. Cannot process further'
        # ``map`` yields NaN for anything missing from the mapping; assigning the
        # result back avoids the chained ``replace(..., inplace=True)`` call,
        # which does not modify ``df`` under pandas Copy-on-Write.
        codes = df[col].map(self.mapping[col])
        df[col] = codes.fillna(-1).astype('int64')

    def decode(self, df, col) -> None:
        """Replace integer codes in ``df[col]`` with the original values, in place.

        Codes that do not exist in the learned mapping (including the ``-1``
        produced by ``encode`` for unknown values) become missing values.

        Raises:
            AssertionError: if ``col`` is not one of ``self.columns``.
        """
        assert col in self.columns, 'Unknown column. Cannot process further'
        # Invert over ``items()``: iterating ``values()`` yields bare ints that
        # cannot be unpacked into (key, value) pairs.
        inv_mapping = {v: k for k, v in self.mapping[col].items()}
        df[col] = df[col].map(inv_mapping)

In [6]:
# Learn integer codes for the two categorical columns from the training frame.
enc = CategoricalEncoder(df, columns=['Category', 'Currency'])

In [7]:
# Encode every column the encoder was built for; encode() modifies df in place.
for column_name in enc.columns:
    enc.encode(df, column_name)

df.head()

Unnamed: 0,Description,Category,Amount,Currency
0,Dk-Ind29.03,0,1210.9,0
1,B20230618X002HJX000666628738 0030014291298480 ...,1,-6.59,1
2,Dk3dsf17.08,0,826.2,0
3,Dk-Ind01.06,0,481.95,0
4,Dk-Ind09.08,0,796.05,0


In [8]:
# Scratch check: Category has already been run through enc.encode in an earlier
# cell, so the distinct values collected here are the integer codes.
unique_cats = df['Category'].unique()

{cat: position for position, cat in enumerate(unique_cats)}.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])

### 4) Vectorize the Description column (TF-IDF)

In [9]:
# Fit a TF-IDF vocabulary capped at 50 terms on the descriptions and vectorize them.
tf_idf = TfidfVectorizer(max_features=50)

tf_idf_matr = tf_idf.fit_transform(raw_documents=df['Description'])


In [10]:
# Turn the TF-IDF matrix into a dense frame with one column per vocabulary term.
# Reuse df's index: the later column-wise pd.concat aligns rows by index label,
# so a fresh 0..n-1 index here would misalign rows if df ever had any other index.
df_tf_idf = pd.DataFrame(
    tf_idf_matr.toarray(),
    index=df.index,
    columns=tf_idf.get_feature_names_out(),
)
df_tf_idf

Unnamed: 0,00,0000,03,04,05,06,07,08,10852425,2023,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
0,0.0,0.0,0.900321,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.00000,0.0,0.512906,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.535289,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7031,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7032,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.500067,0.0,0.0,0.0,0.0
7033,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.515157,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7034,0.0,0.0,0.000000,0.88272,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [11]:
# Append the TF-IDF columns to the right of the prepared frame (rows align on index).
df_concat = pd.concat(objs=[df, df_tf_idf], axis='columns')

In [12]:
# Display the combined frame: the prepared columns followed by the TF-IDF columns.
df_concat

Unnamed: 0,Description,Category,Amount,Currency,00,0000,03,04,05,06,...,leverandørservice,me,overførsel,postering,rollzone,snap,to,trading,ua,udland
0,Dk-Ind29.03,0,1210.90,0,0.0,0.0,0.900321,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,B20230618X002HJX000666628738 0030014291298480 ...,1,-6.59,1,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,Dk3dsf17.08,0,826.20,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,Dk-Ind01.06,0,481.95,0,0.0,0.0,0.000000,0.00000,0.0,0.512906,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,Dk-Ind09.08,0,796.05,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7031,Dk3dsf09.08,0,160.20,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7032,"Snap, 310-399-3339",2,0.00,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.500067,0.0,0.0,0.0,0.0
7033,Dk-Ind09.07,0,1031.40,0,0.0,0.0,0.000000,0.00000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
7034,Dk-Ind05.04,0,1848.48,0,0.0,0.0,0.000000,0.88272,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


# Save

In [13]:
# Write the model-ready training frame; the row index is not written.
df_concat.to_csv(path_or_buf='data/ready/train.csv', index=False)

In [14]:
# Persist the fitted TF-IDF vectorizer to disk.
with open('models/tf_idf.pickle', mode='wb') as out_file:
    pickle.dump(tf_idf, out_file)

In [15]:
# Persist the per-column fill values computed from the training data.
with open('models/fillna.pickle', mode='wb') as out_file:
    pickle.dump(fillna_values, out_file)

In [16]:
# Persist the fitted categorical encoder.
# NOTE(review): pickle stores classes by reference, and CategoricalEncoder is
# defined inside this notebook, so whatever loads this file must be able to
# resolve the class under the same module path — confirm how the consumer does it.
with open('models/enc.pickle', mode='wb') as out_file:
    pickle.dump(enc, out_file)