In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [38]:
data = pd.read_csv('./data/training_data.csv')

In [39]:
data['Descripción de Transacción'] = data['Descripción de Transacción'].str.strip()

data['Fecha de Transacción'] = pd.to_datetime(data['Fecha de Transacción'], format='%d/%m/%Y')

In [40]:
def clean_special_characters(text):
    cleaned_text = re.sub(r'[-*,.@]', '', text)
    cleaned_text = re.sub(r'_+$', '', cleaned_text)
    cleaned_text = re.sub(r'_', ' ', cleaned_text)
    return cleaned_text

data['Descripción de Transacción'] = data['Descripción de Transacción'].apply(clean_special_characters)

In [41]:
X = data[['Descripción de Transacción', 'Débito de Transacción', 'Crédito de Transacción']]
y = data['Categoria']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [42]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'Descripción de Transacción'),
        ('num', StandardScaler(), ['Débito de Transacción', 'Crédito de Transacción'])
    ])

model = make_pipeline(
    preprocessor,
    RandomForestClassifier()
)

model.fit(X_train, y_train)

In [43]:
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)

print(f"{model} Accuracy: {accuracy:.4f}")

                                                precision    recall  f1-score   support

                                       alcohol       0.60      0.90      0.72        10
                          comida en el trabajo       0.00      0.00      0.00         1
                                 comida rapida       0.50      0.40      0.44        10
                     comida rapida a domicilio       1.00      1.00      1.00         3
                             compra extranjero       0.33      1.00      0.50         1
                                cuidado propio       1.00      1.00      1.00         1
                               entretenimiento       1.00      1.00      1.00         4
                                    ferreteria       1.00      1.00      1.00         1
                             ingreso intereses       1.00      1.00      1.00        35
                               ingreso trabajo       1.00      1.00      1.00         3
ingresos movimientos bancarios 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
