# Imports

In [4]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Data

In [5]:
# Read the tab-separated payments file without treating any row as a header,
# so the columns receive integer labels.
data = pd.read_csv(
    '/content/payments_augment_training.tsv',
    delimiter='\t',
    header=None,
)

In [6]:
# Give the positional columns readable names. The frame is rebound instead of
# being renamed in place, so the cell does not mutate an object created by an
# earlier cell and stays safe to re-run.
data = data.rename(columns={
    1: 'date',
    2: 'sum',
    3: 'purpose',
    4: 'category',
})

In [8]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,0,date,sum,purpose,category
0,1,07.11.2024,15300.00,За участие в конференции в г. Майкоп по догово...,SERVICE
1,2,07.11.2024,4020000,За оказание услуг по договору №79-02726В от 01...,SERVICE
2,3,07.11.2024,1440-00,Оплата за Порошок стиральный Ariel Color autom...,NON_FOOD_GOODS
3,4,07.11.2024,240000000-00,Возврат денежных средств по договору займа №04...,LOAN
4,5,07.11.2024,1360000.00,"Оплата Дог №452 от 13/03/2021, согл. Сч 0745-2...",NOT_CLASSIFIED


# Train

In [9]:
def _to_fasttext_line(label, text):
    """Build one fastText supervised example: ``__label__<label> <text>``.

    fastText reads one example per line, so any line break inside ``text``
    would split the example in two and leave the tail without a label.
    Line breaks are therefore replaced with spaces before formatting.
    """
    single_line = str(text).replace("\r", " ").replace("\n", " ")
    return f"__label__{label} {single_line}"


formatted_data = [
    _to_fasttext_line(label, text)
    for label, text in zip(data['category'], data['purpose'])
]

In [10]:
# Split the fastText lines and the source frame in a single call so both are
# partitioned by the same shuffled indices (70% train / 30% test, fixed seed).
train_data, test_data, train_df, test_df = train_test_split(
    formatted_data,
    data,
    test_size=0.3,
    random_state=42,
)

In [11]:
train_file = "train_split.txt"
test_file = "test_split.txt"

# Write each split as one example per line.
# The encoding is pinned to UTF-8 so the Cyrillic purposes are written the
# same way on every platform instead of depending on the locale default.
for path, lines in ((train_file, train_data), (test_file, test_data)):
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

In [12]:
# Hyperparameters for the supervised fastText classifier.
hyperparams = {
    "epoch": 2000,    # passes over the training file
                      # NOTE(review): very high epoch count; confirm it is intentional
    "lr": 0.2,        # learning rate
    "wordNgrams": 3,  # use word n-grams up to length 3
    "verbose": 2,     # verbosity level
    "minCount": 1,    # keep words that appear at least once
}

model = fasttext.train_supervised(input=train_file, **hyperparams)

In [13]:
def evaluate_model(model, test_file):
    """Print the evaluation summary that ``model.test`` produces for ``test_file``.

    Args:
        model: Object exposing ``test(path)`` that returns a sequence whose
            first three items are printed as the sample count, precision
            and recall.
        test_file: Path handed to ``model.test``.

    Returns:
        None. The three metrics are written to stdout, one per line.
    """
    n_samples, precision, recall = model.test(test_file)[:3]
    summary = (
        f"Number of samples: {n_samples}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
    )
    print("\n".join(summary))

In [33]:
# Print sample count, precision and recall for the model on the test split file.
evaluate_model(model, test_file)

# Sample inputs for ad-hoc prediction.
# NOTE(review): these are English review-style sentences, while the purposes in
# the data preview are Russian payment descriptions, and no cell visible here
# passes `new_texts` to the model. Looks like leftover template code; confirm
# before relying on it.
new_texts = [
    "The product quality was excellent!",
    "I didn't enjoy the experience.",
    "Fantastic service and amazing food.",
    "The storyline was weak and predictable."
]

In [15]:
LABEL_PREFIX = "__label__"

# fastText's predict() rejects strings that contain a newline, so line breaks
# are replaced with spaces before the texts are passed to the model.
texts = [
    str(text).replace("\r", " ").replace("\n", " ")
    for text in test_df['purpose']
]

# predict() accepts a list of texts and returns (labels, probabilities);
# `labels` holds one list of top-k labels per input text (k defaults to 1).
labels, _ = model.predict(texts)

# Keep the top label of each text and strip the fastText label prefix.
preds = [top_labels[0][len(LABEL_PREFIX):] for top_labels in labels]

# assign() returns a new frame, which avoids writing a column into the object
# returned by train_test_split (that can raise SettingWithCopyWarning in pandas
# versions without copy-on-write).
test_df = test_df.assign(pred=preds)

In [19]:
# Overall accuracy of the model's predictions against the true categories.
# This cell must not assign test_df['category'] to test_df['pred']: doing so
# replaces the predictions with the ground truth and makes every downstream
# metric trivially perfect.
accuracy_score(test_df['category'], test_df['pred'])

# Evaluate

In [44]:
# Per-class precision, recall and F1 of the predictions versus the true categories.
report = classification_report(
    y_true=test_df['category'],
    y_pred=test_df['pred'],
)
print(report)

                precision    recall  f1-score   support

  BANK_SERVICE       0.99      1.00      1.00       160
    FOOD_GOODS       0.99      1.00      1.00       220
       LEASING       0.99      1.00      1.00       130
          LOAN       1.00      0.96      0.98       140
NON_FOOD_GOODS       1.00      1.00      1.00       310
NOT_CLASSIFIED       1.00      1.00      1.00        40
   REALE_STATE       0.99      1.00      0.99        80
       SERVICE       1.00      1.00      1.00       260
           TAX       1.00      1.00      1.00       160

      accuracy                           1.00      1500
     macro avg       1.00      1.00      1.00      1500
  weighted avg       1.00      1.00      1.00      1500

