In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import dill
import os

In [2]:
file_path = "./../input/test_predictions_2_models_gree_valid.tsv"
df_train = pd.read_csv(file_path, sep="\t", header=None)
df_train.columns = [
    "id", "date", "cost", "text", "category_gemma_2b", "prob_gemma_2b",
    "category_gemma_9b", "prob_gemma_9b", "category",
]

In [4]:
vectorizer = TfidfVectorizer(max_features=5700)
X = vectorizer.fit_transform(df_train["text"])
y = df_train["category"]

In [5]:
model = RidgeClassifier(alpha=0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")

Cross-Validation Accuracy Scores: [1.     1.     0.9996 0.9992 1.     1.     0.9988 1.     0.9996 0.9996]
Mean Accuracy: 0.9997
Standard Deviation: 0.0004


In [6]:
file_path = "./../input/payments_training.tsv"
df_test = pd.read_csv(file_path, sep="\t", header=None)
df_test.columns = ["index", "date", "amount", "text", "category"]

In [7]:
model.fit(X, y)
X_test = vectorizer.transform(df_test["text"])
y_test = df_test["category"]
y_pred = model.predict(X_test)

In [8]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Final Test Accuracy: {accuracy:.4f}")
print("Classification Report:", report, sep='\n')

Final Test Accuracy: 1.0000
Classification Report:
                precision    recall  f1-score   support

  BANK_SERVICE       1.00      1.00      1.00        49
    FOOD_GOODS       1.00      1.00      1.00        90
       LEASING       1.00      1.00      1.00        38
          LOAN       1.00      1.00      1.00        41
NON_FOOD_GOODS       1.00      1.00      1.00        96
NOT_CLASSIFIED       1.00      1.00      1.00        23
   REALE_STATE       1.00      1.00      1.00        27
       SERVICE       1.00      1.00      1.00        88
           TAX       1.00      1.00      1.00        48

      accuracy                           1.00       500
     macro avg       1.00      1.00      1.00       500
  weighted avg       1.00      1.00      1.00       500



In [9]:
os.system("rm -rf ./../src/models; mkdir ./../src/models")
with open("./../src/models/tf-idf-vectorizer.dill", "wb") as f:
    dill.dump(vectorizer, f)
with open("./../src/models/classifier.dill", "wb") as f:
    dill.dump(model, f)