In [1]:
import re
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


## 1. Preprocessing Function


In [2]:
def clean_text(text):
    """Normalize a raw text value for vectorization.

    Lower-cases the input, replaces every character that is not ``a-z``,
    ``0-9`` or whitespace with a space, collapses whitespace runs into a
    single space, and trims both ends.

    Parameters
    ----------
    text : Any
        Value to normalize. Anything that is not a ``str`` (e.g. ``None``
        or a float NaN) is treated as missing.

    Returns
    -------
    str
        The normalized text, or ``""`` for non-string input.
    """
    # Guard clause: non-string cells map to the empty string.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # The character class is applied after lower-casing, so upper-case letters survive.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

## 2. Load Datasets

In [3]:
# Directory holding the CSV inputs. The path is relative, so it resolves
# against the kernel's current working directory; change it here if the
# notebook is launched from a different location.
DATA_DIR = Path("../data")


def load_and_clean(filename):
    """Read one CSV from ``DATA_DIR`` and add a normalized ``clean`` column.

    Parameters
    ----------
    filename : str
        File name inside ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an extra ``clean`` column holding
        ``clean_text`` applied to each value of its ``text`` column.

    Raises
    ------
    FileNotFoundError
        If the file is missing. The message includes the resolved path and
        the current working directory so a wrong launch directory is easy
        to spot.
    """
    csv_path = DATA_DIR / filename
    if not csv_path.is_file():
        raise FileNotFoundError(
            f"Dataset not found: {csv_path.resolve()} "
            f"(current working directory: {Path.cwd()}). "
            "Adjust DATA_DIR or start the kernel from the notebook's folder."
        )
    frame = pd.read_csv(csv_path)
    # Apply cleaning
    frame["clean"] = frame["text"].apply(clean_text)
    return frame


train_df = load_and_clean("transactions.csv")
test_clean_df = load_and_clean("test_clean.csv")
test_noisy_df = load_and_clean("test_noisy.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../data/transactions.csv'

## 3. Split Train/Validation

In [None]:
# Model input is the normalized text; the target is the `category` column.
X, y = train_df["clean"], train_df["category"]

# 80/20 train/validation split with a fixed seed, stratified on the labels.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("Training rows:", X_train.shape[0])
print("Validation rows:", X_val.shape[0])

## 4. Build Model Pipeline

In [None]:
# TF-IDF over unigrams and bigrams. The vectorizer is fitted on the training
# split only and then reused (transform, no refit) for the validation split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Logistic regression classifier with the iteration cap set to 300.
model = LogisticRegression(max_iter=300)
model.fit(X_train_vec, y_train)

## 5. Validation Evaluation


In [None]:
print("\n=== Validation Performance ===\n")

# Predict on the held-out validation vectors and print the per-class report.
val_pred = model.predict(X_val_vec)
print(classification_report(y_val, val_pred))

## 6. Evaluate on CLEAN TEST DATASET

In [None]:
# Vectorize the clean test set with the existing TF-IDF vectorizer
# (transform only, no refit), then predict its categories.
X_test_clean_vec = vectorizer.transform(test_clean_df["clean"])
test_clean_pred = model.predict(X_test_clean_vec)

# Report predictions against the labels in the test file's `category` column.
print("\n=== Clean Unseen Test Dataset ===\n")
print(classification_report(test_clean_df["category"], test_clean_pred))


## 7. Evaluate on EXTREME NOISY TEST DATASET

In [None]:
print("\n=== Extreme Noisy Test Samples ===\n")

# Vectorize the noisy samples with the existing TF-IDF vectorizer and predict.
X_test_noisy_vec = vectorizer.transform(test_noisy_df["clean"])
test_noisy_pred = model.predict(X_test_noisy_vec)

# Show predictions with confidence scores: one row of class probabilities
# per sample; the row maximum is reported as the confidence.
probas = model.predict_proba(X_test_noisy_vec)

# Walk the original (uncleaned) text, the predicted label and the probability
# row together instead of indexing each by position.
for text, pred, row_probas in zip(test_noisy_df["text"], test_noisy_pred, probas):
    confidence = round(row_probas.max(), 3)
    # "\u2192" is the right-arrow character. It is written as an escape so the
    # source stays ASCII and cannot be mangled again by a wrong file encoding
    # (the previous literal had decayed into mojibake).
    print(f"Input: {text}\n \u2192 Predicted: {pred}  | Confidence: {confidence}\n")


## 8. Save Model + Vectorizer

In [None]:
# Output directory for the serialized artifacts. The path is relative, so it
# resolves against the kernel's current working directory.
MODEL_DIR = Path("../model")

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer and classifier as separate files; both are
# needed to reproduce predictions later.
joblib.dump(vectorizer, MODEL_DIR / "tfidf.pkl")
joblib.dump(model, MODEL_DIR / "logreg_model.pkl")
print("\nModel & vectorizer saved successfully!")