# Spam/Fraud Message Detection with TF-IDF + Interpretable Linear Model

**Features:**
* Clean reproducible pipeline
* Proper evaluation (precision/recall/F1, ROC-AUC, PR-AUC)
* Threshold tuning in a “fraud triage” style (maximize recall while keeping precision at an acceptable level)
* Interpretability: top spam indicators (feature weights)
* Error analysis: show false positives/negatives

In [7]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, average_precision_score,
    precision_recall_curve
)

## Load + clean data (robust)

In [None]:
# Locate the CSV relative to the current working directory:
# two levels up, then introduction/datasets/spam.csv.
dataset = Path.cwd().resolve().parents[1] / "introduction" / "datasets" / "spam.csv"

raw = pd.read_csv(dataset, encoding="latin1")

# Prefer the "v1"/"v2" column names; otherwise fall back to the first two
# columns by position.
label_col = "v1" if "v1" in raw.columns else raw.columns[0]
text_col = "v2" if "v2" in raw.columns else raw.columns[1]

df = raw[[label_col, text_col]].rename(columns={label_col: "label", text_col: "text"}).copy()

# A row without a label has no known class. Left in place, its missing label
# would compare unequal to "ham" below and be silently counted as spam.
df = df.dropna(subset=["label"])

# Fill missing text BEFORE casting to str: casting first can turn NaN into the
# literal string "nan", after which fillna("") has nothing left to fill.
df["text"] = df["text"].fillna("").astype(str)

# Binary target: 0 for "ham" (ignoring case and surrounding whitespace),
# 1 for every other label.
df["y"] = (df["label"].str.lower().str.strip() != "ham").astype(int)
df = df.drop(columns=["label"])

df.head()


Unnamed: 0,text,y
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


### Dataset Checks


In [9]:
# Share of each class in the target column, with the 0/1 codes shown as
# readable "ham"/"spam" index labels.
(
    df["y"]
    .value_counts(normalize=True)
    .rename(index={0: "ham", 1: "spam"})
    .to_frame(name="proportion")
)


Unnamed: 0_level_0,proportion
y,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


### Feature engineering

In [10]:
# Matches an http(s):// link or a bare "www." link, case-insensitively.
_url_re = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)


def make_numeric_features(text_series: pd.Series) -> pd.DataFrame:
    """Build simple numeric descriptors for each message.

    Missing values are treated as empty strings. The returned frame has one
    row per input message and these columns:

    * ``len``        - number of characters
    * ``pct_digits`` - digits as a percentage of the length (0 for empty text)
    * ``pct_upper``  - uppercase characters as a percentage of the length
      (0 for empty text)
    * ``exclam``     - number of ``!`` characters
    * ``qmark``      - number of ``?`` characters
    * ``has_url``    - 1 if the message contains a URL-like token, else 0
    """
    messages = text_series.fillna("").astype(str)
    n_chars = messages.str.len()

    def _as_percent_of_length(counts):
        # Empty messages would divide by zero; report 0.0 for them instead.
        return np.where(n_chars > 0, (counts / n_chars) * 100.0, 0.0)

    def _contains_url(message):
        return 1 if _url_re.search(message) else 0

    digit_counts = messages.str.count(r"\d")
    upper_counts = messages.apply(
        lambda message: sum(1 for ch in message if ch.isupper())
    )

    columns = {
        "len": n_chars,
        "pct_digits": _as_percent_of_length(digit_counts),
        "pct_upper": _as_percent_of_length(upper_counts),
        "exclam": messages.str.count(r"!"),
        "qmark": messages.str.count(r"\?"),
        "has_url": messages.apply(_contains_url),
    }
    return pd.DataFrame(columns)

def _numeric_features_from_frame(X: pd.DataFrame) -> pd.DataFrame:
    """Compute the hand-crafted numeric features from the "text" column of X."""
    return make_numeric_features(X["text"])


# Pipeline step that turns the raw "text" column into numeric features.
# A named function is used instead of a lambda because the standard pickle
# module cannot serialize lambdas, which would prevent saving the pipeline.
numeric_transformer = Pipeline(steps=[
    ("feats", FunctionTransformer(_numeric_features_from_frame, validate=False)),
])
