In [None]:
# === Section A: load data ===
import joblib
from scipy import sparse
import pandas as pd

# Vectorizer object stored under artifacts/; its vocabulary_ attribute is
# read below.
# NOTE(review): joblib.load unpickles the file, so only point it at
# artifacts produced by a trusted pipeline.
vectorizer = joblib.load("artifacts/vectorizer.joblib")

# Sparse feature matrices for the train and test splits.
X_train, X_test = (
    sparse.load_npz("artifacts/X_train.npz"),
    sparse.load_npz("artifacts/X_test.npz"),
)

# Labels for the two splits. Squeezing along the column axis collapses a
# one-column frame into a Series.
# NOTE(review): assumes each CSV holds exactly one label column; confirm.
y_train, y_test = (
    pd.read_csv("artifacts/y_train.csv").squeeze(axis="columns"),
    pd.read_csv("artifacts/y_test.csv").squeeze(axis="columns"),
)

# Quick sanity output: vocabulary size and the shapes of both matrices.
print("Vocabulary size:", len(vectorizer.vocabulary_))
print("Shapes:", X_train.shape, X_test.shape)

# === Section B: train the baseline model ===
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Baseline classifier: logistic regression fitted on the training matrix
# and labels loaded earlier in this notebook.
clf = LogisticRegression(solver='liblinear', max_iter=1000)
clf.fit(X_train, y_train)

train_acc = accuracy_score(y_train, clf.predict(X_train))

# Predict the test split once and reuse the result for the accuracy, the
# classification report and the metrics table, instead of running
# clf.predict(X_test) a second time.
y_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

# === Section C: classification report and metrics table ===
print(classification_report(y_test, y_pred, digits=4))

# One-row summary table for the baseline model.
# NOTE(review): precision_score / recall_score / f1_score are called with
# their defaults (average='binary', pos_label=1), which assumes a two-class
# problem whose positive label is 1. Confirm against the label encoding in
# y_train / y_test.
baseline_metrics = pd.DataFrame({
    'Model': ['Logistic Regression (baseline)'],
    'Train Accuracy': [train_acc],
    'Test Accuracy': [test_acc],
    'Test Precision': [precision_score(y_test, y_pred)],
    'Test Recall': [recall_score(y_test, y_pred)],
    'Test F1': [f1_score(y_test, y_pred)],
})

# Bare expression so the notebook renders the table as the cell's output.
baseline_metrics



10000
(40000, 10000) (10000, 10000)
