In [None]:
# Baseline: Logistic Regression with CountVectorizer
# -----------------------------------------------

import os, joblib, json, time
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# === A1. Load the fitted vectorizer and the pre-vectorized data ===
vectorizer = joblib.load("artifacts/vectorizer.joblib")

# Feature matrices are stored as SciPy sparse .npz files (train first, then test).
X_train, X_test = (
    sparse.load_npz(npz_path)
    for npz_path in ("artifacts/X_train.npz", "artifacts/X_test.npz")
)

# squeeze("columns") collapses a one-column frame into a Series.
# NOTE(review): assumes each label CSV holds exactly one column — confirm.
y_train, y_test = (
    pd.read_csv(csv_path).squeeze("columns")
    for csv_path in ("artifacts/y_train.csv", "artifacts/y_test.csv")
)

# Quick sanity check of what was loaded.
print("vocab size:", len(vectorizer.vocabulary_))
print("shapes:", X_train.shape, X_test.shape)

# === A2. Build and train the model ===
# fit() returns the estimator itself, so construction and training are chained.
clf_final = LogisticRegression(
    C=0.1,
    solver="liblinear",
    max_iter=1000,
).fit(X_train, y_train)

# Evaluate accuracy on both the training and the test split.
y_pred_train, y_pred_test = clf_final.predict(X_train), clf_final.predict(X_test)
train_acc_final = accuracy_score(y_train, y_pred_train)
test_acc_final = accuracy_score(y_test, y_pred_test)
print("Train acc:", train_acc_final, "Test acc:", test_acc_final)

# Optional: per-class precision / recall / F1 on the test split.
test_report = classification_report(y_test, y_pred_test, digits=4)
print(test_report)

# === A3. Persist the trained model and its metrics ===
# Cheap guard: make sure the output directory exists before writing to it.
os.makedirs("artifacts", exist_ok=True)
MODEL_PATH = "artifacts/model_logreg_count_C0.1.joblib"
joblib.dump(clf_final, MODEL_PATH)

# Read the model name and hyperparameters back from the estimator that was
# actually saved instead of repeating them as literals, so the metadata cannot
# drift from the model if the constructor arguments are edited later.
fitted_params = clf_final.get_params()
meta = {
    "model": type(clf_final).__name__,
    # NOTE(review): the two vectorizer fields below are still hard-coded;
    # presumably they match how artifacts/vectorizer.joblib was built — confirm.
    "vectorizer": "CountVectorizer",
    "max_features": 10000,
    "solver": fitted_params["solver"],
    "C": float(fitted_params["C"]),
    # float() turns NumPy scalars into plain floats so json can serialize them.
    "train_accuracy": float(train_acc_final),
    "test_accuracy": float(test_acc_final),
    # Local wall-clock time; the format carries no timezone information.
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
with open("artifacts/baseline_metrics.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Artifacts saved:", sorted(os.listdir("artifacts")))

# === A4. Verify that the saved model can be loaded again ===
# Round-trip through disk and score the reloaded estimator on the test split;
# the printed accuracy can be compared with the "Test acc" value printed earlier.
clf_loaded = joblib.load(MODEL_PATH)
y_pred_reload = clf_loaded.predict(X_test)
reload_acc = accuracy_score(y_test, y_pred_reload)
print("Reloaded model acc:", reload_acc)


vocab size: 10000
shapes: (40000, 10000) (10000, 10000)
Train acc: 0.946325 Test acc: 0.8805
              precision    recall  f1-score   support

           0     0.8835    0.8766    0.8800      5000
           1     0.8776    0.8844    0.8810      5000

    accuracy                         0.8805     10000
   macro avg     0.8805    0.8805    0.8805     10000
weighted avg     0.8805    0.8805    0.8805     10000

Artifacts saved: ['X_test.npz', 'X_train.npz', 'baseline_metrics.json', 'model_logreg_count_C0.1.joblib', 'vectorizer.joblib', 'y_test.csv', 'y_train.csv']
Reloaded model acc: 0.8805
