In [8]:
# === A 區：載入資料 ===
import joblib
from scipy import sparse
import pandas as pd

vectorizer = joblib.load("artifacts/vectorizer.joblib")
X_train = sparse.load_npz("artifacts/X_train.npz")
X_test  = sparse.load_npz("artifacts/X_test.npz")
y_train = pd.read_csv("artifacts/y_train.csv").squeeze("columns")
y_test  = pd.read_csv("artifacts/y_test.csv").squeeze("columns")

print("Vocabulary size:", len(vectorizer.vocabulary_))
print("Shapes:", X_train.shape, X_test.shape)

# === B 區：訓練 baseline 模型 ===
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(solver='liblinear', max_iter=1000)
clf.fit(X_train, y_train)

train_acc = accuracy_score(y_train, clf.predict(X_train))
test_acc  = accuracy_score(y_test,  clf.predict(X_test))

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

# === C 區：分類報告與指標表 ===
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

baseline_metrics = pd.DataFrame({
    'Model': ['Logistic Regression (baseline)'],
    'Train Accuracy': [train_acc],
    'Test Accuracy': [test_acc],
    'Test Precision': [precision_score(y_test, y_pred)],
    'Test Recall': [recall_score(y_test, y_pred)],
    'Test F1': [f1_score(y_test, y_pred)]
})

baseline_metrics



Vocabulary size: 10000
Shapes: (40000, 10000) (10000, 10000)
Train Accuracy: 0.9747
Test Accuracy: 0.8664
              precision    recall  f1-score   support

           0     0.8671    0.8654    0.8663      5000
           1     0.8657    0.8674    0.8665      5000

    accuracy                         0.8664     10000
   macro avg     0.8664    0.8664    0.8664     10000
weighted avg     0.8664    0.8664    0.8664     10000



Unnamed: 0,Model,Train Accuracy,Test Accuracy,Test Precision,Test Recall,Test F1
0,Logistic Regression (baseline),0.9747,0.8664,0.865669,0.8674,0.866533


In [None]:
# Count

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {"C": [0.1, 1, 10]}
base = LogisticRegression(solver='liblinear', max_iter=1000)

gs = GridSearchCV(
    estimator=base,
    param_grid=param_grid,
    cv=3,                 # 3-fold CV
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True
)
gs.fit(X_train, y_train)  # 讓格線搜尋真的做 3-fold 訓練

gs.best_params_, round(gs.best_score_, 4) # 取最佳 C 與 CV 平均分數

from sklearn.metrics import accuracy_score
best_C = gs.best_params_['C']
clf_C = LogisticRegression(solver='liblinear', max_iter=1000, C=best_C).fit(X_train, y_train)
train_acc_C = accuracy_score(y_train, clf_C.predict(X_train))
test_acc_C  = accuracy_score(y_test,  clf_C.predict(X_test))
(best_C, train_acc_C, test_acc_C)

({'C': 0.1}, np.float64(0.8845))

> 3-fold CV 最佳 C=0.1，CV 分數=0.8845。C 從 1 調到 0.1 後，test accuracy 從 0.8664 提升到 0.8805，過擬合 gap 從 10.8% 降到 6.6%。


In [None]:
# TF-IDF

import re, pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split

stop_words = ENGLISH_STOP_WORDS
def clean_text(text):
    cleaned = re.sub(r"[^\w\s]", "", text)
    lowercase = cleaned.lower()
    words = lowercase.split()
    return [w for w in words if w not in stop_words]

df_raw = pd.read_csv("dataset/IMDB Dataset.csv")
df_raw['clean_tokens'] = df_raw['review'].apply(clean_text)
df_raw['label'] = df_raw['sentiment'].map({'positive':1, 'negative':0})
df_raw['clean_text'] = df_raw['clean_tokens'].apply(lambda t: " ".join(t))

X_text_train2, X_text_test2, y_train2, y_test2 = train_test_split(
    df_raw['clean_text'], df_raw['label'],
    test_size=0.2, random_state=9, stratify=df_raw['label']
)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_text_train2)
X_test_tfidf  = tfidf.transform(X_text_test2)

X_train_tfidf.shape, X_test_tfidf.shape  # 檢查 shape

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

best_C = 0.1  # 用你剛剛 GridSearch 得到的結果
clf_tfidf = LogisticRegression(solver='liblinear', max_iter=1000, C=best_C).fit(X_train_tfidf, y_train2)

acc_train_tfidf = accuracy_score(y_train2, clf_tfidf.predict(X_train_tfidf))
acc_test_tfidf  = accuracy_score(y_test2,  clf_tfidf.predict(X_test_tfidf))
(acc_train_tfidf, acc_test_tfidf)

import pandas as pd

compare = pd.DataFrame({
    'Vectorizer': ['Count (C=0.1)', 'TF-IDF (C=0.1)'],
    'Train Accuracy': [train_acc_C, acc_train_tfidf],
    'Test Accuracy':  [test_acc_C,  acc_test_tfidf],
})
compare


# 最佳 C=0.1 時，Count Vectorizer Test Accuracy=0.8805，TF-IDF=0.8610。
# Count 分數較高，但 TF-IDF 的 train-test gap 較小（過擬合程度較低）。
# 若追求測試分數，暫留 Count；若重視泛化穩定性，可考慮 TF-IDF。

Unnamed: 0,Vectorizer,Train Accuracy,Test Accuracy
0,Count (C=0.1),0.946325,0.8805
1,TF-IDF (C=0.1),0.88345,0.861


# === B4: 比較 Count vs TF-IDF ===

```python
compare = pd.DataFrame({
    'Vectorizer': ['Count (C=0.1)', 'TF-IDF (C=0.1)'],
    'Train Accuracy': [train_acc_C, acc_train_tfidf],
    'Test Accuracy':  [test_acc_C,  acc_test_tfidf],
})
compare
```

**結論**：
最佳 C=0.1 時，Count Vectorizer Test Accuracy=0.8805，TF-IDF=0.8610。
Count 分數較高，但 TF-IDF 的 train-test gap 較小（過擬合程度較低）。
若追求測試分數，暫留 Count；若重視泛化穩定性，可考慮 TF-IDF。