In [31]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stop_words = ENGLISH_STOP_WORDS

# 前處理單句用函式
def clean_text(text):
    cleaned = re.sub(r"[^\w\s]", "", text)   # 去除標點與特殊符號
    lowercase = cleaned.lower()              # 轉小寫
    words = lowercase.split()                # 分詞
    filtered_words = [word for word in words if word not in stop_words]  # 去除 stop words
    return filtered_words

In [32]:
import pandas as pd

# 載入 CSV 資料（有兩欄：review, sentiment）
df = pd.read_csv("dataset/IMDB Dataset.csv")

# Step 1: 套用 clean_text() 處理文字欄位
df['clean_tokens'] = df['review'].apply(clean_text)

# Step 2: 轉換情緒標籤為數值
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# 確認結果
df[['review', 'clean_tokens', 'sentiment', 'label']].head()

Unnamed: 0,review,clean_tokens,sentiment,label
0,One of the other reviewers has mentioned that ...,"[reviewers, mentioned, watching, just, 1, oz, ...",positive,1
1,A wonderful little production. <br /><br />The...,"[wonderful, little, production, br, br, filmin...",positive,1
2,I thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su...",positive,1
3,Basically there's a family where a little boy ...,"[basically, theres, family, little, boy, jake,...",negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...","[petter, matteis, love, time, money, visually,...",positive,1


In [None]:
# 補：把 tokens 黏回一句文字，給向量器用
df['clean_text'] = df['clean_tokens'].apply(lambda toks: " ".join(toks))

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# 為避免資料外洩，先 train_test_split 再 fit_transform（只在訓練集上 fit）
# Step 1: 先切分（避免資料外洩，並保持標籤比例）
X_text_train, X_text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'],
    test_size=0.2, random_state=9, stratify=df['label']
)

# Step 2: 向量化：只在 train 上 fit
cv = CountVectorizer(max_features=10000)
X_train = cv.fit_transform(X_text_train)
X_test  = cv.transform(X_text_test)

import os, joblib
from scipy import sparse

os.makedirs("artifacts", exist_ok=True)
joblib.dump(cv, "artifacts/vectorizer.joblib")
sparse.save_npz("artifacts/X_train.npz", X_train)
sparse.save_npz("artifacts/X_test.npz",  X_test)
y_train.to_csv("artifacts/y_train.csv", index=False)
y_test.to_csv("artifacts/y_test.csv", index=False)
print("✅ 已存到 artifacts/")


✅ 已存到 artifacts/


In [34]:
import os; os.listdir("artifacts")

['vectorizer.joblib', 'X_test.npz', 'X_train.npz', 'y_test.csv', 'y_train.csv']