In [17]:
import os
import re
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, SpatialDropout1D


In [3]:
def load_imdb_data(data_dir, subset='train'):
    """
    从IMDB aclImdb目录中读取文本和标签。
    subset: 'train' or 'test'
    返回: (texts, labels)
    """
    texts = []
    labels = []

    # pos 文件夹
    pos_dir = os.path.join(data_dir, subset, 'pos')
    for fname in os.listdir(pos_dir):
        with open(os.path.join(pos_dir, fname), 'r', encoding='utf-8') as f:
            texts.append(f.read())
            labels.append(1)  # 正面影评

    # neg 文件夹
    neg_dir = os.path.join(data_dir, subset, 'neg')
    for fname in os.listdir(neg_dir):
        with open(os.path.join(neg_dir, fname), 'r', encoding='utf-8') as f:
            texts.append(f.read())
            labels.append(0)  # 负面影评

    return texts, labels

def basic_preprocess(text):
    """
    对文本做一个简单的清洗示例: 去除HTML标签、非字母字符等。
    也可以根据需要进行更高级的清洗或分词。
    """
    # 去除 HTML 标签
    text = re.sub(r"<.*?>", "", text)
    # 只保留字母和空格
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # 转小写
    text = text.lower()
    # 去除多余空格
    text = re.sub(r"\s+", " ", text).strip()
    return text

data_dir = "/Volumes/Dreamer1.6/homework/大数据/aclImdb" 

train_texts_raw, train_labels = load_imdb_data(data_dir, subset='train')
test_texts_raw, test_labels = load_imdb_data(data_dir, subset='test')

# 基本预处理
train_texts = [basic_preprocess(t) for t in train_texts_raw]
test_texts = [basic_preprocess(t) for t in test_texts_raw]

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

print("训练集样本数:", len(train_texts))
print("测试集样本数:", len(test_texts))

训练集样本数: 25000
测试集样本数: 25000


In [14]:
# 初始化TfidfVectorizer，可以自定义token_pattern、ngram_range、max_features等超参数
tfidf = TfidfVectorizer(
    max_features=70000,  # 取词表最大维度，可根据内存或性能需求调整
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(train_texts)
X_test_tfidf = tfidf.transform(test_texts)

y_train = train_labels
y_test = test_labels

print("TF-IDF 训练集维度: ", X_train_tfidf.shape)
print("TF-IDF 测试集维度: ", X_test_tfidf.shape)

TF-IDF 训练集维度:  (25000, 70000)
TF-IDF 测试集维度:  (25000, 70000)


In [5]:
lr_model = LogisticRegression(max_iter=200, verbose=1)
lr_model.fit(X_train_tfidf, y_train)

lr_preds = lr_model.predict(X_test_tfidf)
lr_acc = accuracy_score(y_test, lr_preds)

print("逻辑回归测试集准确率: {:.4f}".format(lr_acc))

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        20001     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  5.38894D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
20001     23     26      1     0     0   3.766D-05   3.394D-01
  F =  0.33936644771636248     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
逻辑回归测试集准确率: 0.8807


 This problem is unconstrained.


In [6]:
svm_model = SVC(kernel='linear', verbose=1)
svm_model.fit(X_train_tfidf, y_train)

svm_preds = svm_model.predict(X_test_tfidf)
svm_acc = accuracy_score(y_test, svm_preds)

print("SVM 测试集准确率: {:.4f}".format(svm_acc))

[LibSVM].................*........*
optimization finished, #iter = 25944
obj = -5781.909920, rho = -0.037240
nSV = 10536, nBSV = 5866
Total nSV = 10536
SVM 测试集准确率: 0.8727


In [None]:
vocab_size = 20000  # 词汇表大小
maxlen = 100        # 每条评论最长取多少个词
embedding_dim = 128 # Embedding维度
lstm_units = 128    # LSTM隐藏单元数

# Tokenizer序列化
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_test_seq = tokenizer.texts_to_sequences(test_texts)

# 序列填充
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post', truncating='post')

y_train = train_labels
y_test = test_labels

print("训练集序列矩阵维度:", X_train_pad.shape)
print("测试集序列矩阵维度:", X_test_pad.shape)

# 模型结构优化
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(SpatialDropout1D(0.2))  # 加入空间Dropout减少过拟合
model.add(Bidirectional(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)))  # 改进的关键点
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

epochs = 20  
batch_size = 128

model.fit(X_train_pad, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

# 模型评估
loss, accuracy = model.evaluate(X_test_pad, y_test, verbose=0)
print("优化后LSTM测试集准确率: {:.4f}".format(accuracy))

训练集序列矩阵维度: (25000, 100)
测试集序列矩阵维度: (25000, 100)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 138ms/step - accuracy: 0.6434 - loss: 0.6101 - val_accuracy: 0.8196 - val_loss: 0.4109
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 170ms/step - accuracy: 0.8643 - loss: 0.3405 - val_accuracy: 0.8452 - val_loss: 0.3909
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 164ms/step - accuracy: 0.9061 - loss: 0.2490 - val_accuracy: 0.8024 - val_loss: 0.4959
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 166ms/step - accuracy: 0.9377 - loss: 0.1750 - val_accuracy: 0.8704 - val_loss: 0.3381
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 168ms/step - accuracy: 0.9537 - loss: 0.1358 - val_accuracy: 0.8212 - val_loss: 0.6576
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 165ms/step - accuracy: 0.9652 - loss: 0.1041 - val_accuracy: 0.8656 - val_loss: 0.4865
Epoch 7/20