In [6]:
import pandas as pd
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
import nltk

# 下载nltk资源
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/pineapple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pineapple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
# 读取数据
imdb_df = pd.read_csv(r"../Week5/IMDB Dataset.csv")

# 添加一列表示review的长度
imdb_df['review_length'] = imdb_df['review'].apply(len)

# 按review长度排序
imdb_df_sorted = imdb_df.sort_values(by='review_length')

# 取出最短的5000条正面和负面评论
df_positive_shortest = imdb_df_sorted[imdb_df_sorted['sentiment']=='positive'][:5000]
df_negative_shortest = imdb_df_sorted[imdb_df_sorted['sentiment']=='negative'][:5000]

# 取出最长的5000条正面和负面评论
df_positive_longest = imdb_df_sorted[imdb_df_sorted['sentiment']=='positive'][-5000:]
df_negative_longest = imdb_df_sorted[imdb_df_sorted['sentiment']=='negative'][-5000:]

# 删除临时列
df_positive_shortest = df_positive_shortest.drop(columns=['review_length'])
df_negative_shortest = df_negative_shortest.drop(columns=['review_length'])
df_positive_longest = df_positive_longest.drop(columns=['review_length'])
df_negative_longest = df_negative_longest.drop(columns=['review_length'])

# 合并最短的评论
imdb_shortest = pd.concat([df_positive_shortest, df_negative_shortest])

# 合并最长的评论
imdb_longest = pd.concat([df_positive_longest, df_negative_longest])

# 查看数据形状
print("Shortest reviews shape:", imdb_shortest.shape)
print("Longest reviews shape:", imdb_longest.shape)

Shortest reviews shape: (10000, 2)
Longest reviews shape: (10000, 2)


In [24]:
def clean(imdb):
    # 数据清洗
    imdb['review'] = imdb['review'].apply(lambda x: re.sub('(<.*?>)', ' ', x))
    imdb['review'] = imdb['review'].apply(lambda x: re.sub('[,\.!?:()"]', '', x))
    imdb['review'] = imdb['review'].apply(lambda x: x.strip())
    imdb['review'] = imdb['review'].apply(lambda x: re.sub('[^a-zA-Z"]',' ',x))
    imdb['review'] = imdb['review'].apply(lambda x: x.lower())
clean(imdb_shortest)
clean(imdb_longest)


In [25]:
# 标记化和去除停用词
stop_words = set(stopwords.words('english'))


In [27]:
def preprocess_reviews(imdb):
    imdb['review'] = imdb['review'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
# 将情感标签转换为数值
    imdb['sentiment'] = imdb['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

preprocess_reviews(imdb_shortest)
preprocess_reviews(imdb_longest)

In [28]:
# 构建词汇表
tokenized_reviews = [word_tokenize(review) for review in imdb_shortest['review']]
vocab = {word for review in tokenized_reviews for word in review}
word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}  # 0 reserved for padding
word2idx['<PAD>'] = 0
# 将文本转换为索引序列
def encode_review(review):
    return [word2idx[word] for word in word_tokenize(review)]

imdb_shortest['review'] = imdb_shortest['review'].apply(encode_review)

In [29]:
# 构建词汇表
tokenized_reviews = [word_tokenize(review) for review in imdb_longest['review']]
vocab = {word for review in tokenized_reviews for word in review}
word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}  # 0 reserved for padding
word2idx['<PAD>'] = 0
# 将文本转换为索引序列
def encode_review(review):
    return [word2idx[word] for word in word_tokenize(review)]

imdb_longest['review'] = imdb_longest['review'].apply(encode_review)

In [30]:
def fill(imdb):
    # 填充序列
    max_len = max(imdb['review'].apply(len))
    imdb['review'] = imdb['review'].apply(lambda x: x + [0] * (max_len - len(x)))

fill(imdb_longest)
fill(imdb_shortest)

In [31]:
# 拆分数据集
def dataset(imdb):
    train_reviews, test_reviews, train_labels, test_labels = train_test_split(imdb['review'], imdb['sentiment'], test_size=0.25, random_state=42)
    return train_reviews, test_reviews, train_labels, test_labels
trainS,testS,trainS_labels, testS_labels = dataset(imdb_shortest)
trainL,testL,trainL_labels, testL_labels = dataset(imdb_longest)

In [32]:
# 创建数据集类
class IMDBDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = torch.tensor(reviews, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        return self.reviews[idx], self.labels[idx]

train_dataset = IMDBDataset(trainS.tolist(), trainS_labels.tolist())
test_dataset = IMDBDataset(testS.tolist(), testS_labels.tolist())

trainS_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
testS_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

train_dataset = IMDBDataset(trainL.tolist(), trainL_labels.tolist())
test_dataset = IMDBDataset(testL.tolist(), testL_labels.tolist())

trainL_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
testL_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [33]:
# 定义模型
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, n_layers=1, bidirectional=False):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, n_layers, bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_size * (2 if bidirectional else 1), output_size)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = x[:, -1, :]  # 取最后一个时间步的输出
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [41]:
# 检查是否支持 MPS
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [34]:
# 模型参数
vocab_size = len(word2idx)
embed_size = 128
hidden_size = 128
output_size = 1
n_layers = 2
bidirectional = True

In [39]:
model = SentimentRNN(vocab_size, embed_size, hidden_size, output_size, n_layers, bidirectional).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [40]:
# 训练模型
n_epochs = 5

In [44]:
# 训练函数
def train(model, train_loader, criterion, optimizer, n_epochs):
    model.train()
    for epoch in range(n_epochs):
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')


# 调用训练函数
train(model, trainS_loader, criterion, optimizer, n_epochs)


Epoch 1/10, Loss: 0.00021586472576018423


KeyboardInterrupt: 

In [None]:
# 测试模型
def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
            predicted = (outputs.squeeze() > 0.5).float()
            total_correct += (predicted == targets).sum().item()
    return total_loss / len(test_loader), total_correct / len(test_loader.dataset)

test_loss, test_accuracy = evaluate(model, testL_loader, criterion)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')
