# Step 1 读取数据集

In [2]:
import os
import glob

def load_imdb_data(data_dir="/kaggle/input/imdbv1/aclImdb", is_train=True):
    """Load one split of the IMDb review dataset from disk.

    The expected layout is ``<data_dir>/<train|test>/<neg|pos>/*.txt``.

    :param data_dir: root directory that contains the ``train`` and ``test`` folders
    :param is_train: read the ``train`` split when True, otherwise the ``test`` split
    :return: list of ``(text, label)`` tuples where label 0 = ``neg`` and 1 = ``pos``;
        files are visited in sorted path order so the result is reproducible
    :raises FileNotFoundError: if ``data_dir`` or one of the expected class
        folders of the selected split does not exist
    """
    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"目录不存在: {data_dir}")
    split_dir = os.path.join(data_dir, 'train' if is_train else 'test')

    data = []
    # The position in this list is the label: neg -> 0, pos -> 1.
    for label, folder in enumerate(["neg", "pos"]):
        folder_path = os.path.join(split_dir, folder)
        # Fail loudly instead of silently returning a corpus with a missing class.
        if not os.path.isdir(folder_path):
            raise FileNotFoundError(f"目录不存在: {folder_path}")
        # "*.txt" does not match dot-files; sorting removes the dependence on
        # the filesystem's directory listing order.
        file_pattern = os.path.join(folder_path, "*.txt")
        for file_path in sorted(glob.glob(file_pattern)):
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()
            except UnicodeDecodeError:
                # Best effort: report the undecodable file and keep going.
                print(f"跳过损坏文件: {file_path}")
            except Exception as e:
                print(f"读取错误 {file_path}: {str(e)}")
            else:
                data.append((text, label))
    return data

In [3]:
# Dataset root, given relative to the notebook's working directory.
data_dir = "../imdb_aclImdb_v1/aclImdb"
# Read the training and test splits; each is a list of (text, label) tuples.
train_corpus = load_imdb_data(data_dir=data_dir, is_train=True)
test_corpus = load_imdb_data(data_dir=data_dir, is_train=False)

In [4]:
train_corpus[0]

("Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
 0)

# Step 2 数据预处理

## tokenize

In [5]:
! pip install nltk

Looking in indexes: https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
Collecting nltk
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
     ------------- -------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
     --------------------------- ------------ 1.0/1.5 MB 1.8 MB/s eta 0:00:01
     ---------------------------------- ----- 1.3/1.5 MB 2.0 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 1.5 MB/s eta 0:00:00
Collecting joblib (from nltk)
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl (301 kB)
Collecti

In [6]:
import nltk
from nltk.tokenize import word_tokenize
# Recent NLTK releases (including 3.9.x) make word_tokenize load the
# 'punkt_tab' resource, while older releases use the pickled 'punkt' models.
# Fetch both so the tokenizer works regardless of the installed version.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

def tokenize_corpus(corpus):
    """Tokenize every document of a labelled corpus with ``word_tokenize``.

    :param corpus: iterable of ``(text, label)`` pairs
    :return: list of ``(tokens, label)`` pairs, in the same order as the input
    """
    return [(word_tokenize(document), sentiment) for document, sentiment in corpus]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Tokenize both splits; each result is a list of (tokens, label) tuples.
tokenized_train_corpus = tokenize_corpus(train_corpus)
tokenized_test_corpus = tokenize_corpus(test_corpus)

In [8]:
tokenized_train_corpus[0]

(['Story',
  'of',
  'a',
  'man',
  'who',
  'has',
  'unnatural',
  'feelings',
  'for',
  'a',
  'pig',
  '.',
  'Starts',
  'out',
  'with',
  'a',
  'opening',
  'scene',
  'that',
  'is',
  'a',
  'terrific',
  'example',
  'of',
  'absurd',
  'comedy',
  '.',
  'A',
  'formal',
  'orchestra',
  'audience',
  'is',
  'turned',
  'into',
  'an',
  'insane',
  ',',
  'violent',
  'mob',
  'by',
  'the',
  'crazy',
  'chantings',
  'of',
  'it',
  "'s",
  'singers',
  '.',
  'Unfortunately',
  'it',
  'stays',
  'absurd',
  'the',
  'WHOLE',
  'time',
  'with',
  'no',
  'general',
  'narrative',
  'eventually',
  'making',
  'it',
  'just',
  'too',
  'off',
  'putting',
  '.',
  'Even',
  'those',
  'from',
  'the',
  'era',
  'should',
  'be',
  'turned',
  'off',
  '.',
  'The',
  'cryptic',
  'dialogue',
  'would',
  'make',
  'Shakespeare',
  'seem',
  'easy',
  'to',
  'a',
  'third',
  'grader',
  '.',
  'On',
  'a',
  'technical',
  'level',
  'it',
  "'s",
  'better',
  't

## build word dict

In [10]:
from collections import Counter

def build_vocab(tokenized_corpus, min_freq=5):
    """Build a frequency-sorted vocabulary from a tokenized corpus.

    The two reserved entries ``<PAD>`` (index 0) and ``<UNK>`` (index 1) always
    come first. A corpus token that happens to be spelled like a reserved entry
    is not added a second time, so the reserved indices can never be overwritten.

    :param tokenized_corpus: iterable of ``(tokens, label)`` pairs
    :param min_freq: minimum number of occurrences for a token to be kept (default 5)
    :return: ``(final_vocab, word2idx, idx2word)`` - the vocabulary list, the
        token -> index mapping and the index -> token mapping
    """
    special_tokens = ['<PAD>', '<UNK>']
    reserved = set(special_tokens)

    # Count per document instead of materialising one flat list of all tokens.
    word_freq = Counter()
    for tokens, _ in tokenized_corpus:
        word_freq.update(tokens)

    # most_common() sorts by descending count; ties keep first-seen order.
    filtered_vocab = [
        word
        for word, freq in word_freq.most_common()
        if freq >= min_freq and word not in reserved
    ]

    final_vocab = special_tokens + filtered_vocab

    word2idx = {word: idx for idx, word in enumerate(final_vocab)}
    idx2word = dict(enumerate(final_vocab))

    return final_vocab, word2idx, idx2word

In [11]:
# The vocabulary is built from the tokenized training split only.
final_vocab, word2idx, idx2word = build_vocab(tokenized_train_corpus)

In [12]:
len(final_vocab)

35813

In [20]:
word2idx['77'], word2idx['<UNK>'], word2idx['<PAD>']

(20817, 1, 0)

In [17]:
idx2word[2], idx2word[100]

('the', 'we')

## mapping word to idx

In [18]:
def text_to_ids(tokenized_corpus, word2idx, unk_token="<UNK>"):
    """Encode a tokenized corpus as integer ID sequences.

    Tokens missing from ``word2idx`` are replaced by the ID of ``unk_token``;
    if ``unk_token`` itself is not in the mapping, ID 0 is used instead.
    Samples that contain no tokens are reported with a warning and dropped,
    together with their label.

    :param tokenized_corpus: iterable of ``(tokens, label)`` pairs
    :param word2idx: mapping ``{token: index}``
    :param unk_token: out-of-vocabulary marker, default ``"<UNK>"``
    :return: ``(id_sequences, labels)`` - two parallel lists
    """
    fallback_id = word2idx.get(unk_token, 0)
    lookup = word2idx.get
    id_sequences, labels = [], []

    for tokens, label in tokenized_corpus:
        encoded = [lookup(tok, fallback_id) for tok in tokens]
        if encoded:
            id_sequences.append(encoded)
            labels.append(label)
        else:
            # Nothing to encode: warn and leave this sample out.
            print(f"警告：空序列，标签{label}，原始tokens：{tokens}")

    return id_sequences, labels

In [19]:

# Encode the training and test splits with the same vocabulary mapping
train_ids, train_labels = text_to_ids(tokenized_train_corpus, word2idx)
test_ids, test_labels = text_to_ids(tokenized_test_corpus, word2idx)

# Preview the first two encoded training samples
print("\nID序列示例：")
for i in range(2):
    print(f"\n样本{i+1} - 标签：{train_labels[i]}")
    # Only the first 10 IDs / tokens are shown to keep the output short
    print(f"分词长度：{len(train_ids[i])} → ID序列：{train_ids[i][:10]}...（前10个ID）")
    print(f"对应分词：{tokenized_train_corpus[i][0][:10]}...（前10个词）")
    print(f"ID映射示例：{tokenized_train_corpus[i][0][0]} → {train_ids[i][0]}")
    


ID序列示例：

样本1 - 标签：0
分词长度：123 → ID序列：[1485, 7, 6, 171, 47, 56, 8727, 1410, 25, 6]...（前10个ID）
对应分词：['Story', 'of', 'a', 'man', 'who', 'has', 'unnatural', 'feelings', 'for', 'a']...（前10个词）
ID映射示例：Story → 1485

样本2 - 标签：0
分词长度：899 → ID序列：[9990, 21910, 535, 22, 6, 4418, 244, 9364, 14294, 1751]...（前10个ID）
对应分词：['Airport', "'77", 'starts', 'as', 'a', 'brand', 'new', 'luxury', '747', 'plane']...（前10个词）
ID映射示例：Airport → 9990


## mini-batch

In [22]:
# ------------ 1. 定义PyTorch Dataset ------------
import torch
from torch.utils.data import Dataset, DataLoader

class IMDBDataset(Dataset):
    """Dataset over pre-encoded reviews that yields raw, unpadded samples."""

    def __init__(self, id_sequences, labels, pad_id=0):
        """
        :param id_sequences: train_ids/test_ids (a list of ID lists)
        :param labels: train_labels/test_labels (a list)
        :param pad_id: ID of the padding token (default 0); stored on the
            instance but not used by this class itself
        """
        n_sequences, n_labels = len(id_sequences), len(labels)
        assert n_sequences == n_labels, "数据与标签数量不匹配"
        self.seqs, self.labels = id_sequences, labels
        self.pad_id = pad_id

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for ``idx``; the sequence is not padded."""
        sequence = self.seqs[idx]
        target = self.labels[idx]
        return sequence, target

# ------------ 2. 动态填充的Collate函数（关键！） ------------
def imdb_collate(batch, pad_id=0):
    """Collate raw samples into a batch padded to its own longest sequence.

    :param batch: list of ``(sequence, label)`` pairs, each sequence a list of IDs
    :param pad_id: value appended to the end of shorter sequences (default 0)
    :return: ``(padded_seqs, seq_lengths, labels)`` - a LongTensor of shape
        [B, L], a LongTensor with the original lengths [B] and a LongTensor
        with the labels [B]
    """
    seqs, labels = zip(*batch)

    # Original (unpadded) lengths, e.g. for packing in an RNN.
    seq_lengths = torch.LongTensor([len(s) for s in seqs])
    longest = int(seq_lengths.max())

    # Right-pad every sequence up to the longest one in this batch.
    rows = [
        torch.LongTensor(s + [pad_id] * (longest - len(s)))
        for s in seqs
    ]

    padded_seqs = torch.stack(rows)    # [B, L]
    labels = torch.LongTensor(labels)  # [B]
    return padded_seqs, seq_lengths, labels


In [23]:
# ------------ 3. Build the DataLoaders ------------
# imdb_collate pads with ID 0 by default, so the vocabulary must map <PAD> to 0.
assert word2idx.get("<PAD>", -1) == 0, "请确保词表中<PAD>的ID是0"

# Training batches
train_dataset = IMDBDataset(train_ids, train_labels, pad_id=0)
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=imdb_collate,
    # Load in the main process. On platforms that start workers with "spawn"
    # (Windows, macOS) a worker cannot import the dataset class / collate
    # function defined inside a notebook, and iteration stalls on batch 0.
    # Raise this only after moving those definitions into an importable module.
    num_workers=0,
    # Pinned host memory is only useful when batches are copied to a CUDA device.
    pin_memory=torch.cuda.is_available(),
)

# Test batches (no shuffling, so evaluation order is stable)
test_dataset = IMDBDataset(test_ids, test_labels, pad_id=0)
test_loader = DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    collate_fn=imdb_collate
)

# Step 3 构建网络与训练

## textcnn

In [24]:
import torch
import torch.nn as nn

class TextCNN(nn.Module):
    """Convolutional text classifier that emits one logit per sequence.

    Token IDs are embedded, convolved with several kernel heights in parallel,
    max-pooled over time, concatenated and projected to a single logit.
    """

    def __init__(self, vocab_size, embed_dim=128, num_filters=64, kernel_sizes=(3, 4, 5)):
        """
        :param vocab_size: number of rows in the embedding table
        :param embed_dim: embedding dimension
        :param num_filters: number of output channels per kernel size
        :param kernel_sizes: kernel heights in tokens; the default (3, 4, 5)
            reproduces the original fixed configuration
        """
        super().__init__()
        self.kernel_sizes = tuple(kernel_sizes)
        # Index 0 is the padding token; its embedding is kept at zero.
        self.embedding = nn.Embedding(
            vocab_size, embed_dim, padding_idx=0
        )
        # One convolution per kernel height, each spanning the full embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim))
            for k in self.kernel_sizes
        ])
        self.fc = nn.Linear(num_filters * len(self.kernel_sizes), 1)

    def forward(self, x):
        """
        :param x: LongTensor [B, L] of padded token IDs (no length info needed)
        :return: FloatTensor [B] of logits; the batch dimension is kept even
            when B == 1
        """
        # A sequence shorter than the tallest kernel would make the convolution
        # fail, so right-pad such inputs with the padding ID.
        shortfall = max(self.kernel_sizes) - x.size(1)
        if shortfall > 0:
            x = nn.functional.pad(x, (0, shortfall), value=self.embedding.padding_idx)

        x = self.embedding(x)  # [B, L, E]
        x = x.unsqueeze(1)     # [B, 1, L, E] as expected by Conv2d

        conv_outs = []
        for conv in self.convs:
            conv_out = conv(x).relu()                        # [B, F, L-k+1, 1]
            pooled = conv_out.squeeze(3).max(dim=2).values   # [B, F]
            conv_outs.append(pooled)

        x = torch.cat(conv_outs, dim=1)  # [B, len(kernel_sizes)*F]
        # Squeeze only the logit dimension: a bare squeeze() would also drop
        # the batch dimension when B == 1.
        return self.fc(x).squeeze(-1)  # [B]
    

## bilstm

In [26]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class AttentionLSTM(nn.Module):
    """Bidirectional LSTM with attention pooling that emits one logit per sequence."""

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128):
        """
        :param vocab_size: number of rows in the embedding table
        :param embed_dim: embedding dimension
        :param hidden_dim: LSTM hidden size per direction
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim,
            bidirectional=True, batch_first=True
        )
        # Scores each time step of the concatenated forward/backward states.
        self.attn = nn.Linear(hidden_dim*2, 1)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x, lengths):
        """
        :param x: LongTensor [B, L] of right-padded token IDs
        :param lengths: true (unpadded) length of every sequence, shape [B];
            every length must be at least 1
        :return: FloatTensor [B] of logits; the batch dimension is kept even
            when B == 1
        """
        max_len = x.size(1)
        embedded = self.embedding(x)  # [B, L, E]

        # pack_padded_sequence requires the lengths as a CPU int64 tensor.
        lengths_cpu = torch.as_tensor(lengths).to(device="cpu", dtype=torch.long)
        packed = pack_padded_sequence(
            embedded, lengths_cpu, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # total_length keeps the time dimension equal to L of the input.
        outputs, _ = pad_packed_sequence(
            packed_out, batch_first=True, total_length=max_len
        )  # [B, L, 2H]

        attn_scores = self.attn(outputs).squeeze(2)  # [B, L]
        # Padded steps would otherwise get a score equal to the attention bias
        # and take a share of the softmax mass; exclude them explicitly.
        positions = torch.arange(max_len, device=outputs.device).unsqueeze(0)   # [1, L]
        pad_mask = positions >= lengths_cpu.to(outputs.device).unsqueeze(1)     # [B, L]
        attn_scores = attn_scores.masked_fill(pad_mask, float("-inf"))
        attn_weights = torch.softmax(attn_scores, dim=1)

        # Weighted sum over time: [B, 1, L] x [B, L, 2H] -> [B, 1, 2H] -> [B, 2H]
        weighted = torch.bmm(attn_weights.unsqueeze(1), outputs).squeeze(1)

        # Squeeze only the logit dimension so a batch of one stays shape [1].
        return self.fc(weighted).squeeze(-1)  # [B]
    

## 训练

In [28]:
# Initialisation (the embedding table is sized from the vocabulary length)
# Seed PyTorch's global RNG first so that weight initialisation and the
# shuffling order of the training DataLoader are repeatable across runs.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextCNN(vocab_size=len(final_vocab)).to(device)
criterion = nn.BCEWithLogitsLoss()  # binary classification on raw logits
lr = 2e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 3

In [None]:
from tqdm import tqdm
import time

# Training loop (iterates over train_loader directly)
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    start_time = time.time()  # epoch start, used for the timing printout below
    
    # Wrap train_loader in tqdm to get a per-batch progress bar
    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch") as t:
        # The second item yielded by the loader is discarded here
        for batch_seqs, _, batch_labels in t:
            batch_seqs = batch_seqs.to(device)
            batch_labels = batch_labels.float().to(device)  # labels are cast to float before the loss
            
            outputs = model(batch_seqs)
            loss = criterion(outputs, batch_labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Weight the batch loss by the batch size so the epoch figure is a per-sample mean
            total_loss += loss.item() * batch_seqs.size(0)
            
            # Show the current batch loss in the progress bar
            t.set_postfix(loss=loss.item())
    
    epoch_loss = total_loss / len(train_dataset)
    elapsed_time = time.time() - start_time  # wall-clock duration of the epoch
    print(f"Epoch {epoch+1} | Loss: {epoch_loss:.4f} | Time: {elapsed_time:.2f}s")

Epoch 1/3:   0%|          | 0/391 [00:00<?, ?batch/s]