In [2]:
# Import the required library
import pandas as pd

# Tab-separated corpora for the training and development (validation) splits
train_file = 'movie_reviews_train.txt'
dev_file = 'movie_reviews_dev.txt'

# Load both splits. header=None makes pandas treat the first line as data, and
# the three columns are named explicitly.
# NOTE(review): read_csv applies its default double-quote handling here; if the
# raw reviews can contain unbalanced '"' characters, confirm the row counts or
# consider quoting=csv.QUOTE_NONE.
train_data = pd.read_csv(train_file, sep='\t', header=None, names=["ID", "Review", "Label"])
dev_data = pd.read_csv(dev_file, sep='\t', header=None, names=["ID", "Review", "Label"])

# Number of reviews in each split
train_num_reviews = len(train_data)
dev_num_reviews = len(dev_data)

# Number of whitespace-delimited tokens in every review
train_token_counts = train_data['Review'].apply(lambda x: len(x.split()))
dev_token_counts = dev_data['Review'].apply(lambda x: len(x.split()))

# Build each whitespace-token vocabulary exactly once; the sizes and the
# overlap below are all derived from these two sets instead of re-joining and
# re-splitting the whole corpus for every statistic.
train_vocab = set(" ".join(train_data['Review']).split())
dev_vocab = set(" ".join(dev_data['Review']).split())

# Vocabulary size (number of distinct tokens) of each split
train_vocab_size = len(train_vocab)
dev_vocab_size = len(dev_vocab)

# Number of distinct tokens that occur in both splits
vocab_overlap = len(train_vocab & dev_vocab)

# Display the results (the trailing tuple is the cell's output)
train_num_reviews, dev_num_reviews, train_token_counts.describe(), dev_token_counts.describe(), train_vocab_size, dev_vocab_size, vocab_overlap


(1600,
 200,
 count    1600.000000
 mean      230.710000
 std       170.682582
 min        16.000000
 25%       127.000000
 50%       175.000000
 75%       283.250000
 max      1300.000000
 Name: Review, dtype: float64,
 count    200.00000
 mean     235.79000
 std      172.64905
 min       36.00000
 25%      130.75000
 50%      178.50000
 75%      289.00000
 max      977.00000
 Name: Review, dtype: float64,
 47638,
 11709,
 7245)

In [3]:
# Recompute the distinct whitespace-delimited tokens of each split by
# collecting the tokens of every review directly into a set.
train_vocab_unique = {token for review in train_data['Review'] for token in review.split()}
dev_vocab_unique = {token for review in dev_data['Review'] for token in review.split()}

# Show the size of the two vocabularies (train, dev)
len(train_vocab_unique), len(dev_vocab_unique)

(47638, 11709)

In [10]:
# Download the NLTK data packages 'punkt' and 'punkt_tab'.
# The return value of the last call is what the cell displays.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/huangjiabao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/huangjiabao/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Download the punkt tokenizer data packages (if not downloaded yet)
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Read a tab-separated review file and tokenize every review
def load_and_tokenize(file_path, tokenizer=None):
    """Load reviews from a tab-separated text file and tokenize them.

    Every non-blank line is split on tab characters and its second field is
    taken as the review text (the first field and any later fields are
    ignored). The text is lower-cased before it is handed to the tokenizer.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.
        tokenizer: Optional callable that receives the lower-cased review
            string and returns its tokens. When omitted, the ``word_tokenize``
            name imported at module level is used, which matches the previous
            behaviour of this function.

    Returns:
        A list with one entry per non-blank line, in file order; each entry
        is whatever the tokenizer returned for that review.

    Raises:
        ValueError: If a non-blank line contains no tab, i.e. it has no
            second field to use as the review text.
    """
    if tokenizer is None:
        # Looked up at call time so the default remains the notebook's import.
        tokenizer = word_tokenize

    tokenized_reviews = []
    with open(file_path, 'r', encoding='utf8') as file:
        # Stream the file line by line instead of materialising it first.
        for line_number, line in enumerate(file, start=1):
            # Blank (whitespace-only) lines carry no review; skip them.
            if not line.strip():
                continue
            # Drop the line terminator so it never ends up inside a field.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                # Fail with the location instead of an opaque IndexError.
                raise ValueError(
                    f"{file_path}: line {line_number} has no tab-separated review field"
                )
            tokenized_reviews.append(tokenizer(fields[1].lower()))
    return tokenized_reviews

# Locations of the two corpora
train_file_path = 'movie_reviews_train.txt'  # change to the path of your training set
dev_file_path = 'movie_reviews_dev.txt'      # change to the path of your validation set

# Load each split as a list of tokenized reviews
train_reviews = load_and_tokenize(train_file_path)
dev_reviews = load_and_tokenize(dev_file_path)

# Flatten the tokenized reviews of each split into its set of distinct tokens
train_vocab = {token for tokens in train_reviews for token in tokens}
dev_vocab = {token for tokens in dev_reviews for token in tokens}

# Tokens that appear in both the training and the validation vocabulary
overlap_vocab = train_vocab & dev_vocab
overlap_size = len(overlap_vocab)

# Print the size of the overlap
print(f"词汇表重叠的大小: {overlap_size}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/huangjiabao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


词汇表重叠的大小: 6132


In [13]:
# Express the vocabulary overlap as a percentage of each split's vocabulary.
# This cell used to be a fully commented-out copy of the tokenisation cell
# plus the percentage step, so re-running the notebook could not reproduce the
# percentages recorded below it. Only the percentage step is kept, as live
# code, and it reuses train_vocab, dev_vocab and overlap_vocab from the NLTK
# tokenisation cell above (run that cell first).
overlap_size = len(overlap_vocab)

# Share of each vocabulary that also occurs in the other split
overlap_percentage_train = (overlap_size / len(train_vocab)) * 100
overlap_percentage_dev = (overlap_size / len(dev_vocab)) * 100

# Print the percentages
print(f"训练集词汇表重叠百分比: {overlap_percentage_train:.2f}%")
print(f"验证集词汇表重叠百分比: {overlap_percentage_dev:.2f}%")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/huangjiabao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/huangjiabao/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


训练集词汇表重叠百分比: 22.57%
验证集词汇表重叠百分比: 75.17%


In [14]:
# Compute the size of the vocabulary overlap.
# NOTE(review): overlap_vocab is not defined in this cell; it must already
# exist in the kernel from an earlier cell.
overlap_size = len(overlap_vocab)

# Print the size of the overlap
print(f"训练集和验证集词汇表重叠的大小: {overlap_size}")


训练集和验证集词汇表重叠的大小: 6132
