# 垃圾邮件分类数据探索

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the raw spam dataset.
raw_data = pd.read_csv('../data/raw/spam_dataset.csv')

# Basic dataset statistics.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
raw_data.info()
# Relative frequency of each value in the 'label' column.
print(raw_data['label'].value_counts(normalize=True))

In [None]:
# Text-length analysis: store the length of each 'text' value in a new
# 'text_length' column on raw_data, then compare its distribution per label.
text_lengths = raw_data['text'].str.len()
raw_data['text_length'] = text_lengths

# One box per 'label' value, drawn on an explicitly created 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=raw_data, x='label', y='text_length', ax=ax)
ax.set_title('文本长度分布')
plt.show()

In [None]:
# Word-frequency analysis: build a bag-of-words vocabulary capped at 100
# features, total each word's count over the corpus, and plot the top 20.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=100)
word_matrix = vectorizer.fit_transform(raw_data['text'])
# Summing a scipy sparse matrix over axis 0 yields a 2-D (1, n_features)
# np.matrix, and np.matrix.flatten() keeps it 2-D, which pandas rejects as a
# DataFrame column. Convert to a plain ndarray and ravel to get one 1-D
# total count per vocabulary word.
word_freq = np.asarray(word_matrix.sum(axis=0)).ravel()

# Pair each vocabulary word with its total count and keep the 20 largest.
top_words = pd.DataFrame({
    'word': vectorizer.get_feature_names_out(),
    'frequency': word_freq,
}).sort_values('frequency', ascending=False).head(20)

plt.figure(figsize=(12, 6))
sns.barplot(x='word', y='frequency', data=top_words)
plt.title('高频词分析')
# Rotate the word labels so they do not overlap on the x axis.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Train/test split: hold out 20% of the rows, stratified on 'label';
# random_state fixes the shuffle so the split is reproducible.
from pathlib import Path

train_data, test_data = train_test_split(
    raw_data, test_size=0.2, stratify=raw_data['label'], random_state=42
)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing the two split files.
splits_dir = Path('../data/splits')
splits_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): every column currently on raw_data is written out, including
# any derived columns added earlier in the notebook — confirm that is intended.
train_data.to_csv(splits_dir / 'train.csv', index=False)
test_data.to_csv(splits_dir / 'test.csv', index=False)

# Report the number of rows in each split.
print(f'训练集大小: {len(train_data)}')
print(f'测试集大小: {len(test_data)}')

In [None]:
# Preliminary feature-engineering exploration: fit a TF-IDF vectorizer capped
# at 5000 features on the 'text' column and save the resulting feature names.
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
# NOTE(review): the vectorizer is fitted on all of raw_data, so rows that end
# up in the test split also shape the vocabulary and IDF weights. If these
# features feed a model evaluated on that split, fit on the training rows
# only to avoid leakage — confirm the intended use.
X = tfidf.fit_transform(raw_data['text'])
feature_names = tfidf.get_feature_names_out()

# Save the feature names, creating the output folder first because to_csv
# does not create missing directories.
features_path = Path('../data/processed/tfidf_features.csv')
features_path.parent.mkdir(parents=True, exist_ok=True)
pd.Series(feature_names).to_csv(features_path, index=False)