# 垃圾邮件数据探索分析

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the raw e-mail dataset and print its shape and the mean of the label column
raw_csv_path = '../data/raw/emails.csv'
df = pd.read_csv(raw_csv_path)
print(f'数据集大小: {df.shape}')
# The mean of "label" is reported as the spam share (assumes a numeric 0/1 label — TODO confirm)
spam_share = df["label"].mean()
print(f'垃圾邮件比例: {spam_share:.2%}')

In [None]:
# Text length analysis: store the character count of every e-mail body,
# then compare its distribution per label with a box plot
text_lengths = df['text'].str.len()
df['text_length'] = text_lengths

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='label', y='text_length', ax=ax)
ax.set_title('垃圾邮件与正常邮件文本长度分布')
plt.show()

In [None]:
# Feature engineering
def extract_features(text):
    """Derive simple hand-crafted indicators from one e-mail body.

    Args:
        text: The e-mail body. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as an empty string.

    Returns:
        A dict with four entries:
            has_url: 1 if "http" occurs anywhere (case-insensitive), else 0.
            has_dollar: 1 if "$" occurs anywhere, else 0.
            uppercase_ratio: uppercase characters divided by total
                characters; 0.0 when the text is empty.
            punctuation_count: number of '!', '?' and '.' characters.
    """
    # Missing bodies arrive as float NaN (or None); they have no .lower()
    # and no length, so normalise them to an empty string up front.
    if not isinstance(text, str):
        text = ''

    total_chars = len(text)
    uppercase_chars = sum(1 for c in text if c.isupper())

    return {
        'has_url': int('http' in text.lower()),
        'has_dollar': int('$' in text),
        # Guard the division: an empty body has no characters to divide by.
        'uppercase_ratio': uppercase_chars / total_chars if total_chars else 0.0,
        'punctuation_count': sum(1 for c in text if c in '!?.')
    }

# Run extract_features on every e-mail body, expand the resulting dicts into
# columns, and append those columns to the original frame
feature_dicts = df['text'].apply(extract_features)
features_df = feature_dicts.apply(pd.Series)
df_with_features = pd.concat([df, features_df], axis=1)

# Feature correlation analysis: annotated heatmap of the pairwise correlation
# between the label and each engineered feature
plt.figure(figsize=(10, 8))
correlation_columns = [
    'label',
    'has_url',
    'has_dollar',
    'uppercase_ratio',
    'punctuation_count',
]
correlation_matrix = df_with_features[correlation_columns].corr()
sns.heatmap(correlation_matrix, annot=True)
plt.title('特征与垃圾邮件的相关性')
plt.show()

In [None]:
# Dataset split: every column except the label and the raw text is used as a
# feature; the label column is the target
import os

X = df_with_features.drop(['label', 'text'], axis=1)
y = df_with_features['label']

# 80/20 split, stratified on the label, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Save the splits. to_csv does not create missing parent directories, so make
# sure the output directory exists before writing.
splits_dir = '../data/splits'
os.makedirs(splits_dir, exist_ok=True)

X_train.to_csv(f'{splits_dir}/X_train.csv', index=False)
X_test.to_csv(f'{splits_dir}/X_test.csv', index=False)
y_train.to_csv(f'{splits_dir}/y_train.csv', index=False)
y_test.to_csv(f'{splits_dir}/y_test.csv', index=False)

print('数据集划分完成')