In [1]:
"""
Data Exploration Notebook
Explore the SMS Spam Collection dataset
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import sys
from pathlib import Path

# Add the project's src directory to the import path.
# `Path()` is the relative path ".", and the parent of "." is still ".", so
# `Path().parent / "src"` only ever meant ./src and never reached the parent
# directory's src/. Anchor on the absolute working directory instead, and keep
# ./src as a candidate so launching from the project root keeps working.
_cwd = Path.cwd()
for _src_dir in (_cwd / "src", _cwd.parent / "src"):
    if str(_src_dir) not in sys.path:
        sys.path.append(str(_src_dir))

from data_preprocessing import SMSPreprocessor
from utils import calculate_message_stats
import config

# Load data via the project's SMSPreprocessor, using the file configured in
# config.SPAM_DATA_FILE
preprocessor = SMSPreprocessor()
df = preprocessor.load_and_preprocess_data(config.SPAM_DATA_FILE)

# Display basic info
print("Dataset Shape:", df.shape)
print("\nColumn Names:", df.columns.tolist())
print("\nFirst 5 rows:")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell (and never in a plain script), so print it explicitly to make
# the preview announced above actually appear.
print(df.head())

# Summary statistics computed by the project helper, printed one
# "name: value" entry per line
stats = calculate_message_stats(df)
print("\nDataset Statistics:")
for stat_name in stats:
    print(f"{stat_name}: {stats[stat_name]}")

# Visualizations: a 2x2 grid of overview plots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
(pie_ax, length_hist_ax), (word_box_ax, char_box_ax) = axes

# Derived per-message features used by the plots below
df['message_length'] = df['message'].str.len()
df['word_count'] = df['message'].str.split().str.len()

# 1. Label distribution (share of ham vs. spam messages)
class_counts = [stats['ham_count'], stats['spam_count']]
pie_ax.pie(
    class_counts,
    labels=['Ham', 'Spam'],
    autopct='%1.1f%%',
    colors=['lightgreen', 'lightcoral'],
)
pie_ax.set_title('Message Distribution')

# 2. Message length distribution, split by label
sns.histplot(data=df, x='message_length', hue='label', bins=50, ax=length_hist_ax)
length_hist_ax.set_title('Message Length Distribution')

# 3. Word count distribution per label
sns.boxplot(data=df, x='label', y='word_count', ax=word_box_ax)
word_box_ax.set_title('Word Count by Label')

# 4. Character count distribution per label
sns.boxplot(data=df, x='label', y='message_length', ax=char_box_ax)
char_box_ax.set_title('Character Count by Label')

plt.tight_layout()
plt.show()

# Word clouds: one panel per label, built from the cleaned message text
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Both clouds share the same rendering options
cloud_options = dict(width=400, height=400, background_color='white')

# Concatenate every cleaned message of a label into a single text
ham_text = ' '.join(df.loc[df['label'] == 'ham', 'cleaned_message'])
ham_wordcloud = WordCloud(**cloud_options).generate(ham_text)

spam_text = ' '.join(df.loc[df['label'] == 'spam', 'cleaned_message'])
spam_wordcloud = WordCloud(**cloud_options).generate(spam_text)

# Draw ham on the left panel and spam on the right, without axis ticks
panels = zip(
    axes,
    (ham_wordcloud, spam_wordcloud),
    ('Ham Messages Word Cloud', 'Spam Messages Word Cloud'),
)
for panel_ax, cloud, panel_title in panels:
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.axis('off')
    panel_ax.set_title(panel_title, fontsize=16)

plt.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'data_preprocessing'