In [1]:
"""
Data Preprocessing Notebook
Detailed preprocessing steps and analysis
"""

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add src to path
# pathlib parents are purely lexical, so `Path().parent` is still "." and
# never reaches the directory above the notebook. Resolve against the real
# working directory instead and register both ./src and ../src, so the
# project modules import whether the kernel starts in the project root or
# in a subfolder of it.
working_dir = Path.cwd()
for src_dir in (working_dir / "src", working_dir.parent / "src"):
    src_entry = str(src_dir)
    if src_entry not in sys.path:
        sys.path.append(src_entry)

from data_preprocessing import SMSPreprocessor
import config

# Initialize preprocessor
preprocessor = SMSPreprocessor()

# Load raw data (read separately from the preprocessor so the untouched
# columns can be inspected and compared against the cleaned output later)
df_raw = pd.read_csv(config.SPAM_DATA_FILE, encoding='latin-1')
print("Raw data shape:", df_raw.shape)
print("Raw data columns:", df_raw.columns.tolist())
print("\nFirst 5 rows:")
# A bare `df_raw.head()` is only rendered when it is the last expression of
# a cell; print it explicitly so the preview appears under its heading.
print(df_raw.head())

# Clean and preprocess
df_clean = preprocessor.load_and_preprocess_data(config.SPAM_DATA_FILE)

# Show the raw text next to its cleaned form for the first five messages
print("\nPreprocessing Comparison:")
print("=" * 50)

sample_messages = df_raw.iloc[:5]
# Only the 'v2' column (the raw message text) is needed, so walk it directly.
for original in sample_messages['v2']:
    cleaned = preprocessor.clean_text(original)
    print(f"\nOriginal: {original}")
    print(f"Cleaned:  {cleaned}")
    print("-" * 30)

# Measure how many characters cleaning removes from each message
# NOTE(review): this assignment aligns on the row index, so it assumes
# df_clean still carries df_raw's index - confirm against the preprocessor.
raw_lengths = df_raw['v2'].str.len()
df_clean['original_length'] = raw_lengths
df_clean['cleaned_length'] = df_clean['cleaned_message'].str.len()
df_clean['length_reduction'] = (
    df_clean['original_length'] - df_clean['cleaned_length']
)

# Compute each mean once and reuse it in the summary lines below.
mean_reduction = df_clean['length_reduction'].mean()
mean_original = df_clean['original_length'].mean()

print("\nPreprocessing Statistics:")
print(f"Average length reduction: {mean_reduction:.2f} characters")
print(f"Percentage reduction: {(mean_reduction / mean_original) * 100:.1f}%")

# Three side-by-side panels summarising the effect of cleaning on length
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatter_ax, hist_ax, box_ax = axes

# Panel 1: cleaned length against original length, with the y = x diagonal
# drawn as a dashed red reference line
longest_original = df_clean['original_length'].max()
scatter_ax.scatter(
    df_clean['original_length'],
    df_clean['cleaned_length'],
    alpha=0.6,
)
scatter_ax.plot(
    [0, longest_original],
    [0, longest_original],
    'r--',
    alpha=0.8,
)
scatter_ax.set(
    xlabel='Original Length',
    ylabel='Cleaned Length',
    title='Original vs Cleaned Message Length',
)

# Panel 2: histogram of the per-message length reduction
hist_ax.hist(df_clean['length_reduction'], bins=50, alpha=0.7, color='skyblue')
hist_ax.set(
    xlabel='Length Reduction (characters)',
    ylabel='Frequency',
    title='Distribution of Length Reduction',
)

# Panel 3: length reduction split by the 'label' column
sns.boxplot(data=df_clean, x='label', y='length_reduction', ax=box_ax)
box_ax.set_title('Length Reduction by Message Type')

plt.tight_layout()
plt.show()

# Most common words analysis
from collections import Counter

def get_top_words(messages, n=20):
    """Return the ``n`` most frequent whitespace-separated words.

    Args:
        messages: Iterable of strings; they are joined with single spaces
            and then split on whitespace to obtain the words.
        n: Number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least common.
    """
    corpus = ' '.join(messages)
    tokens = corpus.split()
    frequencies = Counter(tokens)
    return frequencies.most_common(n)

# Rank the most frequent cleaned words separately for each label
is_ham = df_clean['label'] == 'ham'
is_spam = df_clean['label'] == 'spam'
ham_words = get_top_words(df_clean.loc[is_ham, 'cleaned_message'])
spam_words = get_top_words(df_clean.loc[is_spam, 'cleaned_message'])

# Print the first ten entries of each ranking under its own heading
for heading, ranked_words in (
    ("\nTop 10 words in HAM messages:", ham_words),
    ("\nTop 10 words in SPAM messages:", spam_words),
):
    print(heading)
    for word, count in ranked_words[:10]:
        print(f"{word}: {count}")

# Horizontal bar charts of the 15 most frequent words per label
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

ham_words_df = pd.DataFrame(ham_words[:15], columns=['word', 'count'])
spam_words_df = pd.DataFrame(spam_words[:15], columns=['word', 'count'])

# Each panel: (target axes, data, colour palette, title)
panels = (
    (axes[0], ham_words_df, 'viridis', 'Top 15 Words in Ham Messages'),
    (axes[1], spam_words_df, 'plasma', 'Top 15 Words in Spam Messages'),
)
for panel_ax, words_df, palette_name, panel_title in panels:
    sns.barplot(data=words_df, x='count', y='word', ax=panel_ax, palette=palette_name)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'data_preprocessing'