In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import nltk
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK stopwords
nltk.download('stopwords')

# Load and preprocess data
df = pd.read_csv("spam.csv", encoding='latin-1').drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
df['Category'] = df['Category'].map({'spam': 0, 'ham': 1})
X, Y = df['Message'], df['Category'].astype(int)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

# Vectorize text data
vectorizer = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_features, Y_train)

# Evaluate model
train_accuracy = accuracy_score(Y_train, model.predict(X_train_features))
test_accuracy = accuracy_score(Y_test, model.predict(X_test_features))
print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

# Predict sample emails
sample_emails = ["Congratulations! You won a free vacation!", "Meeting tomorrow, 10 AM."]
for email in sample_emails:
    prediction = model.predict(vectorizer.transform([email]))[0]
    print("Spam" if prediction == 0 else "Ham", "Mail")

# Visualize data distribution
plt.bar(['Spam', 'Ham'], [df['Category'].value_counts()[0], df['Category'].value_counts()[1]])
plt.title('Email Type Distribution')
plt.show()

# Plot Confusion Matrix
cm = confusion_matrix(Y_test, model.predict(X_test_features))
sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
probabilities = model.predict_proba(X_test_features)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, probabilities)
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(Y_test, probabilities):.2f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Common words in spam and ham emails
stop_words = set(stopwords.words('english'))
for label, color in zip([0, 1], ['g', 'k']):
    words = " ".join(df[df['Category'] == label]['Message']).split()
    word_freq = Counter(word.lower() for word in words if word.lower() not in stop_words and word.isalpha())
    plt.bar(*zip(*word_freq.most_common(10)), color=color)
    plt.title(f"Top Words in {'Spam' if label == 0 else 'Ham'} Emails")
    plt.xticks(rotation=45)
    plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anuradha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


KeyError: 'Category'