<a href="https://colab.research.google.com/github/Cakethehacker/Ai_email_filter/blob/main/Ai_WorkShop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **AI-Powered Content Classification and Sentiment Analysis.**

This notebook is a step-by-step guide to building an email spam classifier: the messages are cleaned, converted to TF-IDF features, and used to train a neural network on an email spam dataset. By the end of this notebook, you'll have a tool that helps you filter out unwanted email messages and makes your email experience smoother and safer. 🙂

In [None]:
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from google.colab import files
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential


In [None]:
# Upload dataset through the Colab file picker. The first key of the mapping
# returned by files.upload() is passed to read_csv as the path of the dataset.
uploaded = files.upload()
if not uploaded:
    # Without this guard a cancelled/empty upload surfaces as a bare StopIteration.
    raise ValueError("No file was uploaded; upload the dataset CSV and re-run this cell.")
dataset_filename = next(iter(uploaded))
df = pd.read_csv(dataset_filename, encoding='latin-1')  # or 'cp1252', 'iso-8859-1', etc.

In [None]:
# Display basic dataset info
df.info()  # info() prints its own report and returns None, so wrapping it in print() adds a stray "None"
print(df.head())
print(df.describe())  # only a cell's last expression is echoed, so this must be printed explicitly
df.shape

In [None]:
# Text Preprocessing
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Lower-case ``text`` and strip every ASCII punctuation character.

    Args:
        text: Value to normalise. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        str: the lower-cased text with all characters from
        ``string.punctuation`` removed, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # str.translate removes the characters literally. The earlier regex class
    # f"[{string.punctuation}]" was built without escaping, so the backslash in
    # string.punctuation escaped the following "]" instead of being matched and
    # backslashes were left in the text.
    return text.lower().translate(_PUNCTUATION_TABLE)

# Discard every row that has a missing value in any column, so the string
# concatenation below never sees NaN.
df.dropna(inplace=True)
# Prefix each body with its title, then normalise the combined string.
# NOTE: this overwrites df['text'], so re-running the cell prepends the title again.
title_and_body = df['title'] + ' ' + df['text']
df['text'] = title_and_body.map(clean_text)

In [None]:
# Convert labels to numerical values: 1 where `type` is exactly 'spam',
# 0 for every other value (the comparison is case-sensitive).
is_spam = df['type'] == 'spam'
df['label'] = is_spam.astype('int64')

In [None]:
# Split dataset: hold out 20% of the rows for evaluation. Stratifying on the
# label keeps the spam / not-spam ratio the same in both partitions.
split_parts = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Visualization: Spam vs. Not Spam Distribution
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df['label'], ax=ax)
# Labels are set after plotting so they replace seaborn's defaults.
ax.set_title('Spam vs Not Spam Distribution')
ax.set_xlabel('Label (0 = Not Spam, 1 = Spam)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
# The vectorizer is fitted on the training split only; the held-out split is
# transformed with that already-fitted vectorizer.
train_sparse = vectorizer.fit_transform(X_train)
test_sparse = vectorizer.transform(X_test)
# Densify both matrices for the cells that follow.
X_train_tfidf = train_sparse.toarray()
X_test_tfidf = test_sparse.toarray()

In [None]:
# Handle Class Imbalance with SMOTE
# Only the training features/labels are resampled; the test split is left untouched.
# NOTE: the training arrays are rebound in place, so this cell consumes its own output on re-run.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_tfidf, y_train)
X_train_tfidf, y_train = resampled

In [None]:
# Get most common words for spam and real emails
# The two corpora were never defined in the notebook, so this cell raised
# NameError on a fresh kernel; derive them from the cleaned text by label.
spam_emails = df.loc[df['label'] == 1, 'text']
real_emails = df.loc[df['label'] == 0, 'text']

# Flatten each corpus into one whitespace-separated token list.
spam_words = ' '.join(spam_emails).split()
real_words = ' '.join(real_emails).split()

# Keep only the 20 most frequent tokens (most frequent first), dropping the counts.
spam_common = [word for word, _ in Counter(spam_words).most_common(20)]
real_common = [word for word, _ in Counter(real_words).most_common(20)]

In [None]:
# Build Improved Neural Network Model
# Three ReLU hidden layers (512 -> 256 -> 128) with dropout after each, batch
# normalisation after the first two, and a single sigmoid output unit.
n_features = X_train_tfidf.shape[1]

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(n_features,)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Train model and save each epoch's model
best_val_accuracy = 0.0
best_model_path = "best_epoch_model.h5"

# Every fit() call below runs a single epoch and returns a fresh History, so
# `history` on its own would only ever describe the last epoch. The per-epoch
# values are collected here and written back to `history.history` after the
# loop so that later cells can plot the whole run.
metrics_by_epoch = {}

# NOTE(review): the test split doubles as validation data, so choosing the
# "best" epoch by val_accuracy is informed by the test set.
# NOTE: after the loop `model` holds the weights of the final epoch; the best
# epoch is only available from `best_model_path`.
for epoch in range(1, 41):
    history = model.fit(X_train_tfidf, y_train, epochs=1, batch_size=32, validation_data=(X_test_tfidf, y_test), verbose=1)
    for metric_name, values in history.history.items():
        metrics_by_epoch.setdefault(metric_name, []).extend(values)

    model_path = f"epoch_model_{epoch}.h5"
    model.save(model_path)
    print(f"Saved {model_path}")

    val_accuracy = history.history['val_accuracy'][0]
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        model.save(best_model_path)
        print(f"Best model updated: {best_model_path}")

# Expose all 40 epochs through the usual `history.history[...]` lookups.
history.history = metrics_by_epoch

In [None]:
# Predicted spam probabilities for the test split. The result was previously
# bound to `_pred_probs` while every later line reads `y_pred_probs`, which
# raised NameError. ravel() flattens the model output to 1-D so it lines up
# with y_test.
y_pred_probs = model.predict(X_test_tfidf).ravel()
y_pred = (y_pred_probs > 0.5).astype('int32')  # 0.5 decision threshold: label as spam when p > 0.5

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(classification_report(y_test, y_pred))


In [None]:
# Visualization: Training History
# Plots whatever accuracy values `history.history` holds, one point per recorded epoch.
fig, ax = plt.subplots(figsize=(10, 5))
accuracy_series = (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for metric_key, legend_label in accuracy_series:
    ax.plot(history.history[metric_key], label=legend_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Over Epochs')
ax.legend()
plt.show()


In [None]:
# Visualization: Training History (loss)
# This cell used to be an exact copy of the accuracy plot above it; it now
# shows the loss curves that the same History object records alongside accuracy.
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Model Loss Over Epochs')
plt.legend()
plt.show()


In [None]:
# Visualization: Confusion Matrix
# Rows are the actual classes and columns the predicted classes, both ordered 0 (Not Spam) then 1 (Spam).
class_names = ['Not Spam', 'Spam']
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Visualization: Precision-Recall Curve
# The curve arrays get their own names so the scalar `precision` / `recall`
# scores computed in the evaluation cell are not silently replaced by arrays.
pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_probs)
plt.figure(figsize=(6, 4))
plt.plot(pr_recall, pr_precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
# Visualization: ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:
def predict_from_csv():
    """Classify e-mails from an uploaded CSV and write the results to disk.

    Prompts for a file through the Colab upload widget, reads it as a CSV with
    ``title`` and ``text`` columns, scores every row with the notebook-level
    ``vectorizer`` and ``model``, then writes the rows predicted as not-spam to
    ``primary_mail.csv`` and the rows predicted as spam to ``junk_mail.csv``.

    Returns:
        pandas.DataFrame: the uploaded rows, with ``text`` replaced by
        ``"<title> <text>"`` and an added ``prediction`` column holding
        ``'Spam'`` or ``'Not Spam'``.

    Raises:
        ValueError: if no file was uploaded.
        KeyError: if the CSV lacks a ``title`` or ``text`` column.
    """
    print("Upload a CSV file with 'title' and 'text' columns for prediction.")
    uploaded_file = files.upload()
    if not uploaded_file:
        # Otherwise an empty upload surfaces as a bare StopIteration below.
        raise ValueError("No file was uploaded; nothing to predict.")
    test_df = pd.read_csv(next(iter(uploaded_file)), encoding='latin-1')

    # Coerce both columns to strings so they can be concatenated. Missing cells
    # are blanked first; astype(str) alone would turn them into the token 'nan'.
    test_df['title'] = test_df['title'].fillna('').astype(str)
    test_df['text'] = test_df['text'].fillna('').astype(str)

    test_df['text'] = test_df['title'] + ' ' + test_df['text']  # Combine title and text

    # Apply the same normalisation the training text went through before
    # vectorizing; the readable combined text stays in the output column.
    model_input = test_df['text'].apply(clean_text)
    test_tfidf = vectorizer.transform(model_input).toarray()

    # Flatten the model output to one value per row before using it as a column.
    spam_flags = (model.predict(test_tfidf) > 0.5).ravel()
    test_df['prediction'] = np.where(spam_flags, 'Spam', 'Not Spam')

    test_df[test_df['prediction'] == 'Not Spam'].to_csv('primary_mail.csv', index=False, encoding='utf-8')
    test_df[test_df['prediction'] == 'Spam'].to_csv('junk_mail.csv', index=False, encoding='utf-8')

    print("Prediction results saved to 'primary_mail.csv' and 'junk_mail.csv'.")
    print(test_df[['title', 'text', 'prediction']])
    return test_df

In [None]:
# Run prediction on uploaded CSV: prompts for an upload, writes
# primary_mail.csv / junk_mail.csv, and keeps the labelled frame in
# `predicted_df` for further inspection.
predicted_df = predict_from_csv()
