In [2]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import layers, models

# read data 

data = pd.read_csv('email.csv')  

#Data processing

data['label'] = data['label'].replace(
    to_replace=['ham', 'spam'], 
    value=[0, 1])

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Lowercase and strip
    text = text.lower().strip()
    return text

# Apply preprocessing
data['text'] = data['text'].apply(preprocess_text)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Convert sparse matrices to dense and ensure they are of type float32
X_train_tfidf_dense = X_train_tfidf.toarray().astype('float32')
X_test_tfidf_dense = X_test_tfidf.toarray().astype('float32')

# Ensure labels are of type int32
y_train = y_train.astype('int32')
y_test = y_test.astype('int32')

  data['label'] = data['label'].replace(


In [3]:
# Model definition
model = models.Sequential([
    layers.Dense(128, input_shape=(6829,), activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history=model.fit(X_train_tfidf_dense, y_train, epochs=10, batch_size=32, validation_split=0.2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8542 - loss: 0.5134 - val_accuracy: 0.8700 - val_loss: 0.2049
Epoch 2/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9354 - loss: 0.1386 - val_accuracy: 0.9787 - val_loss: 0.0738
Epoch 3/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9934 - loss: 0.0311 - val_accuracy: 0.9798 - val_loss: 0.0742
Epoch 4/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9968 - loss: 0.0130 - val_accuracy: 0.9821 - val_loss: 0.0626
Epoch 5/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9973 - loss: 0.0084 - val_accuracy: 0.9809 - val_loss: 0.0772
Epoch 6/10
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9993 - loss: 0.0039 - val_accuracy: 0.9821 - val_loss: 0.0810
Epoch 7/10
[1m112/112[0m 

In [4]:
# Evaluate the model
train_loss, train_accuracy = model.evaluate(X_train_tfidf_dense, y_train)
print(f"Train Accuracy: {train_accuracy:.2f}")

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0020    
Train Accuracy: 1.00


In [5]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_tfidf_dense, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9845 - loss: 0.1085
Test Accuracy: 0.99


In [6]:
#Data Testing
def predict_phishing(email_text):
    processed_text = preprocess_text(email_text)
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)
    return "Phishing" if prediction[0] > 0.5 else "Normal"

# Example usage
email = "Get me out of this dump heap. My mom decided to come to lowes. BORING."
print(predict_phishing(email))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
Normal
