#### Detecting spam emails using the Tensorflow which is typically used for building neural network models, including deep learning models like Convolutional Neural Networks (CNNs) or Recurrent Neural Networks (RNNs). These models can be used for text classification tasks such as spam detection.

In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
df=pd.read_csv("spam emails dataset.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Understand the structure of the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [20]:
# Checking for missing values
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [21]:
# Explore the distribution of classes
print(df['Category'].value_counts())

Category
ham     4825
spam     747
Name: count, dtype: int64


#### handling the imbalanced class

In [22]:
df_spam = df[df['Category'] == 'spam']
df_non_spam = df[df['Category'] == 'ham']

# Upsample the minority class (spam) to address class imbalance
df_spam_upsampled = resample(df_spam, replace=True, n_samples=len(df_non_spam), random_state=42)
df_upsampled = pd.concat([df_non_spam, df_spam_upsampled])

In [23]:
# Checking if df_upsampled is created properly
print(df_upsampled['Category'].value_counts())

Category
ham     4825
spam    4825
Name: count, dtype: int64


In [24]:
df_upsampled

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...
...,...,...
4248,spam,Text PASS to 69669 to collect your polyphonic ...
3675,spam,You have won a Nokia 7250i. This is what you g...
3620,spam,8007 25p 4 Alfie Moon's Children in Need song ...
3501,spam,Dorothy@kiefer.com (Bank of Granite issues Str...


### Preprocessing

In [31]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [32]:
# Preprocess the text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords (assuming you have downloaded NLTK stopwords)
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

df_upsampled['Processed_Message'] = df_upsampled['Message'].apply(preprocess_text)


### splitting the training and test sets

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df_upsampled['Processed_Message'], df_upsampled['Category'], test_size=0.2, random_state=42)

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [39]:
# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_seq])
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

### defining the neural network

In [40]:
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64, input_length=max_length),
    LSTM(64),
    Dense(1, activation='sigmoid')
])



In [41]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### As we can see our target varibale is in string so we need to convert it in binary for training our model

In [53]:
print(y_train.unique())
print(y_test.unique())

['ham' 'spam']
['ham' 'spam']


In [45]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Check the encoded labels
print(y_train_encoded)
print(y_test_encoded)

[0 1 0 ... 1 0 1]
[0 0 0 ... 1 1 0]


In [49]:
## Train the model
history = model.fit(X_train_padded, y_train_encoded, epochs=10, validation_data=(X_test_padded, y_test_encoded))


Epoch 1/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.6681 - loss: 0.6168 - val_accuracy: 0.6415 - val_loss: 0.6297
Epoch 2/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.6174 - loss: 0.6507 - val_accuracy: 0.6021 - val_loss: 0.6536
Epoch 3/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.5815 - loss: 0.6719 - val_accuracy: 0.5565 - val_loss: 0.6770
Epoch 4/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.5421 - loss: 0.6872 - val_accuracy: 0.5249 - val_loss: 0.6884
Epoch 5/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.5128 - loss: 0.6921 - val_accuracy: 0.4927 - val_loss: 0.6941
Epoch 6/10
[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.5014 - loss: 0.6935 - val_accuracy: 0.5212 - val_loss: 0.6859
Epoch 7/10
[1m242/242

In [58]:
# Print the model summary
print(new_model.summary())

None


In [48]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_encoded)
print("Test Accuracy:", accuracy)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7064 - loss: 0.5733
Test Accuracy: 0.7119171023368835
