# Data Preprocessing

## 1. Import Libraries

In [172]:
import re
import pandas as pd 
from tqdm import tqdm
tqdm.pandas()
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.layers import Input
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from keras.models import Model


## 2. Load and Inspect Data

In [87]:
# Raw string literal: the Windows backslashes are kept as-is. In a normal string
# "\m", "\d" and "\s" are invalid escape sequences, which newer Python versions
# warn about; the resulting path value is unchanged.
file_path = r"D:\main project\data\spam_or_not_spam.csv"
df = pd.read_csv(file_path)
# Preview the first rows of the loaded frame.
print(df.head())

                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


In [98]:
# Show the (rows, columns) dimensions of the loaded frame.
print(df.shape)

(3000, 2)


## 3. Get Class Percentage

In [89]:
# Share of each label as a percentage of all rows; the resulting Series is
# indexed by label value, so [1] is used for spam and [0] for ham.
class_percentages = df["label"].value_counts() * 100 / df.shape[0]
spam_percentage = class_percentages[1]
ham_percentage = class_percentages[0]
print(f"Percentage of spam emails: {spam_percentage:.2f}%")

Percentage of spam emails: 16.67%


## 4. Function to Remove Given RegEx pattern

In [90]:
def remove_pattern(text, pattern):
    """Delete every match of a regular expression from ``text`` and tidy whitespace.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string values are cleaned in their string form.
        pattern: Regular expression whose matches are removed.

    Returns:
        The cleaned string, with runs of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    cleaned_text = re.sub(pattern, "", str(text))
    # split() without an argument discards the empty pieces left where matches
    # were removed. The previous split(" ") + " ".join(...) round-trip returned
    # the string unchanged, so double/leading/trailing spaces survived.
    return " ".join(cleaned_text.split())

## 5. Remove Unnecessary Patterns

In [91]:
# Cleaning passes over the "email" column, applied strictly in this order.
# Each entry is one full pass with its own progress bar.
cleaning_steps = [
    # 1. drop the literal placeholder token NUMBER
    lambda raw: remove_pattern(raw, "NUMBER"),
    # 2. drop URLs (http://, https:// or www. prefixed)
    lambda raw: re.sub(r'https?://\S+|www\.\S+', '', raw),
    # 3. drop underscores
    lambda raw: raw.replace("_", ""),
    # 4. drop e-mail addresses (any non-space run containing "@")
    lambda raw: re.sub(r'\S+@\S+', '', raw),
    # 5. drop digits
    lambda raw: re.sub(r'\d', '', raw),
]

for step in cleaning_steps:
    df["email"] = df["email"].progress_apply(step)



100%|██████████| 3000/3000 [00:00<00:00, 24014.29it/s]
100%|██████████| 3000/3000 [00:00<00:00, 74545.97it/s]
100%|██████████| 3000/3000 [00:00<00:00, 312797.67it/s]
100%|██████████| 3000/3000 [00:00<00:00, 16279.08it/s]
100%|██████████| 3000/3000 [00:00<00:00, 50691.76it/s]


In [92]:
df

Unnamed: 0,email,label
0,date wed aug from chris garrigues cwg d...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the chri...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


## 6. Function to Replace Text

In [68]:
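# NOTE: `text` comes first so that the commented-out calls below,
# replace_text(x, pattern, replacement), pass their arguments in the right order.
#def replace_text(text, source_pattern, destination_pattern):
   # text = text.replace(source_pattern, destination_pattern)
   # return text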

## 7. Decontraction of Phrases

In [69]:
# Write Lambda expression to replace pattern won't --> will not
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"won't", "will not"))

100%|██████████| 3000/3000 [00:00<00:00, 633484.97it/s]


In [82]:
df

Unnamed: 0,email,label
0,date wed aug from chris garrigues cwg d...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the chri...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [36]:

# Write Lambda expression to replace pattern can't --> can not
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"can't", "can not"))


100%|██████████| 3000/3000 [00:00<00:00, 306608.64it/s]


In [37]:
df

Unnamed: 0,email,label
0,can not,0
1,can not,0
2,can not,0
3,can not,0
4,can not,0
...,...,...
2995,can not,1
2996,can not,1
2997,can not,1
2998,can not,1


In [38]:

# Write Lambda expression to replace pattern n't --> not
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"n't", "not"))


100%|██████████| 3000/3000 [00:00<00:00, 577648.26it/s]


In [39]:
df

Unnamed: 0,email,label
0,not,0
1,not,0
2,not,0
3,not,0
4,not,0
...,...,...
2995,not,1
2996,not,1
2997,not,1
2998,not,1


In [40]:

# Write Lambda expression to replace pattern 're --> are
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"re", "are"))


100%|██████████| 3000/3000 [00:00<00:00, 385045.81it/s]


In [41]:
df

Unnamed: 0,email,label
0,are,0
1,are,0
2,are,0
3,are,0
4,are,0
...,...,...
2995,are,1
2996,are,1
2997,are,1
2998,are,1


In [42]:

# Write Lambda expression to replace pattern 's --> is
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"s", "is"))


100%|██████████| 3000/3000 [00:00<00:00, 251995.91it/s]


In [43]:
df

Unnamed: 0,email,label
0,is,0
1,is,0
2,is,0
3,is,0
4,is,0
...,...,...
2995,is,1
2996,is,1
2997,is,1
2998,is,1


In [44]:

# Write Lambda expression to replace pattern 'd --> would
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"d", "would"))


100%|██████████| 3000/3000 [00:00<00:00, 826192.51it/s]


In [45]:
df

Unnamed: 0,email,label
0,would,0
1,would,0
2,would,0
3,would,0
4,would,0
...,...,...
2995,would,1
2996,would,1
2997,would,1
2998,would,1


In [46]:

# Write Lambda expression to replace pattern 'll --> will
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"ll", "will"))


100%|██████████| 3000/3000 [00:00<00:00, 283616.10it/s]


In [47]:
df

Unnamed: 0,email,label
0,will,0
1,will,0
2,will,0
3,will,0
4,will,0
...,...,...
2995,will,1
2996,will,1
2997,will,1
2998,will,1


In [48]:

# Write Lambda expression to replace pattern 've --> have
#df["email"] = df["email"].progress_apply(lambda x: replace_text(x, r"ve", "have"))


100%|██████████| 3000/3000 [00:00<00:00, 207416.34it/s]


In [49]:
df

Unnamed: 0,email,label
0,have,0
1,have,0
2,have,0
3,have,0
4,have,0
...,...,...
2995,have,1
2996,have,1
2997,have,1
2998,have,1


In [60]:
# Write function to convert text to lowercase
#def convert_to_lowercase(text):
    #return text.lower()





In [51]:
df

Unnamed: 0,email,label
0,have,0
1,have,0
2,have,0
3,have,0
4,have,0
...,...,...
2995,have,1
2996,have,1
2997,have,1
2998,have,1


In [52]:

# Write function to replace non-alphabets
#def replace_non_alphabets(text, replacement=' '):
    #return re.sub(r'[^a-zA-Z]+', replacement, text)

In [93]:
df

Unnamed: 0,email,label
0,date wed aug from chris garrigues cwg d...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the chri...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


## 8. Convert Feature Column to Categorical

In [84]:


# Example DataFrame with a feature column


# Convert the 'Feature' column to categorical


# Display the DataFrame



## 9. Separate Features and Labels

In [106]:


# Example DataFrame with features and labels


# Split the frame into the model input (the email text column) and the
# value to predict (the label column).
X, y = df['email'], df['label']




# Display the separated features and labels


## 10. Perform Train-Test Split

In [111]:


# Example DataFrame with features and labels


# Separate features and labels


# Perform train-test split: 25% of the rows are held out for testing, with a
# fixed random_state so the split is reproducible.
# stratify=y keeps the spam/ham proportion the same in both parts; without it
# the minority class can end up over- or under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=104,
    shuffle=True,
    stratify=y,
)



# Display the shapes of the training and testing sets



In [112]:
df

Unnamed: 0,email,label
0,date wed aug from chris garrigues cwg d...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the chri...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


## 11. Compute maximum number of words in all emails

In [118]:


# Split every email in df["email"] into words and find the largest word count.
#
# str.split() with no pattern splits on any run of whitespace and ignores
# leading/trailing whitespace. Splitting on the regex "\s+" instead yields an
# empty first/last token whenever the text starts or ends with a space, which
# inflates the count.
# Despite its name, length_of_the_messages holds the per-email word lists;
# the lengths are taken on the next line.
length_of_the_messages = df["email"].str.split()

# Maximum number of words found in any single email.
max_words = length_of_the_messages.str.len().max()


## 12. Create Keras preprocessing Tokenizer Object

In [152]:
# Create the Tokenizer, fit it on the training data, then encode X_train and
# X_test as sequences of integer word ids using the fitted vocabulary.
#
# NOTE(review): num_words caps the vocabulary size, whereas max_words is the
# longest email's word count -- confirm this is the intended cap.
keras_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=max_words)

# Fit on the training split only. Fitting on the full df["email"] column lets
# word statistics from the test emails leak into preprocessing.
keras_tokenizer.fit_on_texts(X_train)

X_train_sequence = keras_tokenizer.texts_to_sequences(X_train)
X_test_sequence = keras_tokenizer.texts_to_sequences(X_test)

## 13. Pad Input Sequences using Keras preprocessing pad_sequences function

In [182]:
# Bring every encoded email to the same length so the result is a rectangular
# array: sequences are padded at the end ('post') up to 10000 entries.
# Both splits must use identical settings, so the options are defined once.
padding_options = {"maxlen": 10000, "padding": 'post'}

train_pad_sequence = pad_sequences(X_train_sequence, **padding_options)
test_pad_sequence = pad_sequences(X_test_sequence, **padding_options)






In [183]:
train_pad_sequence.shape

(2250, 10000)

## 14. Create Character Embedding 

In [184]:
#Create a Sequential model

# Add an Embedding layer for character embeddings

# Add an LSTM layer (you can use GRU or other recurrent layers as well)


# Add a Dense layer for classification (adjust units based on your specific task)


# Compile the model


# Display the model summary


# Modeling

## 15. Define Model Architecture Using Functional API

In [186]:
# Define the network with the Functional API: padded sequence -> hidden layer
# -> single probability output.

# The input width is read from the padded training matrix so the model and the
# padding step cannot drift apart.
input_layer = Input(shape=(train_pad_sequence.shape[1],))

# ReLU gives the hidden layer a non-linearity; two stacked Dense layers with
# no activation collapse into a single linear map.
dense_layer1 = Dense(16, activation="relu")(input_layer)

# One sigmoid unit: the labels are a single 0/1 column and the model is trained
# with BinaryCrossentropy() (from_logits=False), which expects one probability
# per sample rather than two unbounded values.
output_layer = Dense(1, activation="sigmoid")(dense_layer1)

## 16. Define Optimizer (e.g. Adam)

In [188]:


# Adam optimizer with an explicit learning rate of 0.001; it is handed to
# model.compile() in the compile step.
optimizer_adam = Adam(learning_rate=0.001)

## 17. Write class for custom callback for micro-f1 score

## 18. Create ModelCheckpoint Callback

## 19. Create Callback for TensorBoard

## 20. Compile the Model

In [189]:
# Wrap the layer graph in a Model, then configure training: the Adam optimizer
# defined earlier, binary cross-entropy as the loss, accuracy as the metric.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer=optimizer_adam,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

## 21. Train the model using model.fit()

In [190]:
# Train for 50 epochs in mini-batches of 32 on the padded training sequences.
# The returned History object is kept so the per-epoch loss and accuracy can be
# plotted afterwards, instead of being discarded as the cell's output.
history = model.fit(
    x=train_pad_sequence,
    y=y_train,
    batch_size=32,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x14cac92d950>

## 22. Plot Performance Curves Using history object

## 23. Plot Model Architecture Using keras.utils plot_model function