# Data Preprocessing

## 1. Import Libraries

In [1]:
import re
import pandas as pd 
from tqdm import tqdm
tqdm.pandas()
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'keras.preprocessing.text'

## 2. Load and Inspect Data

In [12]:
# Kaggle-mounted location of the spam / not-spam CSV.
file_path = "/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv"

# Read the CSV into `df`, then preview its leading five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv'

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
print(df.shape)

## 3. Get Class Percentage

In [None]:
# Share of each class in percent, indexed by label value.
# normalize=True makes value_counts() return fractions directly, so the counts
# are computed once and no manual division by the row count is needed.
class_percentages = df["label"].value_counts(normalize=True) * 100

# Label 1 is read as spam and label 0 as ham (not spam).
# .get() falls back to 0.0 when a class is absent instead of raising KeyError.
spam_percentage = class_percentages.get(1, 0.0)
ham_percentage = class_percentages.get(0, 0.0)

# Report both classes; the ham share was previously computed but never shown.
print(f"Percentage of spam emails: {spam_percentage:.2f}%")
print(f"Percentage of ham emails: {ham_percentage:.2f}%")

## 4. Function to Remove a Given RegEx Pattern

In [None]:
def remove_pattern(text, pattern):
    """Delete every match of ``pattern`` from ``text`` and tidy the whitespace.

    Args:
        text: Value to clean. It is coerced with ``str()``, so non-string
            values do not raise.
        pattern: Regular expression handed to ``re.sub``.

    Returns:
        The cleaned string, with each run of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    cleaned_text = re.sub(pattern, "", str(text))
    # split() without an argument splits on any whitespace run and drops empty
    # pieces. The earlier split(" ") / " ".join(...) round trip returned the
    # string unchanged, leaving a double space wherever a match was removed.
    return " ".join(cleaned_text.split())

## 5. Remove Unnecessary Patterns

In [None]:
# Strip the literal NUMBER placeholder token from every email.
# remove_pattern() coerces each cell with str(), so every value in the column
# is a string by the time the cleaners below run.
df["email"] = df["email"].progress_apply(lambda x: remove_pattern(x, "NUMBER"))


def remove_urls(text):
    """Delete http(s):// links and www.-prefixed addresses from ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_underscores(text):
    """Delete every underscore character from ``text``."""
    return text.replace("_", "")


def remove_emails(text):
    """Delete whitespace-delimited tokens that have an ``@`` between other characters."""
    return re.sub(r'\S+@\S+', '', text)


def remove_digits(text):
    """Delete every digit character from ``text``."""
    return re.sub(r'\d', '', text)


# The cleaners were previously only defined and never run, so the column was
# left untouched. Apply them in the order they are declared.
# NOTE(review): if the dataset encodes links as a literal URL placeholder (the
# way it does NUMBER), that token needs remove_pattern(x, "URL") as well —
# confirm against the data.
for cleaner in (remove_urls, remove_underscores, remove_emails, remove_digits):
    df["email"] = df["email"].progress_apply(cleaner)

## 6. Function to Replace Text

In [None]:
def replace_text(source_pattern, destination_pattern, text):
    """Substitute every regex match of ``source_pattern`` found in ``text``.

    The argument order is pattern, replacement, text. ``text`` is passed
    through ``str()`` before matching, and the substituted string is returned.
    """
    return re.sub(source_pattern, destination_pattern, str(text))

## 7. Decontraction of Phrases

In [None]:
# Expand English contractions in the email text.
#
# Two defects are fixed here:
#   * replace_text() takes (source_pattern, destination_pattern, text); the
#     calls used to pass the email first, so the email itself was used as the
#     regular expression.
#   * the suffix rules lacked their apostrophe (r"re", r"s", r"d", ...), which
#     would rewrite ordinary letters inside every word.
#
# Order matters: the irregular forms (won't, can't) must run before the generic
# n't rule. Suffix replacements start with a space so that "don't" becomes
# "do not" rather than "donot".
CONTRACTION_RULES = [
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'ve", " have"),
]

for source_pattern, destination in CONTRACTION_RULES:
    # Defaults bind the current rule to the lambda explicitly.
    df["email"] = df["email"].progress_apply(
        lambda x, src=source_pattern, dst=destination: replace_text(src, dst, x)
    )


In [None]:
# Write function to convert text to lowercase
def convert_to_lowercase(text):
    """Return the result of calling ``lower()`` on ``text``."""
    lowered = text.lower()
    return lowered

# Write function to replace non-alphabets
def replace_non_alphabets(text, replacement=' '):
    """Swap each run of characters outside a-z / A-Z for ``replacement``.

    A run of several consecutive non-letter characters yields a single
    ``replacement`` (one space by default).
    """
    non_letter_run = re.compile(r'[^a-zA-Z]+')
    return non_letter_run.sub(replacement, text)

## 8. Convert Feature Column to Categorical

In [None]:


# Toy single-column frame used to demonstrate the conversion.
# NOTE: this rebinds `data` and `df`, replacing whatever `df` held before.
data = {'Feature': ['Category1', 'Category2', 'Category1', 'Category3', 'Category2']}

# Build the frame and cast 'Feature' to pandas' categorical dtype in one step.
df = pd.DataFrame(data).astype({'Feature': 'category'})

# Show the resulting frame.
print(df)


## 9. Separate Features and Labels

In [None]:


# Toy frame: two numeric feature columns plus a string 'Label' column.
# NOTE: this rebinds `data` and `df`, replacing whatever `df` held before.
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [5, 4, 3, 2, 1],
    'Label': ['A', 'B', 'A', 'B', 'A'],
}
df = pd.DataFrame(data)

# Every column except 'Label' is a feature; 'Label' alone is the target.
features = df.drop(columns='Label')
labels = df['Label']

# Print both pieces, with a blank line ahead of the labels heading.
print("Features:", features, "\nLabels:", labels, sep="\n")


## 10. Perform Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Toy frame: two numeric feature columns plus a string 'Label' column.
# NOTE: this rebinds `data` and `df`, replacing whatever `df` held before.
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [5, 4, 3, 2, 1],
    'Label': ['A', 'B', 'A', 'B', 'A'],
}
df = pd.DataFrame(data)

# X holds the feature columns, y the target column.
X = df.drop(columns='Label')
y = df['Label']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the dimensions of each of the four pieces, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)


## 11. Compute maximum number of words in all emails

In [None]:


# Re-read the raw CSV so this cell does not depend on what `df` currently holds.
# NOTE: text cleaning applied to an earlier `df` is not reflected in this frame.
df = pd.read_csv('/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv')

# The text column is read as 'email', the name the cleaning cells use for this
# same file; the previous 'Emails' key is not used anywhere else in the notebook.
# str() keeps non-string cells from raising, and split() without an argument
# breaks the text on any run of whitespace.
df['Word_Count'] = df['email'].apply(lambda x: len(str(x).split()))

# Longest email, measured in whitespace-separated words.
max_words = df['Word_Count'].max()

print("Maximum number of words in all emails:", max_words)


## 12. Create Keras preprocessing Tokenizer Object

In [None]:
# Fit Tokenizer object on train data
# Encode X_train and X_test using the Tokenizer object
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split

# Re-read the raw CSV so this cell does not depend on what `df` currently holds.
# NOTE: text cleaning applied to an earlier `df` is not reflected in this frame.
df = pd.read_csv('/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv')

# Column names follow the ones the earlier cells read from this same file:
# 'email' for the text and 'label' for the class. The previous 'Text' / 'Label'
# keys are not used anywhere else in the notebook.
# astype(str) mirrors the str() coercion in the cleaning helpers so that a
# non-string cell cannot break the tokenizer — presumably only relevant for
# missing values; verify against the data.
X = df['email'].astype(str).values
y = df['label'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the tokenizer on the training texts only, then encode both splits with it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_encoded = tokenizer.texts_to_sequences(X_train)
X_test_encoded = tokenizer.texts_to_sequences(X_test)

# Example of the encoded sequences
print("Encoded X_train example:")
print(X_train_encoded[0])

# Pad every sequence (appending at the end, padding='post') to the length of
# the longest sequence found in either split.
max_sequence_length = max(len(seq) for seq in X_train_encoded + X_test_encoded)
X_train_padded = pad_sequences(X_train_encoded, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_encoded, maxlen=max_sequence_length, padding='post')

# Example of the padded sequences
print("\nPadded X_train example:")
print(X_train_padded[0])


## 13. Pad Input Sequences using Keras preprocessing pad_sequences function

In [None]:
# Fixed target length for every sequence; tune to the data as needed.
max_sequence_length = 100  # Adjust this based on your specific requirements

# Pad both encoded splits the same way: padding='post' and
# maxlen=max_sequence_length.
X_train_padded, X_test_padded = (
    pad_sequences(encoded, maxlen=max_sequence_length, padding='post')
    for encoded in (X_train_encoded, X_test_encoded)
)

# Report the dimensions of the two padded arrays, one per line.
print(
    f"X_train_padded shape: {X_train_padded.shape}",
    f"X_test_padded shape: {X_test_padded.shape}",
    sep="\n",
)

## 14. Create Character Embedding 

In [None]:
# Imported in this cell because no earlier cell brings these Keras names into
# scope; without them the cell fails with NameError.
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

# Create a Sequential model
model = Sequential()

# Embedding layer: maps each token index to a dense vector of size embedding_dim.
# input_dim is the vocabulary size plus one, leaving room for index 0.
# NOTE(review): the tokenizer this relies on is created with default settings;
# confirm whether word-level or character-level tokens are intended here.
embedding_dim = 50  # Adjust this based on your specific requirements
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length))

# Recurrent layer with 100 units (GRU or another recurrent layer also works)
model.add(LSTM(100))

# Single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

# Modeling

## 15. Define Model Architecture Using Functional API

In [None]:
# Imported in this cell because no earlier cell brings these Keras names into
# scope; without them the cell fails with NameError.
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Model

# Define the model using the Functional API: each layer is called on the
# output of the previous one, starting from an explicit Input tensor.
inputs = Input(shape=(max_sequence_length,))
embedding_dim = 50  # Adjust this based on your specific requirements

# input_dim is the vocabulary size plus one, leaving room for index 0.
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length)(inputs)
# Bidirectional wrapper around an LSTM with 100 units.
bidirectional_lstm = Bidirectional(LSTM(100))(embedding_layer)
# Single sigmoid unit for binary classification.
outputs = Dense(1, activation='sigmoid')(bidirectional_lstm)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

## 16. Define Optimizer (e.g. Adam)

In [None]:
from tensorflow.keras.optimizers import Adam

# Relies on `model` having been built in an earlier cell.

# Adam with an explicit learning rate; tune the value as needed.
optimizer = Adam(learning_rate=0.001)

# Recompile `model` so it uses this optimizer instance, with binary
# cross-entropy as the loss and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer,
)


## 17. Write a Custom Callback Class for the Micro-F1 Score

## 18. Create ModelCheckpoint Callback

## 19. Create Callback for TensorBoard

## 20. Compile the Model

## 21. Train the model using model.fit()

## 22. Plot Performance Curves Using history object

## 23. Plot Model Architecture Using keras.utils plot_model function