# Input preprocessing - Spam Email Detection using Neural Networks

## Library imports

In [1]:
import concurrent.futures
import string

import numpy as np
import pandas as pd
import stanza
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Data loading

In [None]:
# Load the raw dataset and, in the same step, give the id and text columns
# their working names ('Unnamed: 0' -> 'id', 'text' -> 'original_text')
df = pd.read_csv("data/spam_ham_dataset.csv").rename(
    columns={'Unnamed: 0': 'id', 'text': 'original_text'}
)

# Preview the first 5 rows
df.head(5)

Unnamed: 0,id,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


## Preprocessing

### Characters removal

Remove all words that are considered stop words in the NLTK library. These are common words such as "the", "is", and "in" that do not add significant meaning to the text. Standalone punctuation tokens are removed as well. This standardizes the text and makes it easier to analyze.

In [3]:
# Tokens to filter out of the e-mails: NLTK's English stop words followed by
# every character of string.punctuation as its own one-character token
stop_words = stopwords.words('english')
punctuation = [char for char in string.punctuation]

remove = [*stop_words, *punctuation]

In [None]:
def _drop_unwanted_tokens(text):
    """Return `text` without the whitespace-separated tokens listed in `remove`."""
    kept_tokens = (token for token in text.split() if token not in remove)
    return ' '.join(kept_tokens)


# The filtered text is stored under 'lemmatized_text' because the lemmatization
# step reads its input from, and writes its result back to, this same column.
df['lemmatized_text'] = df['original_text'].apply(_drop_unwanted_tokens)

### Lemmatization

Lemmatization is the process of reducing words to their base or root form. For example, "running" becomes "run", and "better" becomes "good". This helps in reducing the dimensionality of the data and improving the performance of the model.

In [5]:
# Build the English Stanza pipeline, requesting the tokenize, lemma and pos processors
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,lemma,pos',
)

2025-04-13 15:31:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 426kB [00:00, 218MB/s]                     
2025-04-13 15:31:37 INFO: Downloaded file to /Users/saito/stanza_resources/resources.json
2025-04-13 15:31:38 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-04-13 15:31:38 INFO: Using device: cpu
2025-04-13 15:31:38 INFO: Loading: tokenize
2025-04-13 15:31:38 INFO: Loading: mwt
2025-04-13 15:31:38 INFO: Loading: pos
2025-04-13 15:31:39 INFO: Loading: lemma
2025-04-13 15:31:39 INFO: Done loading processors!


In [6]:
def apply_lemmatization(nlp, df):
    """Lemmatize the ``lemmatized_text`` column of ``df``.

    Every text in the column is passed to ``nlp`` and the resulting document
    is flattened back into a single string by ``join_comment``. The texts are
    dispatched to a thread pool; ``executor.map`` yields results in input
    order, so the output lines up with the rows of ``df``.

    Args:
        nlp: Callable (e.g. a ``stanza.Pipeline``) that takes a string and
            returns a document accepted by ``join_comment``.
        df: DataFrame holding the texts to process in a ``lemmatized_text``
            column.

    Returns:
        The same ``df`` object that was passed in; its ``lemmatized_text``
        column is overwritten in place with the lemmatized strings.
    """
    def process_text(text):
        return join_comment(nlp(text))

    # Map over the text column directly: only this column is needed, so there
    # is no reason to build a full row Series per record with iterrows().
    # NOTE(review): all worker threads share the single `nlp` object — confirm
    # the pipeline is safe to call concurrently.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        lemmatized_texts = list(executor.map(process_text, df["lemmatized_text"]))

    df["lemmatized_text"] = lemmatized_texts
    return df

def join_comment(doc):
    """Flatten a parsed document into one space-separated string of lemmas.

    Walks every sentence in ``doc.sentences`` and every token in each
    sentence's ``tokens``, taking the ``lemma`` of the token's first word.

    NOTE(review): only ``token.words[0]`` contributes; if a token carries
    more than one word, the lemmas of the remaining words are not included —
    confirm this is intended.
    """
    return " ".join(
        token.words[0].lemma
        for sentence in doc.sentences
        for token in sentence.tokens
    )


# Run the lemmatization over the whole dataset. apply_lemmatization overwrites
# the 'lemmatized_text' column and returns the frame it was given, so new_df
# refers to the same DataFrame object as df.
new_df = apply_lemmatization(nlp, df)

# Display the resulting frame
new_df

Unnamed: 0,id,label,original_text,label_num,lemmatized_text
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,subject : enron methanol meter 988291 follow n...
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,subject : hpl nom january 9 2001 see attach fi...
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,subject : neon retreat ho ho ho around wonderf...
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1,subject : photoshop window office cheap main t...
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0,subject : indian spring deal book teco pvr rev...
...,...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0,subject : put 10 ft transport volume decrease ...
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,subject : 3 4 2000 follow nom hpl take extra 1...
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0,subject : calpine daily gas nomination julie m...
5169,1409,ham,Subject: industrial worksheets for august 2000...,0,subject : industrial worksheet august 2000 act...


## Data splitting

The dataset will be split into training, validation and test sets. The training set will be used to train the model, the validation set will be used to tune the hyperparameters, and the test set will be used to evaluate the final model's performance.

The data will be randomly split (with the seed 42) while maintaining the same distribution of classes in each set, using the stratified sampling option of the `train_test_split` function from `sklearn`.

The training set will contain 70% of the data, the validation set will contain 10% of the data, and the test set will contain 20% of the data.

In [None]:
# Split new_df into 70% train_set and a 30% temporal set (validation + test).
# Stratify on the label so both parts keep the class proportions of new_df.
train_set, temporal_set = train_test_split(
    new_df,
    test_size=0.3,
    random_state=42,
    stratify=new_df['label_num'],
)

# Split temporal_set into 67% test_set and 33% validation_set
# (roughly 20% and 10% of the full dataset), again stratified on the label.
test_set, validation_set = train_test_split(
    temporal_set,
    test_size=0.33,
    random_state=42,
    stratify=temporal_set['label_num'],
)

To make the sets easy to split on DBREPO, we created a new id called `experiment_id`, which identifies each subset by a range of values. For example, the training set has `experiment_id` values from 0 to 3618, the validation set from 3619 to 4131, and the test set from 4132 to 5170. The original id of the dataset is preserved, which makes it easy to track the subsets and their corresponding original ids.

In [8]:
# Assign experiment id
# The experiment id is a new consecutive number for each comment, starting at 0
# in train_set, continuing through validation_set and ending with the last
# comment in test_set.
n_train, n_validation, n_test = len(train_set), len(validation_set), len(test_set)

# .assign returns a new frame, so the column is never written onto a slice
# produced by train_test_split (which can raise SettingWithCopyWarning).
train_set = train_set.assign(experiment_id=np.arange(n_train))
validation_set = validation_set.assign(
    experiment_id=np.arange(n_train, n_train + n_validation)
)
test_set = test_set.assign(
    experiment_id=np.arange(n_train + n_validation, n_train + n_validation + n_test)
)

# Reorder and filter columns (same selection for every subset)
final_columns = ['id', 'experiment_id', 'lemmatized_text', 'label', 'label_num']
final_train_set = train_set[final_columns]
final_validation_set = validation_set[final_columns]
final_test_set = test_set[final_columns]

In [9]:
# Write each subset to its own CSV file, leaving out the DataFrame index
for csv_path, split_frame in (
    ('data/train_set.csv', final_train_set),
    ('data/test_set.csv', final_test_set),
    ('data/validation_set.csv', final_validation_set),
):
    split_frame.to_csv(csv_path, index=False)