1. Import Libraries

In [None]:
import os
import re
import string

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

2. Load the Dataset

In [58]:
# Read the raw spam/ham message dataset from disk
data = pd.read_csv(filepath_or_buffer="../data/raw/spam.csv")

# Preview the first five rows
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


3. Drop Unnecessary Columns

In [59]:
# Restrict the frame to the label column and the message text
data = data.loc[:, ['Category', 'Message']]

4. Handle Missing or Duplicate Data

In [60]:
# Discard exact duplicate rows, keeping the first occurrence of each
data = data.drop_duplicates(subset=None, keep='first')

In [61]:
# Drop every row containing a missing value and report how many rows were removed
missing_before = len(data)
data = data.dropna(axis=0, how='any')
missing_after = len(data)
print(f"Dropped {missing_before - missing_after} rows due to missing values.")

Dropped 0 rows due to missing values.


5. Encode Target Category

In [62]:
# Encode the target in a new "Label_num" column: 'ham' -> 0, 'spam' -> 1
label_map = {'ham': 0, 'spam': 1}
encoded_labels = data['Category'].map(label_map)

# Series.map yields NaN for any value that is not a key of the dict, which would
# silently produce float labels with gaps; fail loudly instead.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = data.loc[unmapped, 'Category'].unique().tolist()
    raise ValueError(f"Unexpected Category values (expected 'ham' or 'spam'): {unexpected}")

data['Label_num'] = encoded_labels

# Category is now redundant with Label_num, so remove it
data = data.drop('Category', axis=1)
data.head()


Unnamed: 0,Message,Label_num
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [63]:
# Target vector: the numeric labels (the feature matrix is built later from the text)
y = data.loc[:, 'Label_num']

6. Text Cleaning

In [64]:
# Start the cleaned-text column from a lower-cased copy of the raw message
data = data.assign(Cleaned_message=data["Message"].str.lower())

In [65]:
# Strip every character listed in string.punctuation from the cleaned text.
# The translation table is built once and reused for all rows instead of being
# rebuilt inside a per-row lambda.
punctuation_table = str.maketrans('', '', string.punctuation)
data['Cleaned_message'] = data["Cleaned_message"].str.translate(punctuation_table)

In [66]:
def _strip_digits(text):
    """Return text with every run of digits deleted."""
    return re.sub(r'\d+', '', text)


# Remove numbers from each cleaned message
data["Cleaned_message"] = data["Cleaned_message"].apply(_strip_digits)

In [67]:
def _collapse_whitespace(text):
    """Return text with whitespace runs reduced to single spaces and the ends trimmed."""
    words = text.split()
    return ' '.join(words)


# Normalise spacing in each cleaned message
data['Cleaned_message'] = data['Cleaned_message'].apply(_collapse_whitespace)


7. Feature Extraction

In [68]:
# Turn the cleaned text into a numeric TF-IDF matrix.
# The vocabulary is limited to 3000 terms and English stop words are ignored.
# NOTE(review): the vectorizer is fitted on every row before the train/test split
# in the next section, so its vocabulary and IDF weights also see the held-out
# messages — confirm this is acceptable for how the features are evaluated.
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
tfidf_matrix = tfidf.fit_transform(data['Cleaned_message'])
x = tfidf_matrix.toarray()  # dense array, as used by the split and CSV export cells



8. Split the Dataset

In [69]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the shape of each feature matrix
for caption, features in (("Training Data Shape: ", x_train), ("Testing Data Shape: ", x_test)):
    print(caption, features.shape)

Training Data Shape:  (4125, 3000)
Testing Data Shape:  (1032, 3000)


9. Save Preprocessed Data

In [70]:
# Persist the fitted TF-IDF vectorizer so it can be reloaded later without refitting.
# Create the output folder first: joblib.dump does not create missing parent
# directories, so on a fresh checkout the write would otherwise fail.
os.makedirs('../data/processed', exist_ok=True)
joblib.dump(tfidf, '../data/processed/tfidf_vectorizer.joblib')

['../data/processed/tfidf_vectorizer.joblib']

In [71]:
# Assemble the TF-IDF features (one column per vocabulary term) plus the label
# column, then write the table to CSV without the index.
feature_names = tfidf.get_feature_names_out()
processed_data = pd.DataFrame(data=x, columns=feature_names)
# y still carries the original row index (it is never reset), so assign by position
processed_data['Label_num'] = y.to_numpy()
processed_data.to_csv(path_or_buf='../data/processed/spam_processed.csv', index=False)