# Natural language processing: spam detection

In [None]:
# Python standard library imports
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from nltk.corpus import stopwords
from scipy.stats import randint, uniform, loguniform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

import helper_functions as funcs

# Fetch the NLTK stop-word corpus; its English list is one of the stop-word
# options tried in the TF-IDF hyperparameter search
nltk.download('stopwords')

# Seed shared by the train/test split, the CV splitter, the SVC models and the randomized search
RANDOM_SEED = 315
# Passed as class_weight to every SVC in this notebook
CLASS_WEIGHT = 'balanced'
# Number of cross-validation folds; also the number of scores recorded per model
N_SPLITS = 3

# Shared stratified, shuffled K-fold splitter used by every cross-validation call
CV = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_SEED
)

# Hyperparameter search settings
# n_jobs value handed to the grid and randomized searches
N_JOBS = 1
# Number of parameter settings sampled by the randomized search
N_ITER = 10000

## 1. Data loading

### 1.1. Load data from URL

In [None]:
# Download the URL spam dataset, discard repeated rows and renumber the
# index from zero so it is contiguous again
data_df = (
    pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)

### 1.2. Save a local copy

In [None]:
# Where the local copy of the downloaded data is written
raw_copy_path = Path('../data/raw/urls.parquet')

# Create the containing folder (and any missing parents) if it is not there yet
raw_copy_path.parent.mkdir(parents=True, exist_ok=True)

# Write the dataframe to disk in parquet format
data_df.to_parquet(raw_copy_path)

### 1.3. Inspect the data

In [None]:
# Preview the first rows of the loaded dataframe
data_df.head()

In [None]:
# Print a summary of the dataframe's columns, dtypes and non-null counts
data_df.info()

## 2. EDA

### 2.1. Label frequency

In [None]:
# Frequency table of the raw labels, kept for inspection
label_counts = data_df['is_spam'].value_counts()

# Count each class by label value, not by position: value_counts() orders its
# result by frequency, so positional lookups would silently swap the classes
# if spam were the majority label. Labels are compared as text so the check
# works for the boolean labels as well as for the 0/1 encoding applied later
# in the notebook.
spam_mask = data_df['is_spam'].astype(str).isin(['True', '1'])
spam = int(spam_mask.sum())
not_spam = int((~spam_mask).sum())

print(f'URLs are {(not_spam/(spam + not_spam)*100):.1f}% not spam')

### 2.2. URL length distribution

In [None]:
# Number of characters in every URL, stored as a new column
data_df['URL_length'] = data_df['url'].str.len()
url_lengths = data_df['URL_length']

# Histogram of the URL lengths
plt.hist(url_lengths, bins=30, color='black')
plt.title('URL length distribution')
plt.xlabel('Characters')
plt.ylabel('URLs')
plt.show()

# Summary statistics, one line each
for stat_name, stat_func in (('mean', np.mean), ('min', min), ('max', max)):
    print(f"URL length {stat_name}: {stat_func(url_lengths):.0f}")

### 2.3. Short URLs

In [None]:
# Rows whose URL is shorter than 20 characters
is_short = data_df['URL_length'] < 20
short_urls = data_df.loc[is_short]
short_urls

### 2.4. Long URLs

In [None]:
# Rows whose URL is longer than 200 characters
is_long = data_df['URL_length'] > 200
long_urls = data_df.loc[is_long]
long_urls

## 3. Data preprocessing

### 3.1. Label encoding

In [None]:
# Encode the label as an integer: go through text so 'True'/'False' can be
# swapped for '1'/'0' before the final cast to int
data_df['is_spam'] = (
    data_df['is_spam']
    .astype(str)
    .replace({'True': '1', 'False': '0'})
    .astype(int)
)

### 3.2. Train test split

In [None]:
# Hold out 30% of the URLs for testing. Stratify on the label so the
# spam / not-spam ratio is the same in the training and test splits,
# matching the stratified cross-validation used on the training data.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=data_df['is_spam']
)

### 3.3. URL vectorization

In [None]:
# Vectorize the URLs using TF-IDF
# The vectorizer is fitted on the training URLs only; the test URLs are
# transformed with that same fitted vectorizer
vectorizer = TfidfVectorizer()
train_urls = vectorizer.fit_transform(train_df['url'])
test_urls = vectorizer.transform(test_df['url'])

# Get the words from the vector model
# (their count is later used as the upper bound for max_features in the randomized search)
feature_names = vectorizer.get_feature_names_out()

# Display the encoded training matrix as the cell output
train_urls

In [None]:
# Display the feature names extracted by the TF-IDF vectorizer
feature_names

### 3.4. Mean TF-IDF value distribution

In [None]:
# Get the mean TF-IDF value for each feature. Rows of the encoded matrix are
# URLs and columns are features, so the per-feature mean averages down the
# rows (axis=0). Taking the mean on the sparse matrix directly avoids building
# a dense copy; asarray/ravel flattens the result to a 1-D array.
feature_means = np.asarray(train_urls.mean(axis=0)).ravel()

plt.title('Mean TF-IDF distribution')
plt.xlabel('Mean TF-IDF')
plt.ylabel('Features')
plt.hist(feature_means, bins=30, color='black')
# Log scale on the count axis so sparsely populated bins remain visible
plt.yscale('log')
plt.show()

## 3. SVM model

### 3.1. Baseline model performance

In [None]:
# Baseline: a support vector classifier with default settings apart from
# the class weighting and the fixed seed
naive_svc = SVC(random_state=RANDOM_SEED, class_weight=CLASS_WEIGHT)

# Score the baseline on the TF-IDF encoded training data with the shared CV splitter
scores = cross_val_score(naive_svc, train_urls, train_df['is_spam'], cv=CV, n_jobs=-1)

# Start the running record of per-fold scores; later models append to it
cross_val_scores = {'Model': ['Naive SVC'] * N_SPLITS, 'Score': list(scores)}

mean_pct = np.mean(scores) * 100
std_pct = np.std(scores) * 100
print(f'Naive SVC cross validation accuracy: {mean_pct:.1f}+/-{std_pct:.1f}%')

### 3.2. SVC hyperparameter optimization

In [None]:
# Candidate values for each SVC hyperparameter; the grid search tries every combination
hyperparameters = dict(
    C=[0.01, 0.1, 1, 10],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[2, 3, 4],
    gamma=['scale', 'auto'],
    shrinking=[True, False],
    decision_function_shape=['ovo', 'ovr'],
)

# Fresh, unfitted classifier for the search to clone
model = SVC(random_state=RANDOM_SEED, class_weight=CLASS_WEIGHT)

# Exhaustive search scored by accuracy, using the shared CV splitter
grid = GridSearchCV(
    model,
    hyperparameters,
    scoring='accuracy',
    cv=CV,
    n_jobs=N_JOBS,
)

# Fit on the TF-IDF encoded training data
optimization_results = grid.fit(train_urls, train_df['is_spam'])

# Keep the best model and the settings that produced it
optimized_svc = grid.best_estimator_
best_hyperparameters = grid.best_params_

print('Best hyperparameters:\n')

for name, value in best_hyperparameters.items():
    print(f' {name}: {value}')

In [None]:
# Plot the results of the search above using the project helper from helper_functions
# NOTE(review): the helper's implementation is not visible here — confirm what it reads from the fitted search object
funcs.plot_cross_validation(optimization_results)

In [None]:
# Score the tuned SVC on the TF-IDF encoded training data with the shared CV splitter
scores = cross_val_score(optimized_svc, train_urls, train_df['is_spam'], cv=CV, n_jobs=-1)

# Append this model's per-fold scores to the running record
cross_val_scores['Model'].extend(['Optimized SVC'] * N_SPLITS)
cross_val_scores['Score'].extend(scores)

mean_pct = np.mean(scores) * 100
std_pct = np.std(scores) * 100
print(f'Optimized SVC cross validation accuracy: {mean_pct:.1f}+/-{std_pct:.1f}%')

### 3.3. TfidfVectorizer + SVC hyperparameter optimization

In [None]:
# Tune the vectorizer and the classifier together. With both steps in one
# pipeline, the search is fitted on the raw URL strings rather than on a
# pre-computed TF-IDF matrix.
tfidf = TfidfVectorizer()
svc = SVC(class_weight=CLASS_WEIGHT, random_state=RANDOM_SEED)

# Create pipeline with TF-IDF vectorization followed by the classifier
pipe = Pipeline(steps=[('TFIDF', tfidf), ('SVC', svc)])

# Search space: lists are sampled uniformly, scipy distributions are drawn from.
# Keys use the '<step name>__<parameter>' convention to address pipeline steps.
hyperparameters = {
    'TFIDF__strip_accents': ['ascii', 'unicode', None],
    'TFIDF__stop_words': ['english', stopwords.words('english'), None],
    'TFIDF__ngram_range': [(1,1), (1,2), (2,2)],
    # uniform(loc, scale) samples from [loc, loc + scale]: max_df is drawn from
    # [0.0009, 1.0] and min_df from [0.0, 0.0004], so min_df never exceeds max_df
    'TFIDF__max_df': uniform(loc=0.0009, scale=0.9991),
    'TFIDF__min_df': uniform(loc=0.0, scale=0.0004),
    # Bounded by the vocabulary size found by the default vectorizer earlier in the notebook
    'TFIDF__max_features': randint(1, len(feature_names)),
    'TFIDF__binary': [True, False],
    'TFIDF__norm': ['l1', 'l2', None],
    'TFIDF__use_idf': [True, False],
    'TFIDF__smooth_idf': [True, False],
    'TFIDF__sublinear_tf': [True, False],
    'SVC__C': loguniform(10**-2, 100.0),
    'SVC__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'SVC__degree': [1, 2, 3],
    'SVC__gamma': ['scale', 'auto'],
    'SVC__shrinking': [True, False],
    'SVC__decision_function_shape': ['ovo', 'ovr']
}

grid = RandomizedSearchCV(
    pipe,
    hyperparameters,
    scoring='accuracy',          # Explicit metric, same as the SVC-only grid search
    return_train_score=True,     # Return training scores for analysis
    cv=CV,                       # Use the shared stratified K-fold splitter
    n_jobs=N_JOBS,               # Parallel jobs, as set in the search settings
    n_iter=N_ITER,               # Number of parameter combinations to try
    random_state=RANDOM_SEED     # Ensure reproducible results
)

# Run the search
optimization_results = grid.fit(train_df['url'], train_df['is_spam'])

# Recover winning model & hyperparameters
optimized_tfidf_svc = grid.best_estimator_
best_hyperparameters = grid.best_params_

print('Best hyperparameters:\n')

for key, val in best_hyperparameters.items():
    print(f' {key}: {val}')

In [None]:
# Plot the results of the randomized search above using the project helper from helper_functions
# NOTE(review): the helper's implementation is not visible here — confirm what it reads from the fitted search object
funcs.plot_cross_validation(optimization_results)

In [None]:
# Score the tuned TF-IDF + SVC pipeline on the raw training URLs with the shared CV splitter
scores = cross_val_score(optimized_tfidf_svc, train_df['url'], train_df['is_spam'], cv=CV, n_jobs=-1)

# Append this model's per-fold scores to the running record
cross_val_scores['Model'].extend(['Optimized TFIDF + SVC'] * N_SPLITS)
cross_val_scores['Score'].extend(scores)

mean_pct = np.mean(scores) * 100
std_pct = np.std(scores) * 100
print(f'Optimized TFIDF + SVC cross validation accuracy: {mean_pct:.1f}+/-{std_pct:.1f}%')

In [None]:
# Collect the per-fold scores of every model into one table for plotting
scores_df = pd.DataFrame.from_dict(cross_val_scores)

# The recorded scores are fractions in [0, 1]; convert to percent so the
# values agree with the 'Accuracy (%)' axis label
scores_df['Score'] = scores_df['Score'] * 100

# One box per model, showing the spread of its cross-validation folds
sns.boxplot(data=scores_df, x='Model', y='Score')
plt.title('Model cross-validation performance comparison')
plt.ylabel('Accuracy (%)')
plt.show()

## 4. Model evaluation

In [None]:
# Fit the tuned pipeline on the full training split and predict the held-out URLs
true_labels = test_df['is_spam']
optimized_tfidf_svc.fit(train_df['url'], train_df['is_spam'])
predictions = optimized_tfidf_svc.predict(test_df['url'])

# Overall test accuracy, as a percentage
accuracy = 100 * accuracy_score(true_labels, predictions)

# Confusion matrix with each row (true class) normalized to sum to one
cm = confusion_matrix(true_labels, predictions, normalize='true')
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_disp.plot()

# Label the figure after plotting so the display's default axis labels are replaced
plt.title(f'Test set performance\noverall accuracy: {accuracy:.1f}%')
plt.xlabel('Predicted outcome')
plt.ylabel('True outcome')
plt.show()