# NLP PROJECT - Spam Link Detection System

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download

download("stopwords")
download("wordnet")
download('omw-1.4')

from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import os
import joblib

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## STEP 1:  LOAD DATASET AND INSPECT DATA

In [4]:
url = "https://breathecode.herokuapp.com/asset/internal-link?id=435&path=url_spam.csv"
df = pd.read_csv(url)

pd.set_option("display.max_colwidth", None)
df.head(10)

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubscribe,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
5,https://www.brookings.edu/interactives/reopening-america-and-the-world/,False
6,https://www.reuters.com/investigates/special-report/health-coronavirus-britain-pub/,False
7,https://www.theatlantic.com/magazine/archive/2020/07/supermarkets-are-a-miracle/612244/,False
8,https://www.vox.com/2020/6/17/21294680/john-bolton-book-excerpts-trump-ukraine-china,False
9,https://www.theguardian.com/travel/2020/jun/18/end-of-tourism-coronavirus-pandemic-travel-industry,False


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


In [6]:
df.shape

(2999, 2)

In [7]:
# check if we have any empty cells of data
df.isna().sum()

url        0
is_spam    0
dtype: int64

In [8]:
# check label distribution
df['is_spam'].value_counts()

is_spam
False    2303
True      696
Name: count, dtype: int64

In [9]:
# check label distribution - percentage
df['is_spam'].value_counts(normalize=True) * 100

is_spam
False    76.792264
True     23.207736
Name: proportion, dtype: float64

In [44]:
# trying to see if we have dupes for urls
df['url'].value_counts().head()

url
https://www.bloomberg.com/tosv2.html                   26
https://briefingday.us8.list-manage.com/unsubscribe    13
https://www.hvper.com/                                 13
https://briefingday.com/fan                            13
https://briefingday.com/m/v4n3i4f3                     13
Name: count, dtype: int64

In [11]:
df.columns

Index(['url', 'is_spam'], dtype='object')

In [12]:
df.dtypes

url        object
is_spam      bool
dtype: object

## STEP 2: PREPROCESS THE LINKS

In [13]:
# base english stopwords (generic words that are over used - the, is, are, and, if etc)
stop_words = set((stopwords.words('english')))

# adding url type words to remove
custom_stopwords = stop_words | {'http', 'https', 'www', 'com', 'net', 'html'}

# lemmatizer - reduce words to their real meaning form (ex running - run, better - good, cars - car)
lemmatizer = WordNetLemmatizer()

In [14]:
def clean_url(url):
    url=url.lower() # lowercase
    url=re.sub(r'[^a-z]', ' ', url) # replace non-letters with space
    tokens=url.split()
    # remove stopwords and shot tokens
    tokens=[
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in custom_stopwords and len(word) > 2
    ]
    return ' '.join(tokens)

In [15]:
# apply our preprocessed data to our dataset
df['clean_url']=df['url'].apply(clean_url)

In [16]:
# check out new column vs original column
df[['url', 'clean_url']].head()

Unnamed: 0,url,clean_url
0,https://briefingday.us8.list-manage.com/unsubscribe,briefingday list manage unsubscribe
1,https://www.hvper.com/,hvper
2,https://briefingday.com/m/v4n3i4f3,briefingday
3,https://briefingday.com/n/20200618/m#commentform,briefingday commentform
4,https://briefingday.com/fan,briefingday fan


In [17]:
df.columns


Index(['url', 'is_spam', 'clean_url'], dtype='object')

In [18]:
# check if we have any empty cells of data - make sure our new columns dont have any empty cells
df.isna().sum()

url          0
is_spam      0
clean_url    0
dtype: int64

In [19]:
# assign our X and y
X=df['clean_url']
y=df['is_spam']

In [20]:
# setup our train and test splits
X_train, X_test, y_train, y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y # prevent class imbalance
)

In [21]:
# see size of train/test splits
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

print("\nTrain class distribution:")
print(y_train.value_counts(normalize=True))

print("\nTest class distribution:")
print(y_test.value_counts(normalize=True))

Train size: (2399,)
Test size: (600,)

Train class distribution:
is_spam
False    0.76782
True     0.23218
Name: proportion, dtype: float64

Test class distribution:
is_spam
False    0.768333
True     0.231667
Name: proportion, dtype: float64


In [22]:
# what our original df looks like as we look above at the splits
df.shape

(2999, 3)

## STEP 3:  BUILD AN SVM - BASELINE MODEL

In [None]:
# build SVM pipeline with default parameters
# build our pipeline (object which only excepts tuples) using a list of tuples (step_name, step_object) through which the data flows sequentially
# TF-IDF + Linear Support Vector Machine (SVM) using default parameters
svm_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', LinearSVC())
])

In [24]:
# fit and train the SVM
svm_pipeline.fit(X_train, y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('tfidf', ...), ('svm', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"penalty  penalty: {'l1', 'l2'}, default='l2' Specifies the norm used in the penalization. The 'l2' penalty is the standard used in SVC. The 'l1' leads to ``coef_`` vectors that are sparse.",'l2'
,"loss  loss: {'hinge', 'squared_hinge'}, default='squared_hinge' Specifies the loss function. 'hinge' is the standard SVM loss (used e.g. by the SVC class) while 'squared_hinge' is the square of the hinge loss. The combination of ``penalty='l1'`` and ``loss='hinge'`` is not supported.",'squared_hinge'
,"dual  dual: ""auto"" or bool, default=""auto"" Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features. `dual=""auto""` will choose the value of the parameter automatically, based on the values of `n_samples`, `n_features`, `loss`, `multi_class` and `penalty`. If `n_samples` < `n_features` and optimizer supports chosen `loss`, `multi_class` and `penalty`, then dual will be set to True, otherwise it will be set to False. .. versionchanged:: 1.3  The `""auto""` option is added in version 1.3 and will be the default  in version 1.5.",'auto'
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"multi_class  multi_class: {'ovr', 'crammer_singer'}, default='ovr' Determines the multi-class strategy if `y` contains more than two classes. ``""ovr""`` trains n_classes one-vs-rest classifiers, while ``""crammer_singer""`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``""crammer_singer""`` is chosen, the options loss, penalty and dual will be ignored.",'ovr'
,"fit_intercept  fit_intercept: bool, default=True Whether or not to fit an intercept. If set to True, the feature vector is extended to include an intercept term: `[x_1, ..., x_n, 1]`, where 1 corresponds to the intercept. If set to False, no intercept will be used in calculations (i.e. data is expected to be already centered).",True
,"intercept_scaling  intercept_scaling: float, default=1.0 When `fit_intercept` is True, the instance vector x becomes ``[x_1, ..., x_n, intercept_scaling]``, i.e. a ""synthetic"" feature with a constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight. Note that liblinear internally penalizes the intercept, treating it like any other term in the feature vector. To reduce the impact of the regularization on the intercept, the `intercept_scaling` parameter can be set to a value greater than 1; the higher the value of `intercept_scaling`, the lower the impact of regularization on it. Then, the weights become `[w_x_1, ..., w_x_n, w_intercept*intercept_scaling]`, where `w_x_1, ..., w_x_n` represent the feature weights and the intercept weight is scaled by `intercept_scaling`. This scaling allows the intercept term to have a different regularization behavior compared to the other features.",1
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",
,"verbose  verbose: int, default=0 Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in liblinear that, if enabled, may not work properly in a multithreaded context.",0


In [25]:
# evaluate the SVM 
# fucntion we have been using for best display of our train vs test splits
def model_evaluator(X, y, model, dataset_name="Dataset"):
    y_pred = model.predict(X)
    print(f"\nEvaluation on {dataset_name}")
    print("-" * 50)
    print(classification_report(y, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))

In [26]:
# training set outputs
model_evaluator(X_train, y_train, svm_pipeline, dataset_name="Training Set (SVM)")




Evaluation on Training Set (SVM)
--------------------------------------------------
              precision    recall  f1-score   support

       False       1.00      0.99      0.99      1842
        True       0.98      0.99      0.98       557

    accuracy                           0.99      2399
   macro avg       0.99      0.99      0.99      2399
weighted avg       0.99      0.99      0.99      2399

Confusion Matrix:
[[1829   13]
 [   6  551]]


In [27]:
# testing set outputs
model_evaluator(X_test, y_test, svm_pipeline, dataset_name="Test Set (SVM)")


Evaluation on Test Set (SVM)
--------------------------------------------------
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       461
        True       0.89      0.91      0.90       139

    accuracy                           0.95       600
   macro avg       0.93      0.94      0.94       600
weighted avg       0.95      0.95      0.95       600

Confusion Matrix:
[[445  16]
 [ 12 127]]


## STEP 4: OPTIMIZE THE PREVIOUS MODEL - MAKE A BETTER ONE - IF POSSIBLE

Trying Naive Bayes on our optimized model as LinearSVC defaults are already very stong

I optimized Naive Bayes because it benefits directly from TF-IDF parameter tuning and smoothing adjustments, which can significantly improve generalization in sparse text classification problems like URL spam detection.

In [28]:
# build our pipeline (object which only excepts tuples) using a list of tuples (step_name, step_object) through which the data flows sequentially
pipeline=Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:
# build our hyperparameter dic and assign values to try
hyperparameter_dict = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],        # word patterns - single words, and word pairs (ex. verify account, reset password)
    'tfidf__max_features': [3000, 5000],           # controls max number of words/ngrams kept.  Started with 3000 based on how many URLs we have
    'tfidf__min_df': [1, 2],                       # what do we keep? 1=keep everything, 2=remove single-occurance noise
    'nb__alpha': [0.01, 0.05, 0.1, 0.5, 1.0]       # controls probablity smoothing in Naive Bayes from very light to strong
}

The double underscore __ is how sklearn:

    Reaches inside the pipeline
    Targets a specific parameter of a specific step

It tells sklearn:

    “Here are the parameters I want you to try, and here are the possible values for each one.”

In [30]:
# setup our gridsearch_mode
gridsearch_model=GridSearchCV( # assign GridSearchCV
    estimator=pipeline,                   # apply Pipeline which contains TF-IDF vectorization and MultinomialNB
    param_grid=hyperparameter_dict,       # tells it which parameters to try
    scoring='f1',                         # used f1 for better output
    cv=5,                                 # how many folds
    n_jobs=-1                             # use all available CPU cores
)

I used GridSearchCV on a scikit-learn Pipeline to tune both TF-IDF vectorization parameters and the Naive Bayes classifier simultaneously, ensuring optimal performance while preventing data leakage.

How many models GridSearch trains:

    2x2x2x5=40 models (hyperparameter_dict)
    With 5-folds CV: 5x40= 200 total fits

I used F1-score to obtain balance between precision and recall

Due to minimal amount of spam we don't want to be accurate at predicting what is not spam which will look good, but not what we are looking for

In [31]:
# fit our train tuned model
gridsearch_model.fit(X_train, y_train)

0,1,2
,"estimator  estimator: estimator object This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed.",Pipeline(step...inomialNB())])
,"param_grid  param_grid: dict or list of dictionaries Dictionary with parameters names (`str`) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of parameter settings.","{'nb__alpha': [0.01, 0.05, ...], 'tfidf__max_features': [3000, 5000], 'tfidf__min_df': [1, 2], 'tfidf__ngram_range': [(1, ...), (1, ...)]}"
,"scoring  scoring: str, callable, list, tuple or dict, default=None Strategy to evaluate the performance of the cross-validated model on the test set. If `scoring` represents a single score, one can use: - a single string (see :ref:`scoring_string_names`); - a callable (see :ref:`scoring_callable`) that returns a single value; - `None`, the `estimator`'s  :ref:`default evaluation criterion ` is used. If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric  names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. See :ref:`multimetric_grid_search` for an example.",'f1'
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionchanged:: v0.20  `n_jobs` default changed from 1 to None",-1
,"refit  refit: bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given ``cv_results_``. In that case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this ``GridSearchCV`` instance. Also for multiple metric evaluation, the attributes ``best_index_``, ``best_score_`` and ``best_params_`` will only be available if ``refit`` is set and all of them will be determined w.r.t this specific scorer. See ``scoring`` parameter to know more about multiple metric evaluation. See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` to see how to design a custom selection strategy using a callable via `refit`. See :ref:`this example ` for an example of how to use ``refit=callable`` to balance model complexity and cross-validated score. .. versionchanged:: 0.20  Support for callable added.",True
,"cv  cv: int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",5
,"verbose  verbose: int Controls the verbosity: the higher, the more messages. - >1 : the computation time for each fold and parameter candidate is  displayed; - >2 : the score is also displayed; - >3 : the fold and candidate parameter indexes are also displayed  together with the starting time of the computation.",0
,"pre_dispatch  pre_dispatch: int, or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use  this for lightweight and fast-running jobs, to avoid delays due to on-demand  spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A str, giving an expression as a function of n_jobs, as in '2*n_jobs'",'2*n_jobs'
,"error_score  error_score: 'raise' or numeric, default=np.nan Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error.",
,"return_train_score  return_train_score: bool, default=False If ``False``, the ``cv_results_`` attribute will not include training scores. Computing training scores is used to get insights on how different parameter settings impact the overfitting/underfitting trade-off. However computing the scores on the training set can be computationally expensive and is not strictly required to select the parameters that yield the best generalization performance. .. versionadded:: 0.19 .. versionchanged:: 0.21  Default value was changed from ``True`` to ``False``",False

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",0.01
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [32]:
# extract the best pipeline
best_pipeline=gridsearch_model.best_estimator_

In [33]:
# see what our best hyperparameters are after training
print('Best hyperparameters:')
print(gridsearch_model.best_estimator_)

Best hyperparameters:
Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=5000)),
                ('nb', MultinomialNB(alpha=0.01))])


In [34]:
# fucntion we have been using for best display of our train vs test splits
def model_evaluator(X, y, model, dataset_name="Dataset"):
    """
    Evaluates a classification model and prints standard metrics.
    """
    y_pred = model.predict(X)

    print(f"\nEvaluation on {dataset_name}")
    print("-" * 50)
    print(classification_report(y, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y, y_pred))

In [35]:
# training set outputs
model_evaluator(X_train, y_train, best_pipeline, dataset_name="Training Set")



Evaluation on Training Set
--------------------------------------------------
              precision    recall  f1-score   support

       False       0.99      0.99      0.99      1842
        True       0.97      0.98      0.97       557

    accuracy                           0.99      2399
   macro avg       0.98      0.99      0.98      2399
weighted avg       0.99      0.99      0.99      2399

Confusion Matrix:
[[1824   18]
 [  11  546]]


In [36]:
# testing set outputs

model_evaluator(X_test, y_test, best_pipeline, dataset_name="Test Set")


Evaluation on Test Set
--------------------------------------------------
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       461
        True       0.89      0.91      0.90       139

    accuracy                           0.95       600
   macro avg       0.93      0.94      0.93       600
weighted avg       0.95      0.95      0.95       600

Confusion Matrix:
[[446  15]
 [ 13 126]]


In [37]:
# testing set outputs FROM SVM BASELINE
model_evaluator(X_test, y_test, svm_pipeline, dataset_name="Test Set (SVM)")


Evaluation on Test Set (SVM)
--------------------------------------------------
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       461
        True       0.89      0.91      0.90       139

    accuracy                           0.95       600
   macro avg       0.93      0.94      0.94       600
weighted avg       0.95      0.95      0.95       600

Confusion Matrix:
[[445  16]
 [ 12 127]]


In [38]:
# TF-IDF knows what the features are
best_pipeline.named_steps['tfidf'].get_feature_names_out()

# Naive Bayes knows how those features behave
best_pipeline.named_steps['nb'].feature_log_prob_

array([[ -8.58537332, -13.04526172,  -8.82508595, ...,  -8.5189444 ,
        -13.04526172,  -8.65622203],
       [-11.51110848,  -7.62427124, -11.51110848, ..., -11.51110848,
         -5.77055844, -11.51110848]], shape=(2, 4599))

## BUILD A READABLE REPORT BASED ON ABOVE OUTPUT

In [39]:
# extract trained components from the pipeline
tfidf = best_pipeline.named_steps['tfidf']
nb = best_pipeline.named_steps['nb']

# get feature names and log probabilities
feature_names = tfidf.get_feature_names_out()
log_probs = nb.feature_log_prob_

In [40]:
feature_report = pd.DataFrame({
    "Feature": feature_names,
    "Log Prob (Not Spam)": log_probs[0],
    "Log Prob (Spam)": log_probs[1]
})

# add difference column for interpretability
feature_report["Log Prob Difference (Spam - Not Spam)"] = (
    feature_report["Log Prob (Spam)"] - feature_report["Log Prob (Not Spam)"]
)

In [41]:
print("\nMODEL VOCABULARY SUMMARY")
print("=" * 60)
print(f"Total features learned: {len(feature_report)}")


MODEL VOCABULARY SUMMARY
Total features learned: 4599


In [42]:
print("\nTOP WORDS INDICATING SPAM")
print("=" * 60)

display(
    feature_report
    .sort_values("Log Prob Difference (Spam - Not Spam)", ascending=False)
    .head(15)
)


TOP WORDS INDICATING SPAM


Unnamed: 0,Feature,Log Prob (Not Spam),Log Prob (Spam),Log Prob Difference (Spam - Not Spam)
2447,manage,-13.045262,-3.655621,9.389641
482,bloomberg,-13.045262,-4.256073,8.789189
4127,tosv,-13.045262,-4.256073,8.789189
1939,hvper,-13.045262,-4.34022,8.705042
4272,unsubscribe,-13.045262,-4.35298,8.692282
3904,subscribe,-13.045262,-4.771798,8.273463
1682,gettheelevator,-13.045262,-4.903238,8.142023
4294,user,-13.045262,-4.996704,8.048557
1373,essential,-13.045262,-5.083474,7.961788
3306,refer,-13.045262,-5.157493,7.887768


In [43]:
print("\nTOP WORDS INDICATING NOT SPAM")
print("=" * 60)

display(
    feature_report
    .sort_values("Log Prob Difference (Spam - Not Spam)", ascending=True)
    .head(15)
)


TOP WORDS INDICATING NOT SPAM


Unnamed: 0,Feature,Log Prob (Not Spam),Log Prob (Spam),Log Prob Difference (Spam - Not Spam)
2722,news,-4.59093,-11.511108,-6.920179
254,article,-5.023303,-11.511108,-6.487805
4506,world,-5.53359,-11.511108,-5.977519
4193,trump,-5.559325,-11.511108,-5.951783
377,bbc,-5.582012,-11.511108,-5.929097
2003,index,-5.673077,-11.511108,-5.838032
4297,utm,-5.702751,-11.511108,-5.808358
593,business,-5.708094,-11.511108,-5.803015
808,cnn,-5.732592,-11.511108,-5.778517
3067,politics,-5.753921,-11.511108,-5.757187


This report shows the words most strongly associated with spam and non-spam URLs based on learned Naive Bayes log probabilities, providing insight into how the model makes decisions.

## Although both models achieved comparable test performance, the optimized Naive Bayes model was selected as the final model due to its 
## robustness, scalability, and interpretability, which makes it better suited for production deployment and long-term maintenance.

## STEP 5:  SAVE MODEL

In [None]:
# create models directory if it doesn't exist
os.makedirs("models", exist_ok=True)

In [49]:
# save the best pipeline model to disk
joblib.dump(best_pipeline, "models/Spam link detection system_best model.pkl")

['models/Spam link detection system_best model.pkl']

In [None]:
# confirm save
print("Model saved successfully!")
print(os.listdir("models"))

Model saved successfully!
['Spam link detection system_best model.pkl']
