In [27]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import tldextract
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import csr_matrix, hstack

# Load dataset
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
data = pd.read_csv(url)

# Display the first few rows of the dataframe
print(data.head())


                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True


In [28]:
def preprocess_url(url):
    # Parse the URL and extract components
    parsed_url = urlparse(url)
    path_tokens = re.split(r'\W+', parsed_url.path)
    return ' '.join([token for token in path_tokens if token])  # Join tokens back into a string

# Apply preprocessing to each URL
data['preprocessed'] = data['url'].apply(preprocess_url)

# Vectorize the preprocessed URLs
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_features = vectorizer.fit_transform(data['preprocessed'])


In [29]:
# Function to extract additional features from URLs
def extract_features(url):
    features = {}
    parsed_url = urlparse(url)
    domain_info = tldextract.extract(url)
    
    features['url_length'] = len(url)
    features['path_length'] = len(parsed_url.path)
    features['use_https'] = 1 if parsed_url.scheme == "https" else 0
    features['num_subdomains'] = len(domain_info.subdomain.split('.')) if domain_info.subdomain else 0
    keywords = ['login', 'verify', 'account', 'banking', 'secure', 'update']
    features['keyword_usage'] = np.any([keyword in url for keyword in keywords])
    
    return pd.Series(features)

In [30]:
# Apply feature extraction to URLs
feature_columns = data['url'].apply(extract_features)

# Convert extracted features to sparse matrix
numeric_features = csr_matrix(feature_columns.astype(float).values)

# Combine TF-IDF features with numeric features
X_combined = hstack([tfidf_features, numeric_features])

# Labels
y = data['is_spam']

In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)


In [32]:
# Initialize and train the SVM
model = SVC()
model.fit(X_train, y_train)

# Make predictions and evaluate the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))


              precision    recall  f1-score   support

       False       0.88      0.91      0.90       455
        True       0.69      0.60      0.64       145

    accuracy                           0.84       600
   macro avg       0.78      0.76      0.77       600
weighted avg       0.83      0.84      0.83       600

Confusion Matrix:
 [[416  39]
 [ 58  87]]


In [33]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Initialize the grid search
grid_search = GridSearchCV(SVC(), param_grid, verbose=1, cv=3, n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters:", grid_search.best_params_)

# Evaluate the best model found by the grid search
best_predictions = grid_search.predict(X_test)
print(classification_report(y_test, best_predictions))


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
              precision    recall  f1-score   support

       False       0.93      0.96      0.94       455
        True       0.85      0.76      0.80       145

    accuracy                           0.91       600
   macro avg       0.89      0.86      0.87       600
weighted avg       0.91      0.91      0.91       600



In [42]:
# Initialize and train the SVM
model = SVC(C=100, gamma='scale', kernel='linear')
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

       False       0.93      0.96      0.94       455
        True       0.85      0.76      0.80       145

    accuracy                           0.91       600
   macro avg       0.89      0.86      0.87       600
weighted avg       0.91      0.91      0.91       600

Confusion Matrix:
 [[436  19]
 [ 35 110]]


In [43]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation and print the mean accuracy
scores = cross_val_score(SVC(C=10, gamma='scale', kernel='rbf'), X, y, cv=5)
print("Mean cross-validation accuracy:", scores.mean())


Mean cross-validation accuracy: 0.899954368391764


In [36]:
import pickle

model_filename = 'spam_detection_model.pkl'

with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

print(f'Model saved to {model_filename}')


Model saved to spam_detection_model.pkl
