# Explore here

In [92]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from imblearn.over_sampling import SMOTE      
from imblearn.under_sampling import RandomUnderSampler  
from imblearn.pipeline import Pipeline
from collections import Counter
import os



In [2]:
url = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'
df = pd.read_csv(url)

In [27]:
display(df.head())

Unnamed: 0,url,is_spam,processed_url
0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage unsubscribe
1,https://www.hvper.com/,True,hvper
2,https://briefingday.com/m/v4n3i4f3,True,briefingday m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,False,briefingday n 20200618 m#commentform
4,https://briefingday.com/fan,True,briefingday fan


In [4]:
df.describe()

Unnamed: 0,url,is_spam
count,2999,2999
unique,2369,2
top,https://www.bloomberg.com/tosv2.html,False
freq,26,2303


In [5]:
df.duplicated().sum()

630

In [6]:
df.drop_duplicates(inplace=True, ignore_index=True)

In [7]:
df.isna().sum()

url        0
is_spam    0
dtype: int64

In [29]:
df.is_spam.value_counts()

is_spam
False    2125
True      244
Name: count, dtype: int64

preprocess urls

In [10]:
def preprocess_url(url):
    # Remove protocols like http, https
    url = re.sub(r'https?:\/\/', '', url)
    # Tokenize based on punctuation
    tokens = re.split(r'[\/\.\?\&\=\-\_]', url)
    # Remove empty tokens and convert to lowercase
    tokens = [token.lower() for token in tokens if token]
    # Optionally remove common words
    stopwords = ['www', 'com', 'html', 'php']
    tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(tokens)

# Apply preprocessing to create a new column
df['processed_url'] = df['url'].apply(preprocess_url)

In [19]:
display(df.head(5))

Unnamed: 0,url,is_spam,processed_url
0,https://briefingday.us8.list-manage.com/unsubs...,True,briefingday us8 list manage unsubscribe
1,https://www.hvper.com/,True,hvper
2,https://briefingday.com/m/v4n3i4f3,True,briefingday m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,False,briefingday n 20200618 m#commentform
4,https://briefingday.com/fan,True,briefingday fan


In [18]:
url_random = df.sample(1).url.values[0]
url_random_preprcessed = preprocess_url (url_random)
print(url_random)
print('-'*100)
print(url_random_preprcessed)

https://www.theverge.com/2020/6/30/21308449/youtube-tv-price-increase-64-99-viacom-hbo-new-channels
----------------------------------------------------------------------------------------------------
theverge 2020 6 30 21308449 youtube tv price increase 64 99 viacom hbo new channels


split df

In [32]:
X = df['processed_url']
y = df['is_spam']  

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

vectorize

In [22]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_vectorized = vectorizer.transform(X_test)

resampling to handle class imbalance (after grid search improved only 9% the recall on true)

In [84]:
# Check the original class distribution
print('Original dataset shape:', Counter(y_train))

#Undersample the majority class
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_under, y_under = undersampler.fit_resample(X_train_vectorized, y_train)
print('After undersampling:', Counter(y_under))

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_under, y_under)
print('After SMOTE oversampling:', Counter(y_resampled))


Original dataset shape: Counter({False: 1700, True: 195})
After undersampling: Counter({False: 390, True: 195})
After SMOTE oversampling: Counter({False: 390, True: 390})


train SVM

In [85]:
# Initialize the SVM classifier
svm_model = SVC(kernel='linear', probability=True)

# Train the model on the resampled data
svm_model.fit(X_resampled, y_resampled)

In [86]:
# Make predictions on the test set
y_pred = svm_model.predict(X_test_vectorized)

# Print evaluation metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.96      0.96       425
        True       0.66      0.71      0.69        49

    accuracy                           0.93       474
   macro avg       0.81      0.84      0.82       474
weighted avg       0.94      0.93      0.93       474

Confusion Matrix:
[[407  18]
 [ 14  35]]
Accuracy Score: 0.9324894514767933


The model handles well only not spam results.
poor performance of the model on the actual spam results is due to high class imbalance, the majority of the urls are not spam (False    2125)

improvement from 0.57 to 0.65 recall for TRUE after oversampling the minority class

improvement from 0.57 to 0.82 recall for TRUE after undersampling the majority class

improvement from 0.57 to 0.71 recall for TRUE after undersampling and oversampling at the same time. 

after modifying sampling strategy undersampler = RandomUnderSampler(sampling_strategy=0.8, random_state=42)   it seems that feeding more urls to the model make it less reliable in understanding when the URL is an actual SPAM.

Apllying grid search

In [87]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],                  # Regularization parameter
    'kernel': ['linear', 'rbf'],         # Kernel type
    'gamma': ['scale', 'auto']           # Kernel coefficient for 'rbf'
}


In [88]:
# Initialize the SVM classifier
svm = SVC(probability=True)


In [89]:
# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='f1',       # Use F1 score for evaluation
    cv=5,               # 5-fold cross-validation
    n_jobs=-1           # Utilize all available cores
)

# Fit GridSearchCV to the resampled training data
grid_search.fit(X_resampled, y_resampled)


In [90]:
# Get the best parameters
print("Best parameters found:")
print(grid_search.best_params_)

# Get the best estimator
best_model = grid_search.best_estimator_


Best parameters found:
{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


In [91]:
# Predict on the test data
y_pred = best_model.predict(X_test_vectorized)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.96      0.96       425
        True       0.66      0.71      0.69        49

    accuracy                           0.93       474
   macro avg       0.81      0.84      0.82       474
weighted avg       0.94      0.93      0.93       474

Confusion Matrix:
[[407  18]
 [ 14  35]]
Accuracy Score: 0.9324894514767933


In [93]:
# Save the best model
joblib.dump(best_model, 'svm_model.pkl')

# Specify the directory where you want to save the model
directory = r'C:\Users\aless\Desktop\4 geeks projects\ale-nlp-project\models'  # Replace with your actual path


# Define the full paths for the model and vectorizer
model_path = os.path.join(directory, 'svm_model.pkl')
vectorizer_path = os.path.join(directory, 'tfidf_vectorizer.pkl')

# Save the model
joblib.dump(best_model, model_path)

# Save the vectorizer
joblib.dump(vectorizer, vectorizer_path)


['C:\\Users\\aless\\Desktop\\4 geeks projects\\ale-nlp-project\\models\\tfidf_vectorizer.pkl']

test saved model

In [94]:
# Load the model
best_model = joblib.load(model_path)

# Load the vectorizer
vectorizer = joblib.load(vectorizer_path)

In [97]:
# Preprocess a new URL
def preprocess_url(url):
    # Remove protocols
    url = re.sub(r'https?://', '', url)
    # Tokenize
    tokens = re.split(r'[/?&=._-]', url)
    # Lowercase and remove empty tokens
    tokens = [token.lower() for token in tokens if token]
    # Remove stopwords
    stopwords = ['www', 'com', 'html', 'php']
    tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(tokens)

new_url = 'http://www.ikenmijnkunst.nl/index.php/exposities/exposities-2006'
processed_new_url = preprocess_url(new_url)

# Vectorize the new URL
new_url_vectorized = vectorizer.transform([processed_new_url])

# Make a prediction
prediction = best_model.predict(new_url_vectorized)
print(f"The URL is classified as: {'Spam' if prediction[0] else 'Not Spam'}")

The URL is classified as: Not Spam
