In [62]:
# Load the dataset into a pandas DataFrame

# Import the pandas library
import pandas as pd

# Location of the URL spam CSV on disk
dataset_path = '/workspaces/NLP-project-tutorial-omass/url_spam.csv'

# Read the CSV into a DataFrame
df = pd.read_csv(dataset_path)

# Preview the top rows as a quick sanity check that the load worked
df.head(5)


Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [63]:
# Enhanced Preprocessing

from sklearn.feature_extraction.text import TfidfVectorizer
from urllib.parse import urlparse

def preprocess_url(url):
    """Turn a URL into a space-separated string of lowercase tokens.

    The tokens are, in order: the network location (host), each non-empty
    segment of the path (split on "/"), and each non-empty query parameter
    (split on "&", so "key=value" stays a single token). The scheme and the
    fragment ("#...") are not included.

    Args:
        url: The URL string to tokenize. Non-string values (for example
            None or the NaN that pandas uses for missing cells) are treated
            as an empty URL.

    Returns:
        A single string with the tokens joined by one space, or '' when
        there is nothing to tokenize.
    """
    # Missing cells arrive as None/NaN; urlparse would raise on them.
    if not isinstance(url, str):
        return ''

    parsed_url = urlparse(url)

    # Host first, then path segments, then query parameters.
    candidates = [
        parsed_url.netloc,
        *parsed_url.path.split('/'),
        *parsed_url.query.split('&'),
    ]

    # Drop every empty piece -- including an empty host, which occurs for
    # scheme-less input such as "example.com/path" -- so the result never
    # starts with a stray space.
    return ' '.join(token.lower() for token in candidates if token)

# Tokenize every URL and keep the result in a new 'processed_url' column
df['processed_url'] = df['url'].apply(preprocess_url)

# Show the raw and tokenized URLs side by side for a quick visual check
df.loc[:, ['url', 'processed_url']].head()


Unnamed: 0,url,processed_url
0,https://briefingday.us8.list-manage.com/unsubs...,briefingday.us8.list-manage.com unsubscribe
1,https://www.hvper.com/,www.hvper.com
2,https://briefingday.com/m/v4n3i4f3,briefingday.com m v4n3i4f3
3,https://briefingday.com/n/20200618/m#commentform,briefingday.com n 20200618 m
4,https://briefingday.com/fan,briefingday.com fan


In [65]:
# Feature Extraction with TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Treat every whitespace-delimited chunk of 'processed_url' as one token, so
# pieces such as "briefingday.com" stay intact instead of being split on
# punctuation. min_df=1 keeps tokens that appear in only one URL.
tfidf_vectorizer = TfidfVectorizer(min_df=1, token_pattern=r'\S+')

# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights also see the test rows.
# Consider fitting on the training portion only.
try:
    X = tfidf_vectorizer.fit_transform(df['processed_url'])
    y = df['is_spam']
    print("Vectorization successful.")
except ValueError as e:
    print(f"Vectorization failed with error: {e}")
    # Re-raise: without X and y the following cells would either fail with a
    # confusing NameError or silently reuse stale values from an earlier run.
    raise


Vectorization successful.


In [66]:
# Splitting the Dataset

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [67]:
# Initialize and Train the SVM

from sklearn.svm import SVC

# SVM hyperparameters (tune as needed)
kernel_type = 'rbf'
gamma_value = 'scale'
C_value = 1.2

# Build the classifier from the settings above, then fit it on the training split
svm_model = SVC(kernel=kernel_type, gamma=gamma_value, C=C_value)
svm_model.fit(X_train, y_train)


In [68]:
# Model Prediction

# Predict labels for the held-out test features with the fitted SVM
y_pred = svm_model.predict(X_test)


In [69]:
# Evaluate Model Performance

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the test-set predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

       False       0.94      1.00      0.97       455
        True       1.00      0.81      0.90       145

    accuracy                           0.95       600
   macro avg       0.97      0.91      0.93       600
weighted avg       0.96      0.95      0.95       600



In [73]:
# Calculate Accuracy Score and Display Confusion Matrix

from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fraction of test URLs whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)


Accuracy: 0.95
[[455   0]
 [ 27 118]]


In [74]:
# Import the pickle library
import pickle

# Filenames for the saved artifacts
model_filename = 'trained_svm_model.pkl'
vectorizer_filename = 'tfidf_vectorizer.pkl'

# Save the trained classifier
with open(model_filename, 'wb') as file:
    pickle.dump(svm_model, file)

# The SVM was trained on TF-IDF features, so the fitted vectorizer has to be
# saved as well: without it, new URLs cannot be mapped into the feature space
# the model expects.
with open(vectorizer_filename, 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a trusted source.
print(f"Model saved successfully as {model_filename}.")
print(f"Vectorizer saved successfully as {vectorizer_filename}.")


Model saved successfully as trained_svm_model.pkl.
