# Explore here

In [1]:
import pandas as pd

# Remote CSV holding the labelled URLs (columns: `url`, `is_spam`)
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"

# Download the CSV and parse it straight into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five rows as a quick check that the load worked
print(df.head(n=5))


                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True


In [7]:
import nltk
# Fetch the NLTK resources this notebook relies on further down: the stopword
# lists (read via `stopwords.words('english')`) and the WordNet corpus (used by
# `WordNetLemmatizer`).
nltk.download('stopwords')
# Trailing expression of the cell: its return value is what the notebook echoes.
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...


True

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Re-read the labelled URL dataset so this cell does not depend on earlier cells
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Shared text-processing helpers used by `preprocess_url`:
#   tokenizer  - splits text into runs of word characters (\w+)
#   stop_words - English stopwords, held in a set for fast membership tests
#   lemmatizer - WordNet-based lemmatizer
tokenizer = RegexpTokenizer(pattern=r'\w+')
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Function to preprocess a single URL
def preprocess_url(url):
    """Turn one raw URL into a list of cleaned, lemmatized tokens.

    The URL is lower-cased, split by the notebook-level ``tokenizer``,
    filtered against the notebook-level ``stop_words`` set, and every
    remaining token is passed through the notebook-level ``lemmatizer``.

    Args:
        url: The raw URL string. Any non-string value (for example the float
            ``NaN`` that pandas uses for an empty CSV cell, or ``None``) is
            treated as containing no tokens.

    Returns:
        list: The surviving tokens in their original order. Empty when
        ``url`` is not a string or yields no usable tokens.
    """
    # Missing values arrive as NaN/None when this is applied to a DataFrame
    # column; calling .lower() on them would raise AttributeError.
    if not isinstance(url, str):
        return []

    # Tokenize the lower-cased URL
    tokens = tokenizer.tokenize(url.lower())
    # Drop stopwords first, then lemmatize what is left
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# Run every URL through `preprocess_url`; each row of the new column is a token list
df['processed_url'] = df['url'].map(preprocess_url)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_url'],
    df['is_spam'],
    test_size=0.2,
    random_state=42,
)

# Peek at a few training rows to confirm the tokenization looks sensible
print(X_train.head())



1569    [http, www, morningbrew, com, daily, story, 20...
2229    [http, www, morningbrew, com, daily, story, 20...
2296    [http, www, nytimes, com, article, maskne, acn...
1800    [http, podcasts, apple, com, u, podcast, foxy,...
1273    [http, www, nycpride, org, event, nyc, pride, ...
Name: processed_url, dtype: object


In [9]:
# Sanity check: list the DataFrame's columns (e.g. to confirm `processed_url` was added).
print(df.columns)


Index(['url', 'is_spam', 'processed_url'], dtype='object')


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# TfidfVectorizer expects raw strings, so glue each token list back together
# with single spaces
X_train_str = X_train.map(' '.join)
X_test_str = X_test.map(' '.join)

# Learn the vocabulary and IDF weights on the training split only, then reuse
# the fitted vectorizer to encode the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_str)
X_test_tfidf = vectorizer.transform(X_test_str)

# Baseline model: an SVM with scikit-learn's default hyperparameters,
# fitted on the TF-IDF training matrix (fit() returns the estimator itself)
svm_classifier = SVC().fit(X_train_tfidf, y_train)

# Score the held-out URLs
y_pred = svm_classifier.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.97      0.97       455
        True       0.91      0.88      0.89       145

    accuracy                           0.95       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600



In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Search space sampled by RandomizedSearchCV
param_grid = dict(
    C=uniform(loc=0, scale=10),                   # regularization strength, drawn from [0, 10]
    gamma=['scale', 'auto'],                      # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],  # kernel type
)

# Fresh, unfitted SVM to act as the search's base estimator
svm_classifier = SVC()

# Try 10 random hyperparameter combinations, each scored with 5-fold
# cross-validation on the training split; the seed fixes which ones are drawn.
# fit() returns the search object itself.
random_search = RandomizedSearchCV(
    estimator=svm_classifier,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Report the winning combination
print("Best Parameters:", random_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test split
best_svm = random_search.best_estimator_
y_pred_best = best_svm.predict(X_test_tfidf)

# Per-class precision / recall / F1 for the tuned model
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))


Best Parameters: {'C': 3.0424224295953772, 'gamma': 'auto', 'kernel': 'linear'}
Classification Report (Best Model):
              precision    recall  f1-score   support

       False       0.98      0.95      0.96       455
        True       0.86      0.94      0.89       145

    accuracy                           0.95       600
   macro avg       0.92      0.94      0.93       600
weighted avg       0.95      0.95      0.95       600

