# Explore here

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import re
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

url = "https://breathecode.herokuapp.com/asset/internal-link?id=435&path=url_spam.csv"
df = pd.read_csv(url)
print(df.head())

                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True


In [14]:
print(df.columns)

Index(['url', 'is_spam', 'clean_url'], dtype='object')


In [None]:
def preprocess_url(url):
    url = str(url).lower()  
    url = re.sub(r'[^\w]', ' ', url)  
    url = re.sub(r'\s+', ' ', url)  
    return url.strip()

df['clean_url'] = df['url'].apply(preprocess_url)


df = df[df['clean_url'].str.strip() != '']


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_url'])

y = df['is_spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.95      1.00      0.97       455
        True       0.98      0.83      0.90       145

    accuracy                           0.96       600
   macro avg       0.97      0.92      0.94       600
weighted avg       0.96      0.96      0.96       600



In [18]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

Best parameters: {'C': 10, 'kernel': 'rbf'}
Best score: 0.9666579679888656


In [20]:
import joblib
joblib.dump(grid.best_estimator_, 'spam_url_detector.pkl')

['spam_url_detector.pkl']