# URL spam detection with TF-IDF and an SVM classifier

In [2]:
# Manejo de los datos
import pandas as pd
# Preprocesado
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from urllib.parse import urlparse
import re
# Division del conjunto de datos
from sklearn.model_selection import train_test_split
# Implementacion del modelo y vectorizacion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
# Optimizacion
from sklearn.model_selection import GridSearchCV
# Serializacion
import joblib

In [3]:
# Remote CSV of labelled URLs; the preview below shows `url` and `is_spam` columns
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"

data = pd.read_csv(DATA_URL)
data.head(6)

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
5,https://www.brookings.edu/interactives/reopeni...,False


In [4]:
# URL pre-processing setup: fetch the NLTK resources used by preprocess_url
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers: WordNet lemmatizer and the English stop words as a set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_url(data):
    """Turn the path of a URL into a space-separated string of cleaned tokens.

    Only the path component of the parsed URL is used. The path is split on
    runs of non-word characters; each non-empty piece is lower-cased, dropped
    if it is in the module-level ``stop_words`` set, and otherwise lemmatized
    with the module-level ``lemmatizer``.

    Parameters
    ----------
    data : str
        A single URL.

    Returns
    -------
    str
        The surviving tokens joined with single spaces (empty string if none).
    """
    url_path = urlparse(data).path

    kept = []
    for piece in re.split(r'\W+', url_path):
        # Splitting yields empty strings at the path boundaries; skip them
        if piece == '':
            continue
        lowered = piece.lower()
        # Stop words are filtered before lemmatization
        if lowered in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(lowered))

    return " ".join(kept)

# Run the pre-processing on every URL and store the result in a new column
data["cleaned_url"] = data["url"].map(preprocess_url)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data["cleaned_url"],
    data["is_spam"],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Baseline pipeline: TF-IDF vectorization feeding an SVC with default settings,
# fitted on the training split (fit returns the pipeline itself)
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC()),
]).fit(X_train, y_train)

# Evaluate the baseline on the held-out test split
score = model.score(X_test, y_test)
print(f"Accuracy: {score * 100:.2f}%")

Accuracy: 90.50%


In [7]:
# Hyperparameter tuning: candidate values for the 'svc' step of the pipeline
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
}

# 5-fold cross-validated search over the grid, scored by accuracy,
# run on the training split only (fit returns the search object itself)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

print("Mejor score: ", grid_search.best_score_)
print("Mejores parámetros: ", grid_search.best_params_)

Mejor score:  0.9158002783576895
Mejores parámetros:  {'svc__C': 1, 'svc__kernel': 'linear'}


In [8]:
from pickle import dump

# Persist the tuned pipeline rather than the baseline `model`:
# `grid_search.best_estimator_` is the pipeline refit by GridSearchCV with the
# best parameters found in the search above.
# The `with` block guarantees the file is flushed and closed even if pickling
# raises, instead of leaving an open handle behind.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open("../models/detector_spam.sawb", "wb") as model_file:
    dump(grid_search.best_estimator_, model_file)