# Explore here

In [2]:
!pip install pandas regex nltk scikit-learn matplotlib wordcloud pickle-mixin


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Collecting regex
  Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (792 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Collecting matplotlib
  Using cached matplotlib-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
Collecting wordcloud
  Using cached wordcloud-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (547 kB)
Collecting pickle-mixin
  Using cached pickle_mixin-1.0.2-py3-none-any.whl
Collecting numpy>=1.23.2 (from pandas)
  Using cached numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Collecting pytz>=2020.1 (from pandas)
  Usi

In [None]:
import pandas as pd

# Cargar CSV directamente desde GitHub
df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv")

# Ver dimensiones y primeras filas
print("Dimensiones del dataset:", df.shape)
df.head()

Dimensiones del dataset: (2999, 2)


Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [7]:
import regex as re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download

# Descargar recursos de NLTK necesarios
download("stopwords")
download("wordnet")

# Inicializar herramientas
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Función de preprocesamiento
def preprocess_url(url):
    # Convertir a minúsculas
    url = url.lower()
    
    # Reemplazar signos comunes por espacio
    url = re.sub(r"[\/\.\-_=:\?&%]+", " ", url)

    # Eliminar caracteres no alfabéticos
    url = re.sub(r"[^a-z ]+", "", url)

    # Tokenizar, lematizar y quitar stopwords/palabras cortas
    tokens = url.split()
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    tokens = [tok for tok in tokens if tok not in stop_words and len(tok) > 2]

    return " ".join(tokens)

# Aplicar la transformación
df["tokens"] = df["url"].apply(preprocess_url)

# Ver resultado
df[["url", "tokens", "is_spam"]].head()

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...


Unnamed: 0,url,tokens,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,http briefingday list manage com unsubscribe,True
1,https://www.hvper.com/,http www hvper com,True
2,https://briefingday.com/m/v4n3i4f3,http briefingday com vnif,True
3,https://briefingday.com/n/20200618/m#commentform,http briefingday com mcommentform,False
4,https://briefingday.com/fan,http briefingday com fan,True


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Unir tokens en una cadena para cada fila
corpus = df["tokens"].values

# Vectorizar con TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, max_df=0.8, min_df=3)
X = vectorizer.fit_transform(corpus).toarray()

# Etiquetas: convertir booleanos a enteros (True → 1, False → 0)
y = df["is_spam"].astype(int)

# División en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Forma del conjunto de entrenamiento:", X_train.shape)
print("Forma del conjunto de prueba:", X_test.shape)


Forma del conjunto de entrenamiento: (2399, 1369)
Forma del conjunto de prueba: (600, 1369)


In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Inicializar y entrenar el modelo SVM con kernel lineal
model = SVC(kernel="linear", random_state=42)
model.fit(X_train, y_train)

# Predecir sobre el conjunto de prueba
y_pred = model.predict(X_test)

# Evaluar el modelo
print(" Accuracy:", accuracy_score(y_test, y_pred))
print("\n Reporte de Clasificación:\n", classification_report(y_test, y_pred))
print("\n Matriz de Confusión:\n", confusion_matrix(y_test, y_pred))


 Accuracy: 0.92

 Reporte de Clasificación:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95       455
           1       0.89      0.77      0.82       145

    accuracy                           0.92       600
   macro avg       0.91      0.87      0.89       600
weighted avg       0.92      0.92      0.92       600


 Matriz de Confusión:
 [[441  14]
 [ 34 111]]


In [10]:
from sklearn.model_selection import GridSearchCV

# Definir el modelo base
svc = SVC()

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# GridSearch con validación cruzada de 5 folds
grid_search = GridSearchCV(
    svc, param_grid, cv=5, scoring='f1', verbose=1, n_jobs=-1
)

# Ajustar el modelo
grid_search.fit(X_train, y_train)

# Mostrar mejores parámetros encontrados
print("🔧 Mejores parámetros:", grid_search.best_params_)

# Evaluar el mejor modelo en el test set
best_model = grid_search.best_estimator_
y_pred_opt = best_model.predict(X_test)

print("Accuracy optimizado:", accuracy_score(y_test, y_pred_opt))
print("\n Reporte optimizado:\n", classification_report(y_test, y_pred_opt))
print("\n Matriz de confusión optimizada:\n", confusion_matrix(y_test, y_pred_opt))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
🔧 Mejores parámetros: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy optimizado: 0.95

 Reporte optimizado:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97       455
           1       0.91      0.88      0.89       145

    accuracy                           0.95       600
   macro avg       0.94      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600


 Matriz de confusión optimizada:
 [[443  12]
 [ 18 127]]


In [11]:
import pickle

# Guardar el modelo en archivo .sav
filename = "svm_url_spam_rbf_c10.sav"
with open(filename, "wb") as f:
    pickle.dump(best_model, f)

print(f"Modelo guardado como '{filename}'")


Modelo guardado como 'svm_url_spam_rbf_c10.sav'
