# Explore here

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Cargar el dataset
file_path = '../data/raw/url_spam.csv'
df = pd.read_csv(file_path)

# Análisis descriptivo básico
shape = df.shape
info = df.info()
describe = df.describe()

(shape, info, describe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB


((2999, 2),
 None,
                                          url is_spam
 count                                   2999    2999
 unique                                  2369       2
 top     https://www.bloomberg.com/tosv2.html   False
 freq                                      26    2303)

In [3]:
# 1. Verificar si existen duplicados
duplicados = df.duplicated().sum()
print(f"Número de filas duplicadas: {duplicados}")

# Si existen duplicados, los eliminamos
if duplicados > 0:
    df = df.drop_duplicates()
    print(f"Duplicados eliminados. El nuevo tamaño del dataset es: {df.shape}")

# 2. Análisis de valores nulos
valores_nulos = df.isnull().sum()
print("\nValores nulos por columna:")
print(valores_nulos)

# Mostrar el porcentaje de valores nulos por columna
porcentaje_nulos = (valores_nulos / df.shape[0]) * 100
print("\nPorcentaje de valores nulos por columna:")
print(porcentaje_nulos)

Número de filas duplicadas: 630
Duplicados eliminados. El nuevo tamaño del dataset es: (2369, 2)

Valores nulos por columna:
url        0
is_spam    0
dtype: int64

Porcentaje de valores nulos por columna:
url        0.0
is_spam    0.0
dtype: float64


In [7]:
import nltk

# Descargar las stopwords
nltk.download('stopwords')

# Descargar el lematizador WordNet
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Inicializar el lematizador
lemmatizer = WordNetLemmatizer()

# Definir las stopwords para URLs
url_stopwords = set(stopwords.words('english')).union(set(['www', 'http', 'https', 'com', 'org', 'net']))

def preprocess_url(url):
    # Convertir a minúsculas
    url = url.lower()
    
    # Segmentar la URL usando signos de puntuación
    tokens = re.split(r'\W+', url)
    
    # Eliminar stopwords y lematizar
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in url_stopwords and word.isalpha()]
    
    return ' '.join(tokens)

# Aplicar el preprocesamiento a todas las URLs
df['processed_url'] = df['url'].apply(preprocess_url)

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(df['processed_url'], df['is_spam'], test_size=0.3, random_state=42)

# Mostrar las primeras filas del conjunto de entrenamiento preprocesado
print(X_train.head())

565       theskimm pick thing youll need smooth road trip
2419    newyorker culture essay coming rising fifty ye...
297                 inverse mind body first stay home dad
1844    wsj article mask could help stop coronavirus s...
238      axios barr trump southern district new york html
Name: processed_url, dtype: object


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Vectorización de las URLs usando TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Entrenamiento del modelo SVM
svm_model = SVC(kernel='rbf')  # Utilizamos un kernel lineal
svm_model.fit(X_train_tfidf, y_train)

# Predicción en el conjunto de prueba
y_pred = svm_model.predict(X_test_tfidf)

# Evaluación del modelo
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy del modelo SVM: {accuracy:.2f}")
print("Reporte de clasificación:")
print(report)

Accuracy del modelo SVM: 0.94
Reporte de clasificación:
              precision    recall  f1-score   support

       False       0.95      0.99      0.97       641
        True       0.83      0.49      0.61        70

    accuracy                           0.94       711
   macro avg       0.89      0.74      0.79       711
weighted avg       0.93      0.94      0.93       711



In [13]:
from sklearn.model_selection import GridSearchCV

# Definir los parámetros para la búsqueda
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': [1, 0.1, 0.01, 0.001]
}

# Crear el modelo SVM
svm = SVC()

# Implementar Grid Search con validación cruzada
grid_search = GridSearchCV(svm, param_grid, refit=True, verbose=2, cv=5)
grid_search.fit(X_train_tfidf, y_train)

# Mejor combinación de hiperparámetros
print("Mejores hiperparámetros encontrados por Grid Search:")
print(grid_search.best_params_)

# Evaluación del modelo optimizado en el conjunto de prueba
y_pred_optimized = grid_search.predict(X_test_tfidf)
accuracy_optimized = accuracy_score(y_test, y_pred_optimized)
report_optimized = classification_report(y_test, y_pred_optimized)

print(f"Accuracy del modelo SVM optimizado: {accuracy_optimized:.2f}")
print("Reporte de clasificación del modelo optimizado:")
print(report_optimized)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.1s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   0.1s
[CV] END ....................C=0.1, gamma=0.1, 

In [17]:
import joblib

# Guardar el modelo optimizado en un archivo
joblib.dump(grid_search.best_estimator_, 'svm_spam_detector.pkl')

['svm_spam_detector.pkl']