# Explore here

In [1]:
# Your code here
import pandas as pd

# Remote CSV holding the URL spam dataset used throughout this notebook.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


Paso 2: Preprocesa los enlaces
Utiliza lo visto en este módulo para transformar los datos para compatibilizarlos con el modelo que queremos entrenar. Segmenta las URLs en partes según sus signos de puntuación, elimina las stopwords, lematiza, etcétera.

Asegúrate de dividir convenientemente el conjunto de datos en train y test como hemos visto en lecciones anteriores.

In [2]:
import re

def segment_url(url):
    """Split a URL into its alphanumeric tokens.

    The URL is cut on every run of non-word characters (anything other than
    letters, digits and underscore), e.g. ``://``, ``.``, ``/``, ``-``, ``#``.

    Args:
        url: The URL string to tokenize.

    Returns:
        A list of the non-empty tokens in their original order and casing.
        An empty string yields an empty list.
    """
    # re.split emits '' when the string starts or ends with a separator
    # (e.g. a trailing '/'); those empty pieces carry no information, so drop them.
    return [piece for piece in re.split(r'\W+', url) if piece]

# Tokenize every URL; each row of the new column holds that URL's token list.
df['segmented_url'] = df['url'].map(segment_url)


Segmentación de URLs por signos de puntuación: se dividen las URLs en partes en función de los signos de puntuación, como el punto, la barra y los guiones.

In [4]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.2-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.7.24-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.8.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 15.9 MB/s eta 0:00:00
Downloading regex-2024.7.24-cp312-cp312-win_amd64.whl (269 kB)
Downloading tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.8.2 regex-2024.7.24 tqdm-4.66.5
Note: you may need to restart the kernel to use updated packages.


In [5]:
import nltk
from nltk.corpus import stopwords

# Fetch NLTK's stopwords package, then keep the English list as a set so
# membership checks are constant-time.
nltk.download('stopwords')
stop_words = {entry for entry in stopwords.words('english')}

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not English stopwords.

    Each token is lowercased only for the lookup in the module-level
    ``stop_words`` set; kept tokens retain their original casing and order.
    """
    kept = []
    for token in words:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stopwords from every token list produced by the segmentation step.
df['filtered_url'] = df['segmented_url'].map(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rgarciamontero\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Eliminación de stopwords: utilizando la biblioteca nltk, se eliminan las stopwords comunes del inglés.

In [6]:
from nltk.stem import WordNetLemmatizer

# Download the WordNet package through NLTK's downloader, then create the
# single lemmatizer instance that lemmatize_words reuses for every token.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_words(words):
    """Lemmatize every token in ``words`` with the module-level ``lemmatizer``.

    Returns a new list of the same length and order as the input.
    """
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the stopword-free token list of every URL.
df['lemmatized_url'] = df['filtered_url'].map(lemmatize_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rgarciamontero\AppData\Roaming\nltk_data...


Lematización: se utiliza WordNetLemmatizer de nltk para lematizar las palabras.

In [8]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [11]:
# Preview the first five rows, now including the preprocessing columns.
df.head(n=5)

Unnamed: 0,url,is_spam,segmented_url,filtered_url,lemmatized_url
0,https://briefingday.us8.list-manage.com/unsubs...,True,"[https, briefingday, us8, list, manage, com, u...","[https, briefingday, us8, list, manage, com, u...","[http, briefingday, us8, list, manage, com, un..."
1,https://www.hvper.com/,True,"[https, www, hvper, com, ]","[https, www, hvper, com, ]","[http, www, hvper, com, ]"
2,https://briefingday.com/m/v4n3i4f3,True,"[https, briefingday, com, m, v4n3i4f3]","[https, briefingday, com, v4n3i4f3]","[http, briefingday, com, v4n3i4f3]"
3,https://briefingday.com/n/20200618/m#commentform,False,"[https, briefingday, com, n, 20200618, m, comm...","[https, briefingday, com, n, 20200618, comment...","[http, briefingday, com, n, 20200618, commentf..."
4,https://briefingday.com/fan,True,"[https, briefingday, com, fan]","[https, briefingday, com, fan]","[http, briefingday, com, fan]"


In [12]:
#train y test

from sklearn.model_selection import train_test_split

# Features: re-join each token list into one space-separated string per URL.
X = df['lemmatized_url'].apply(' '.join)
# Target: the is_spam flag of each URL.
y = df['is_spam']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Paso 3: Construye un SVM
Comienza a resolver el problema implementando un SVM con los parámetros por defecto. Entrénalo y analiza sus resultados.

Paso 4: Optimiza el modelo anterior
Después de entrenar el SVM, optimiza sus hiperparámetros utilizando un grid search o un random search.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# Turn the URL strings into TF-IDF features. The vectorizer is fitted on the
# training split only and then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Baseline classifier: an SVM with default hyperparameters (fit returns the estimator).
svm = SVC().fit(X_train_tfidf, y_train)

# Score the baseline on the held-out test split.
y_pred = svm.predict(X_test_tfidf)
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(f"Accuracy: {baseline_accuracy}")
print(baseline_report)


Accuracy: 0.9466666666666667
              precision    recall  f1-score   support

       False       0.96      0.97      0.97       455
        True       0.91      0.87      0.89       145

    accuracy                           0.95       600
   macro avg       0.93      0.92      0.93       600
weighted avg       0.95      0.95      0.95       600



In [18]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter optimization

# Search space for the SVM hyperparameters.
parameter_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Exhaustive grid search: 5-fold cross-validation on the training split,
# scored by accuracy, with n_jobs=-1 to run the fits in parallel.
grid_search = GridSearchCV(SVC(), parameter_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Report the best combination found.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the full training split, so best_estimator_ is ready to
# use; fitting it a second time would only repeat that work.
best_svm = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
best_y_pred = best_svm.predict(X_test_tfidf)
print(f"Optimized Accuracy: {accuracy_score(y_test, best_y_pred)}")
print(classification_report(y_test, best_y_pred))



Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Optimized Accuracy: 0.9466666666666667
              precision    recall  f1-score   support

       False       0.98      0.95      0.96       455
        True       0.86      0.92      0.89       145

    accuracy                           0.95       600
   macro avg       0.92      0.94      0.93       600
weighted avg       0.95      0.95      0.95       600



Paso 5: Guarda el modelo

In [19]:
import joblib

# Persist the tuned SVM together with the fitted TF-IDF vectorizer: the model
# was trained on vectors produced by that vectorizer, so it cannot score new
# URLs without it.
joblib.dump(best_svm, 'svm_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Reload both artifacts to confirm they can be read back.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_svm = joblib.load('svm_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
