# Tutorial de Proyecto de NLP

### Importaciones

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score
import joblib
import warnings

###  1. Cargar el conjunto de datos

In [51]:
warnings.filterwarnings('ignore')

In [30]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv" 
df = pd.read_csv(url)
print(df.head()) # Para visualizar parte del dataset

                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True


In [31]:
print(df.info()) # para visualizar si hay nulos y tipo de datos entre otros

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB
None


### 2. Preprocesar los enlaces

In [32]:
# Descargar recursos de NLTK
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_url(url):
    # Separar tokens por caracteres especiales
    tokens = re.split(r'\W+', url)  
    # Pasar a minúsculas, eliminar stopwords y palabras vacías
    tokens = [lemmatizer.lemmatize(t.lower()) for t in tokens if t and t.lower() not in stop_words]
    return " ".join(tokens)

# Aplicar preprocesamiento
df['clean_url'] = df['url'].apply(preprocess_url)

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
# Dividir en train y test
X = df['clean_url']
y = df['is_spam']  # 1 = spam 0 = no spam
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Transformar texto a vectores
vectorizer = TfidfVectorizer(max_features=2000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### 3. Construye un SVM

In [35]:
# Modelo SVM con parámetros por defecto
svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [36]:
# Predicciones
y_pred_ev = svm_model.predict(X_test_tfidf)
y_pred_en = svm_model.predict(X_train_tfidf)

In [37]:
# Evaluación
print('------------train---------------------------')
print("Accuracy:", accuracy_score(y_train, y_pred_en))
print(classification_report(y_train, y_pred_en))
print(confusion_matrix(y_train, y_pred_en))
print('------------test---------------------------')
print("Accuracy:", accuracy_score(y_test, y_pred_ev))
print(classification_report(y_test, y_pred_ev))
print(confusion_matrix(y_test, y_pred_ev))

------------train---------------------------
Accuracy: 0.984160066694456
              precision    recall  f1-score   support

       False       0.98      1.00      0.99      1848
        True       0.99      0.94      0.96       551

    accuracy                           0.98      2399
   macro avg       0.99      0.97      0.98      2399
weighted avg       0.98      0.98      0.98      2399

[[1843    5]
 [  33  518]]
------------test---------------------------
Accuracy: 0.955
              precision    recall  f1-score   support

       False       0.96      0.98      0.97       455
        True       0.95      0.86      0.90       145

    accuracy                           0.95       600
   macro avg       0.95      0.92      0.94       600
weighted avg       0.95      0.95      0.95       600

[[448   7]
 [ 20 125]]


**Observaciones:**
- Los resultados son bastante buenos tanto en el set de train y test, sin embargo, se aprecia un overfitting en el conjunto de entrenamiento y podria darse una mejoria en casos verdaderos del conjunto de test. 
- Se procederá a realizar mejora con la busqueda de hiperparametros por grid search.

### 4. Optimizar el modelo anterior

In [46]:
param_grid = {
    'C': [0.1, 1, 10], # controla la penalización por errores
    'kernel': ['linear', 'rbf'], # define la función de transformación
    'gamma': ['scale', 'auto']   # es un parámetro del kernel RBF
}

# Hacer búsqueda exhaustiva de combinaciones de hiperparámetros anteriores usando cross-validation
grid = GridSearchCV(SVC(), param_grid, cv=3, verbose=2, n_jobs=-1, scoring=make_scorer(recall_score)) 
grid.fit(X_train_tfidf, y_train)

print("Mejores parámetros:", grid.best_params_)
y_pred_best = grid.predict(X_test_tfidf)
print("Accuracy tras optimización:", accuracy_score(y_test, y_pred_best))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   0.3s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.2s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.2s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   0.2s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.2s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   0.2s
[CV] END ......................C=0.1, gamma=auto

In [None]:
# Nuevo modelo con los parametros encontrados
svm2_model = SVC( C=0.5, kernel='linear', gamma='scale') 
# Aunque se halló un c=10 en la busqueda asignando 0.5 buscamos reducir el overfitting
svm2_model.fit(X_train_tfidf, y_train)

0,1,2
,C,0.5
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [48]:
# Predicciones modelo 2
y_pred_ev2 = svm2_model.predict(X_test_tfidf)
y_pred_en2 = svm2_model.predict(X_train_tfidf)

In [49]:
# Evaluación
print('------------train---------------------------')
print("Accuracy:", accuracy_score(y_train, y_pred_en2))
print(classification_report(y_train, y_pred_en2))
print(confusion_matrix(y_train, y_pred_en2))
print('------------test---------------------------')
print("Accuracy:", accuracy_score(y_test, y_pred_ev2))
print(classification_report(y_test, y_pred_ev2))
print(confusion_matrix(y_test, y_pred_ev2))

------------train---------------------------
Accuracy: 0.968736973739058
              precision    recall  f1-score   support

       False       0.96      1.00      0.98      1848
        True       0.99      0.87      0.93       551

    accuracy                           0.97      2399
   macro avg       0.98      0.94      0.95      2399
weighted avg       0.97      0.97      0.97      2399

[[1843    5]
 [  70  481]]
------------test---------------------------
Accuracy: 0.9416666666666667
              precision    recall  f1-score   support

       False       0.94      0.99      0.96       455
        True       0.95      0.80      0.87       145

    accuracy                           0.94       600
   macro avg       0.95      0.89      0.92       600
weighted avg       0.94      0.94      0.94       600

[[449   6]
 [ 29 116]]


### Guardar los modelos

In [44]:
# Guardar los modelos
joblib.dump(svm_model, "svm_url_spam_model.pkl")
joblib.dump(svm2_model, "svm_url_spam_model2.pkl")

# Guardar el vectorizador TF-IDF
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

**Observaciones finales:**
- Aunque los resultados iniciales del modelo SVM eran bastante buenos, se busco mejorar el overfitting y el recall en casos de positivos en el conjunto de test, por lo que se realizo una busqueda exhaustiva.
- Una vez realizada la busqueda se opto por usar un C = 0.5, a pesar de la sugerencia de un 10 de la busqueda, dado que este es uno de los parametros que controla la tolerancia a errores en el entrenamiento, sin embargo esto tambien hizo que los resultados en los casos positivos de test bajaran hasta un 0.6 en el recall.
- El modelo inicial tiene resultados bastante buenos y en este caso donde el set ni el modelo requieren mucho procesamiento, puede ser una excelente opción.
- Es posible que el recall en los casos positivos en ambos conjuntos de datos este por debajo de 90% debido al desbalance de clases del dataset.