# Explore here

In [73]:
# Galerias
import pandas as pd
import numpy as np
import regex as re

import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC

import warnings

In [74]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df = pd.read_csv(url)
df

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
...,...,...
2994,https://www.smartcitiesworld.net/news/news/dee...,False
2995,https://www.youtube.com/watch,True
2996,https://techcrunch.com/2019/07/04/an-optimisti...,False
2997,https://www.technologyreview.com/2019/12/20/13...,False


In [75]:
# Descarga de los recursos 
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [76]:
# Limpieza de caracteres innecesarios

def clean_url(url):
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'[^a-zA-Z ]', ' ', url)
    url = re.sub(r'\s+', ' ', url)
    return url.lower()

# Division de tokens y lematizacion

def preprocess_text(url):
    url = clean_url(url)
    tokens = url.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 3]
    return " ".join(tokens)

df['is_spam'] = df['is_spam'].astype(int)
df['url'] = df['url'].apply(preprocess_text)

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


El data set contiene 2999filas y 2 columnas.
*Url*: almacena str dedirecciones web.
*is_spam*: columna booleana. *True* y *False*

In [78]:
# Vectorizar 
vectorizer = TfidfVectorizer(max_features=5000, max_df=0.85, min_df=3, ngram_range=(1, 2))
X = vectorizer.fit_transform(df["url"]).toarray()
y = df["is_spam"]


In [79]:
# Dividimos en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [80]:
# Balanceamos los valores son Smote
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


In [81]:
# Escalamos los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)
X_test_scaled.shape , X_test_scaled.shape

((600, 1917), (600, 1917))

In [82]:
# Entreno el modelo SVM con kernel RBF
model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
model.fit(X_train_scaled, y_train_balanced)
y_p = model.predict(X_test_scaled)
print(classification_report(y_test, y_p))


              precision    recall  f1-score   support

           0       0.96      0.90      0.93       461
           1       0.73      0.87      0.80       139

    accuracy                           0.90       600
   macro avg       0.85      0.89      0.86       600
weighted avg       0.91      0.90      0.90       600



Clase *No spam* tiene una precisión alta del 96%, lo que significa que el modelo detecta correctamente la mayoría de los correos legítimos. Clase *Spam* tiene un recall del 87%, lo que indica que identifica la mayoría de los correos spam, aunque la precisión es del 73%.
El F1-score de 80% para spam muestra un buen equilibrio entre precisión y recall, pero aún podría mejorarse, para ello es recomendarle un GridSearchCV

In [83]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.01, 0.1, 1, 10, 50],
    'gamma': ['scale', 'auto']
}

grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid.fit(X_train_scaled, y_train_balanced)

print("Mejores parámetros:", grid.best_params_)

# Mejor modelo
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   6.7s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   6.9s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   6.9s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   6.7s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   6.9s
[CV] END ....................C=0.01, gamma=scale, kernel=rbf; total time=  16.5s
[CV] END ....................C=0.01, gamma=scale, kernel=rbf; total time=  15.4s
[CV] END ....................C=0.01, gamma=scale, kernel=rbf; total time=  15.5s
[CV] END ....................C=0.01, gamma=scale, kernel=rbf; total time=  16.2s
[CV] END ..................C=0.01, gamma=auto, kernel=linear; total time=   6.1s
[CV] END ....................C=0.01, gamma=scale, kernel=rbf; total time=  14.9s
[CV] END ..................C=0.01, gamma=auto, 

Accuracy nos dio un 95% es decir que las predicciones fueron correctas, lo cual es excelente. 
Macro Avg 93% , 94% y 93% →  promedio entre clases.
Weighted Avg arrojo un 95% lo que muestra que el modelo es consistente en ambas clases.
Clase *No Spam*, tiene alta precisión.
Clase *Spam*, tiene un buen recall, indicando que detecta correctamente la mayoría de los correos spam.
La combinación de precisión, recall y F1-score demuestra que el modelo está bien equilibrado.

En conclusion podemos decir que el modelo mejoro un 5% con relacion al anterior dandonos asi una mejor precision.