In [None]:
import pandas as pd

url = 'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'
total_data = pd.read_csv(url)

total_data.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


Transformación categórica a numérica

In [66]:
total_data["is_spam"] = total_data["is_spam"].astype(int)
total_data.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


In [67]:
total_data = total_data.drop_duplicates()
total_data = total_data.reset_index(inplace = False, drop = True)
print(f"Spam: {len(total_data.loc[total_data['is_spam'] == 1])}")
print(f"No spam: {len(total_data.loc[total_data['is_spam'] == 0])}")

Spam: 244
No spam: 2125


In [68]:
total_data.shape

(2369, 2)

Voy ha aplicar tanto el procesamiento del texto como la lematización y eliminación de stopwords en un mismo código

In [69]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

irrelevant_tokens = {'http', 'https:', 'www', 'com', 'net', 'org'} #al tratarse de urls creo que estos tokens se pueden eliminar

def preprocess_url(url):
    #separar por '/', '.', '-', etc.
    tokens = re.split(r'[/.?-]+', url)
    tokens = [token.lower() for token in tokens if token and token.lower() not in irrelevant_tokens]    
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens


[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [70]:
total_data["url"] = total_data["url"].apply(preprocess_url)
print(total_data.head())

                                             url  is_spam
0  [briefingday, us8, list, manage, unsubscribe]        1
1                                        [hvper]        1
2                        [briefingday, v4n3i4f3]        1
3      [briefingday, n, 20200618, m#commentform]        0
4                             [briefingday, fan]        1


Convertir el texto en números

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

tokens_list = total_data["url"]
tokens_list = [" ".join(tokens) for tokens in tokens_list]

vectorizer = TfidfVectorizer(max_features = 5000, max_df = 0.8, min_df = 5)
X = vectorizer.fit_transform(tokens_list).toarray()
y = total_data["is_spam"]

In [None]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2369, 665))

Entrenamiento del modelo

In [72]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [77]:
from sklearn.svm import SVC

model = SVC(kernel = "rbf", C = 1.0, gamma = 0.5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [78]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.959915611814346

95% de precisión me parece un resultado bastante bueno para un modelo de este tipo la verdad, pero vamos a ver si se puede mejorar optimizando los hiperparámetros

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# espacio de hiperparámetros a explorar
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

grid_search = GridSearchCV(SVC(), param_grid, refit=True, cv=5, verbose=2)
grid_search.fit(X_train, y_train)

print("Mejores parámetros:", grid_search.best_params_)
print("Mejor modelo:", grid_search.best_estimator_)

y_pred = grid_search.predict(X_test)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.2s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.1s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.1s
[CV] END ..................C=0.1, gamma=0.001, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=0.001, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=0.001, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=0.001, kernel=linear; total time=   0.2s
[CV] END ..................C=0.1, gamma=0.001, kernel=linear; total time=   0.2s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.2s
[CV] END ......................C=0.1, gamma=0.0

In [80]:
accuracy_score(y_test, y_pred)

0.9641350210970464

Bueno, ha mejorado un 0.5% algo es algo