## **12-NLP**

## **1. Instalación e Importaciones**

In [90]:
!pip install unidecode
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [103]:
import pandas as pd
import re
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## **2. Abrir Dataset**

In [104]:

# Remote location of the URL-spam CSV; pandas downloads and parses it directly.
url = "https://breathecode.herokuapp.com/asset/internal-link?id=932&path=url_spam.csv"
df = pd.read_csv(url)

# Quick overview, printed in order: first rows, (rows, columns), column labels.
for overview in (df.head(), df.shape, df.columns):
    print(overview)

                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True
(2999, 2)
Index(['url', 'is_spam'], dtype='object')


## **3. Preprocesamiento de URLs**

In [95]:
# Register DataFrame/Series.progress_apply with tqdm.
tqdm.pandas()

# Built once here instead of inside preprocess_url: the original rebuilt the
# stop-word list and a new lemmatizer for every single URL, which is
# loop-invariant work. A frozenset also makes the membership test O(1)
# instead of a linear scan over a list.
_URL_STOP_WORDS = frozenset(stopwords.words('english')) | {
    'www', 'com', 'http', 'https', 'html', 'php', 'index',
}
_URL_LEMMATIZER = WordNetLemmatizer()


def preprocess_url(url):
    """Turn a raw URL into a space-separated string of cleaned tokens.

    Steps, in order:
      1. lower-case the URL;
      2. replace every character that is not matched by ``\\w`` with a space
         and split on whitespace;
      3. transliterate each token with ``unidecode``;
      4. drop English stop words and common URL boilerplate
         (``www``, ``com``, ``http``, ``https``, ``html``, ``php``, ``index``);
      5. lemmatize the remaining tokens with WordNet.

    Args:
        url: The URL text. Values that are not ``str`` (for example ``None``
            or a float NaN coming from a missing CSV cell) are treated as
            empty input instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        The processed tokens joined by single spaces; ``''`` when nothing is
        left or when the input is not a string.
    """
    if not isinstance(url, str):
        return ''

    # Any non-word character acts as a token separator.
    pieces = re.sub(r'[^\w]', ' ', url.lower()).split()

    # Transliterate first, so the stop-word check sees the transliterated form.
    tokens = (unidecode(piece) for piece in pieces)

    return ' '.join(
        _URL_LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _URL_STOP_WORDS
    )

# Clean every URL (progress bar via tqdm) and store the result as a new column.
cleaned_urls = df['url'].progress_apply(preprocess_url)
df['url_prepro'] = cleaned_urls

# Show the first rows to check the result.
print(df.head())

100%|██████████| 2999/2999 [00:09<00:00, 327.83it/s]

                                                 url  is_spam  \
0  https://briefingday.us8.list-manage.com/unsubs...     True   
1                             https://www.hvper.com/     True   
2                 https://briefingday.com/m/v4n3i4f3     True   
3   https://briefingday.com/n/20200618/m#commentform    False   
4                        https://briefingday.com/fan     True   

                                url_prepro  
0  briefingday us8 list manage unsubscribe  
1                                    hvper  
2                     briefingday v4n3i4f3  
3       briefingday n 20200618 commentform  
4                          briefingday fan  





## **4. Dividir Dataset**

In [98]:

# Features are the cleaned URL strings; the target is the spam flag.
X, y = df['url_prepro'], df['is_spam']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## **5. Vectorización TF-IDF**

In [99]:

# Fit the TF-IDF vocabulary on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer(min_df=0.001)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Report how many features the fitted vectorizer exposes.
feature_names = vectorizer.get_feature_names_out()
print("Número de características:", len(feature_names))

Número de características: 1186


## **6. Entrenamiento de SVM básico**

In [100]:


# Baseline: an SVC with default hyper-parameters, fitted on the TF-IDF training matrix.
clf = SVC().fit(X_train_vec, y_train)

# Predict on the held-out test matrix.
y_pred = clf.predict(X_test_vec)

# Per-class precision / recall / F1 on the test set.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)


              precision    recall  f1-score   support

       False       0.95      0.98      0.96       455
        True       0.94      0.83      0.88       145

    accuracy                           0.94       600
   macro avg       0.94      0.91      0.92       600
weighted avg       0.94      0.94      0.94       600



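## **7. Optimización de SVM**

La búsqueda de hiperparámetros mejoró ligeramente el modelo: la accuracy en test pasó de 0.94 a 0.95 (el F1 de la clase spam se mantuvo en 0.88).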

In [101]:

# `gamma` is the kernel coefficient for the 'rbf' and 'poly' kernels used here
# and has no effect on 'linear'. Crossing it with the linear kernel (as a single
# dict grid does) fits every linear candidate once per gamma value for nothing,
# so the linear kernel gets its own sub-grid: 15 candidates instead of 18.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X_train_vec, y_train)

print("Mejores parámetros:", grid.best_params_)
print("Mejor score CV:", grid.best_score_)

# Final evaluation on the held-out test set; grid.predict uses the estimator
# refitted with the best parameters found.
y_pred_opt = grid.predict(X_test_vec)
print(classification_report(y_test, y_pred_opt))


Mejores parámetros: {'C': 1, 'gamma': 'scale', 'kernel': 'poly'}
Mejor score CV: 0.9649904314544189
              precision    recall  f1-score   support

       False       0.95      0.99      0.97       455
        True       0.95      0.82      0.88       145

    accuracy                           0.95       600
   macro avg       0.95      0.90      0.92       600
weighted avg       0.95      0.95      0.95       600

