# Clasificación de emails spam vs. no spam

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


# Cargar dataset

In [45]:
data = pd.read_csv("spam.csv", encoding="latin-1")[['v1','v2']]
data.columns = ['label', 'message']

In [47]:
data['label'] = data['label'].map({'ham':0, 'spam':1})


In [49]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['message'])
y = data['label']

In [55]:
print("Datos del data set")
display(data.head())
print("\nDescripcion del data set")
display(data.describe())
print("\nTipos de datos:")
print(data.dtypes)

Datos del data set


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."



Descripcion del data set


Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0



Tipos de datos:
label       int64
message    object
dtype: object


# Dividir datos

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Árbol de Decisión

In [59]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# K-NN

In [61]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Evaluación

In [63]:
print("Árbol de Decisión:")
print(classification_report(y_test, y_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))

print("\nK-NN:")
print(classification_report(y_test, y_pred_knn))
print("Accuracy:", accuracy_score(y_test, y_pred_knn))

Árbol de Decisión:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1453
           1       0.92      0.84      0.88       219

    accuracy                           0.97      1672
   macro avg       0.95      0.92      0.93      1672
weighted avg       0.97      0.97      0.97      1672

Accuracy: 0.9700956937799043

K-NN:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1453
           1       1.00      0.28      0.44       219

    accuracy                           0.91      1672
   macro avg       0.95      0.64      0.70      1672
weighted avg       0.92      0.91      0.88      1672

Accuracy: 0.9061004784688995
