In [None]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (NLTK reports it as up to date when it is already installed)
nltk.download('stopwords')
# English stop words held in a set so the per-word membership test in preprocessing is cheap
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the data: the CSV is read from a path relative to the notebook's working directory
raw_mail_data = pd.read_csv('Datasets/spam_assassin.csv')

In [None]:
# Print the DataFrame as loaded, before any preprocessing
print(raw_mail_data)

                                                   text  target
0     From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
1     From gort44@excite.com Mon Jun 24 17:54:21 200...       1
2     From fork-admin@xent.com Mon Jul 29 11:39:57 2...       1
3     From dcm123@btamail.net.cn Mon Jun 24 17:49:23...       1
4     From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...       0
...                                                 ...     ...
5791  From ilug-admin@linux.ie Mon Jul 22 18:12:45 2...       0
5792  From fork-admin@xent.com Mon Oct 7 20:37:02 20...       0
5793  Received: from hq.pro-ns.net (localhost [127.0...       1
5794  From razor-users-admin@lists.sourceforge.net T...       0
5795  From rssfeeds@jmason.org Mon Sep 30 13:44:10 2...       0

[5796 rows x 2 columns]


In [None]:
# Preprocessing
# Every step is chained from the raw frame so none is silently discarded: previously the
# null-filled frame was thrown away because de-duplication restarted from raw_mail_data.
# The trailing .copy() gives mail_data its own data, so the column assignment below does
# not raise SettingWithCopyWarning.
mail_data = (
    raw_mail_data
    .where(pd.notnull(raw_mail_data), '')            # replace missing values with ''
    .drop_duplicates(subset=['text'], keep='first')  # keep the first copy of each e-mail
    .copy()
)

# Normalise the text: lowercase, turn every character that is not an ASCII letter, digit
# or space into a space, then drop NLTK English stop words.
mail_data['text'] = (
    mail_data['text']
    .str.lower()
    .str.replace(r"[^a-zA-Z0-9 ]", " ", regex=True)
    .apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mail_data['text'] = mail_data['text'].str.lower().str.replace(r"[^a-zA-Z0-9 ]", " ", regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mail_data['text'] = mail_data['text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))


In [None]:
# Print the first rows of the DataFrame after preprocessing
print(mail_data.head())

                                                text  target
0  ilug admin linux ie mon jul 29 11 28 02 2002 r...       0
1  gort44 excite com mon jun 24 17 54 21 2002 ret...       1
2  fork admin xent com mon jul 29 11 39 57 2002 r...       1
3  dcm123 btamail net cn mon jun 24 17 49 23 2002...       1
4  ilug admin linux ie mon aug 19 11 02 47 2002 r...       0


In [None]:
# Size of the preprocessed DataFrame as (rows, columns)
print(mail_data.shape)

(5329, 2)


In [None]:
# Split the data
# Inputs are the cleaned e-mail text; labels are the target column cast to int.
X, Y = mail_data['text'], mail_data['target'].astype(int)

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)

In [None]:
# Print the model input X (the cleaned `text` column)
print(X)

0       ilug admin linux ie mon jul 29 11 28 02 2002 r...
1       gort44 excite com mon jun 24 17 54 21 2002 ret...
2       fork admin xent com mon jul 29 11 39 57 2002 r...
3       dcm123 btamail net cn mon jun 24 17 49 23 2002...
4       ilug admin linux ie mon aug 19 11 02 47 2002 r...
                              ...                        
5791    ilug admin linux ie mon jul 22 18 12 45 2002 r...
5792    fork admin xent com mon oct 7 20 37 02 2002 re...
5793    received hq pro ns net localhost 127 0 0 1 hq ...
5794    razor users admin lists sourceforge net thu se...
5795    rssfeeds jmason org mon sep 30 13 44 10 2002 r...
Name: text, Length: 5329, dtype: object


In [None]:
# Print the model output Y (the `target` column cast to int)
print(Y)

0       0
1       1
2       1
3       1
4       0
       ..
5791    0
5792    0
5793    1
5794    0
5795    0
Name: target, Length: 5329, dtype: int32


In [None]:
# Vectorization
# TF-IDF features using scikit-learn's built-in English stop-word list and lowercasing.
vectorizer = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vocabulary and IDF weights on the training text only, then reuse them
# unchanged to transform the test text.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [None]:
# Print the vectorized training inputs
print(X_train_features)

  (0, 49291)	0.363141019798764
  (0, 26428)	0.0511376260924426
  (0, 100880)	0.3671656537764738
  (0, 37387)	0.1796382586187546
  (0, 72989)	0.047284018643231965
  (0, 66065)	0.17660732046727148
  (0, 12219)	0.07622092399542335
  (0, 5465)	0.04160083124032383
  (0, 7991)	0.07812498651967127
  (0, 8870)	0.07272389769376796
  (0, 85728)	0.008180718642877137
  (0, 78932)	0.008198290371090934
  (0, 40998)	0.01840081398715538
  (0, 102659)	0.010031532293468925
  (0, 69651)	0.06321002421703298
  (0, 74927)	0.026667375241976907
  (0, 84710)	0.05647212026128627
  (0, 5785)	0.02734794127389186
  (0, 80246)	0.02857283506814863
  (0, 68219)	0.013300050543040495
  (0, 81299)	0.027571282428473152
  (0, 46523)	0.02468740590407205
  (0, 61588)	0.047390452753155864
  (0, 24665)	0.06736551361681702
  (0, 65502)	0.02906873266287623
  :	:
  (3729, 28238)	0.08132409774903923
  (3729, 40972)	0.06118768477978502
  (3729, 36071)	0.07902970610214717
  (3729, 59369)	0.0780169424684116
  (3729, 66098)	0.0813240

In [None]:
# Model built directly from Bayes' theorem
# Prior probabilities: the mean of the training labels is used as the spam prior
# (assumes the labels are 0/1, as the == 1 / == 0 masks below also do).
P_spam = Y_train.mean()
P_no_spam = 1 - P_spam

# Conditional probabilities (kept as sparse matrices): training rows of each class
X_train_spam = X_train_features[Y_train == 1]
X_train_no_spam = X_train_features[Y_train == 0]

# Per-feature likelihood with add-one smoothing:
#   (summed feature weight in the class + 1) / (total weight in the class + number of features)
n_caracteristicas = X_train_features.shape[1]
peso_total_spam = X_train_spam.sum()
peso_total_no_spam = X_train_no_spam.sum()
P_caracteristicas_spam = (X_train_spam.sum(axis=0) + 1) / (peso_total_spam + n_caracteristicas)
P_caracteristicas_no_spam = (X_train_no_spam.sum(axis=0) + 1) / (peso_total_no_spam + n_caracteristicas)

# Log-score of each class given the features of every test e-mail:
#   log prior + (test features @ log-likelihoods laid out as a column)
log_verosimilitud_spam = np.log(P_caracteristicas_spam.T)
log_verosimilitud_no_spam = np.log(P_caracteristicas_no_spam.T)
log_P_spam_caracteristicas = np.log(P_spam) + X_test_features @ log_verosimilitud_spam
log_P_no_spam_caracteristicas = np.log(P_no_spam) + X_test_features @ log_verosimilitud_no_spam

In [None]:
# Classification
# Label an e-mail as spam (1) when its spam log-score exceeds its non-spam log-score.
# np.asarray(...) comes first because the scores may be np.matrix objects (sums over a
# scipy sparse matrix), and .ravel() on an np.matrix yields a 1xN matrix instead of a
# flat vector.
clasificaciones = np.asarray(
    log_P_spam_caracteristicas > log_P_no_spam_caracteristicas
).astype(int).ravel()

# np.asarray accepts both a pandas Series and an ndarray, so re-running this cell is
# safe. The previous Y_test.to_numpy() call failed on a second run because Y_test had
# already been rebound to an ndarray.
Y_test = np.asarray(Y_test).ravel()

In [None]:
# Evaluation of the Bayes classifier on the test split
# NOTE(review): `precision` is the fraction of test e-mails classified correctly
# (overall accuracy), not TP / (TP + FP).
aciertos = clasificaciones == Y_test
precision = np.mean(aciertos)

# Recall: correctly detected spam divided by all actual spam in the test labels.
verdaderos_positivos = np.sum((clasificaciones == 1) & (Y_test == 1))
recuperacion = verdaderos_positivos / Y_test.sum()

In [None]:
# Train the model: a logistic regression with scikit-learn's default settings
model = LogisticRegression()

# Fit the logistic regression on the vectorized training inputs and their labels
model.fit(X_train_features, Y_train)

In [None]:
# Build a prediction system
# NOTE: input() blocks until the user types an e-mail, so this cell needs interaction
# when the notebook is run end to end.
mail = input('Email: ')

# Clean the e-mail exactly as the training text was cleaned (lowercase, non-alphanumeric
# characters replaced by spaces, NLTK stop words removed). Without this, tokens containing
# '_' or non-ASCII letters are tokenised differently from training and miss the fitted
# vocabulary.
input_mail = (
    pd.Series([mail])
    .str.lower()
    .str.replace(r"[^a-zA-Z0-9 ]", " ", regex=True)
    .apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
    .tolist()
)

# Convert it into a feature vector with the vectorizer fitted on the training data
input_mail_features = vectorizer.transform(input_mail)

# Make the prediction with the logistic regression model
prediction = model.predict(input_mail_features)

In [None]:
# Print results
# `prediction` is the logistic regression output for the typed e-mail: 0 means not spam,
# anything else is reported as spam.
print(prediction)

etiqueta = 'Correo No Spam' if prediction[0] == 0 else 'Correo Spam'
print(etiqueta)

# These two metrics were computed from the hand-written Bayes classifier on the test
# split, not from the logistic regression that produced `prediction`.
print('Precisión:', precision)
print('Recuperación:', recuperacion)

[1]
Correo Spam
Precisión: 0.9024390243902439
Recuperación: 0.6842105263157895
