In [1]:
import joblib
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

### Importar y crear el DF

In [2]:
# Read the raw dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer="spam.csv")

In [3]:
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Convertir a numérico la columna categórica

In [4]:
# Encode the target column as integers: "ham" -> 0, "spam" -> 1.
label_by_category = {"ham": 0, "spam": 1}
df['label'] = df['Category'].map(label_by_category)

# Series.map yields NaN for any value missing from the dict. Fail here with a
# message that names the offending categories instead of letting NaN labels
# reach the training step.
unmapped_categories = df.loc[df['label'].isna(), 'Category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'ham' or 'spam'): "
        f"{list(unmapped_categories)}"
    )

df.head()

Unnamed: 0,Category,Message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Vectorizar contenido del email

In [8]:
# TF-IDF encoder for the message text.
#   max_features=5000    -> keep only the 5000 terms with the highest term
#                           frequency across the corpus.
#   stop_words='english' -> drop scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')


#### Convertir elementos de la columna `Message` en vectores

In [10]:
# Learn the vocabulary and IDF weights from the "Message" column and encode
# every message in the same pass; the result is a sparse matrix with one row
# per message.
x = vectorizer.fit_transform(raw_documents=df["Message"])
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 40089 stored elements and shape (5572, 5000)>
  Coords	Values
  (0, 2604)	0.37481893517411735
  (0, 3693)	0.29316647811295243
  (0, 1080)	0.2902714639318764
  (0, 583)	0.2803523396804297
  (0, 792)	0.3166485510736851
  (0, 1911)	0.20705018630693411
  (0, 4915)	0.2535358656721699
  (0, 2730)	0.3166485510736851
  (0, 791)	0.3578051884984679
  (0, 938)	0.3166485510736851
  (0, 1871)	0.17571654236455628
  (0, 4813)	0.2094259752473381
  (1, 3523)	0.2718944069420321
  (1, 2766)	0.4083258549263009
  (1, 2572)	0.5236804332035243
  (1, 4863)	0.43162957585464123
  (1, 3532)	0.5466243141314314
  (2, 1636)	0.11676028650249681
  (2, 1365)	0.36440225960212075
  (2, 4891)	0.19287984407221892
  (2, 1000)	0.19686982823560253
  (2, 4869)	0.14953315491852773
  (2, 1430)	0.47550942852592687
  (2, 1102)	0.20418515380343544
  (2, 1496)	0.186288775446193
  :	:
  (5567, 4619)	0.2780731665972518
  (5567, 4543)	0.18311278812459378
  (5567, 1034)	0.23

In [11]:
# Target vector: the integer-encoded "label" column (0 = ham, 1 = spam).
y = df.loc[:, "label"]

## Crear modelo de regresión logística (clasificación binaria)

In [13]:
# Logistic-regression classifier for the binary ham/spam target.
# max_iter is raised well above scikit-learn's default (100) to give the
# solver more iterations to converge.
model = LogisticRegression(max_iter=5000)

#### Entrenar el modelo


In [14]:
# Train on the TF-IDF matrix and the 0/1 labels. Left as the cell's last
# expression so the fitted estimator is displayed.
model.fit(X=x, y=y)

#### Predicción

In [24]:
# Sample message used to exercise the trained classifier.
email = """
Hello Juan,

Thank you for registering for the "Introduction to Data Science" course.
We confirm that your registration has been successfully registered.

📅 Course starts: Monday, June 10
🕒 Schedule: 6:00 PM to 8:00 PM (Argentina time)
📍 Mode: Online (Zoom)

You will receive an email with the access link and introductory materials in the next few days.
"""

# Encode with the already-fitted vectorizer (transform, not fit_transform) so
# the sample is mapped onto the same vocabulary the model was trained on.
email_vectorizado = vectorizer.transform(raw_documents=[email])
print(email_vectorizado)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24 stored elements and shape (1, 5000)>
  Coords	Values
  (0, 0)	0.3559161282881694
  (0, 115)	0.14617228536138413
  (0, 403)	0.1857988262512891
  (0, 1018)	0.17384496635659757
  (0, 1066)	0.34064325960067865
  (0, 1141)	0.14439165273732982
  (0, 1346)	0.16450141796892429
  (0, 2090)	0.143272307522838
  (0, 2591)	0.20994817069024427
  (0, 2601)	0.19737095207784064
  (0, 2890)	0.1779580641440847
  (0, 3343)	0.1803047298390218
  (0, 3355)	0.1758157262126066
  (0, 3534)	0.15979737931461058
  (0, 3683)	0.32647701520040595
  (0, 3872)	0.14946000907476364
  (0, 3896)	0.18289884841274298
  (0, 4039)	0.19737095207784064
  (0, 4042)	0.20286504849010792
  (0, 4313)	0.1803047298390218
  (0, 4371)	0.20286504849010792
  (0, 4499)	0.1532554075615205
  (0, 4543)	0.10522749828530496
  (0, 4999)	0.21993127072892674


In [29]:
# Hard class prediction for the single encoded email (label 1 means spam).
result = model.predict(email_vectorizado)
print("Es spam" if result[0] == 1 else "No es spam")

# Per-class probabilities for the same email; with 0/1 labels the columns are
# ordered [P(label 0), P(label 1)].
resultado = model.predict_proba(email_vectorizado)
print("Probabilidad:", resultado)


No es spam
Probabilidad: [[0.84457669 0.15542331]]


In [None]:
# The classifier only accepts vectors produced by this exact fitted
# TfidfVectorizer (same vocabulary and IDF weights). Persist the vectorizer as
# well, otherwise the saved model cannot be used on raw text after reloading.
joblib.dump(vectorizer, "vectorizador.pkl")

# Persist the trained classifier. The file name is unchanged so existing
# loaders keep working; kept as the last expression so the cell output is the
# same as before.
joblib.dump(model, "clasificador.pkl")