In [14]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Flatten, Input, LSTM, Embedding, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Fetch the spam dataset into the working directory as spam.csv.
# The -nc (no-clobber) flag makes wget skip the download when the file already exists.
!wget -nc https://lazyprogrammer.me/course_files/spam.csv

--2024-09-15 12:35:48--  https://lazyprogrammer.me/course_files/spam.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [text/csv]
Saving to: ‘spam.csv’


2024-09-15 12:35:49 (3.68 MB/s) - ‘spam.csv’ saved [503663/503663]



In [4]:
# Read the downloaded CSV with an explicit ISO-8859-1 encoding,
# then preview the first rows as the cell output.
df = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Remove the three extra "Unnamed" columns (they are empty in the preview rows above).
#
# The frame is reassigned rather than mutated in place, and errors="ignore" is passed,
# so this cell can be re-run safely: the original in-place drop raised a KeyError on a
# second execution because the columns had already been removed.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Give the two remaining columns descriptive names: v1 holds the class label,
# v2 holds the message text.
column_names = {"v1": "label", "v2": "data"}
df.rename(columns=column_names, inplace=True)

In [12]:
# Encode the string class labels as integers: ham -> 0, spam -> 1.
#
# Series.map replaces every value that is not a key of the dict with NaN, so running
# the mapping again on the already-encoded 0/1 column would silently turn every label
# into NaN. The dtype guard makes the cell idempotent: the mapping is applied only
# while the column still holds the raw (non-numeric) labels.
label_codes = {"ham": 0, "spam": 1}
if not pd.api.types.is_numeric_dtype(df["label"]):
    df["label"] = df["label"].map(label_codes)
df.head()

Unnamed: 0,label,data
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Inputs are the raw message texts (a pandas Series); targets are the encoded
# labels pulled out as a NumPy array.
X = df["data"]
y = df["label"].values

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=41,
)

In [19]:
# Upper bound on the vocabulary size handed to the tokenizer.
size = 20000
token = Tokenizer(num_words=size)

# Learn the vocabulary from the training texts only, so nothing from the
# test split influences the word index.
token.fit_on_texts(x_train)

# Replace both splits with their integer-sequence encodings. The tuple of
# original texts is built before either name is rebound.
x_train, x_test = (
    token.texts_to_sequences(texts) for texts in (x_train, x_test)
)

In [26]:
# Word -> integer index lookup built by the tokenizer during fitting.
word2idx = token.word_index

# Number of entries in that lookup; shown as the cell output.
V = len(token.word_index)
V

7283

In [30]:
# Pad the training sequences into one rectangular array. No maxlen is given,
# so the width is determined by pad_sequences from the data itself.
x_train = pad_sequences(x_train)

# T is the padded sequence length (second axis); it is reused for the test
# split and for the model's input shape.
n_train_rows, T = x_train.shape[0], x_train.shape[1]
x_train.shape

(3733, 162)

In [32]:
# Pad the test sequences to the same length T as the training data so both
# arrays have the same number of columns.
x_test = pad_sequences(
    x_test,
    maxlen=T,
)
x_test.shape

(1839, 162)

In [33]:
# Hyperparameters chosen by hand:
#   D - size of each word-embedding vector
#   M - size of the LSTM hidden state
D = 20
M = 15

# Functional-API graph: token ids -> embeddings -> LSTM -> max over time -> sigmoid.
i = Input(shape=(T,))
# V + 1 embedding rows: presumably index 0 is the padding value and word
# indices start at 1 (Keras Tokenizer / pad_sequences defaults) - confirm.
x = Embedding(input_dim=V + 1, output_dim=D)(i)
# return_sequences=True keeps the hidden state of every time step so the
# pooling layer below can reduce over the time axis.
x = LSTM(units=M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
# Single sigmoid unit: the output is a probability for the positive (spam) class.
x = Dense(units=1, activation='sigmoid')(x)

model = Model(inputs=i, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.01),
    metrics=['accuracy'],
)

In [34]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
# The returned History object is kept in r.
r = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10,
)

Epoch 1/10
117/117 ━━━━━━━━━━━━━━━━━━━━ 13s 91ms/step - accuracy: 0.8805 - loss: 0.2706 - val_accuracy: 0.9869 - val_loss: 0.0513
Epoch 2/10
117/117 ━━━━━━━━━━━━━━━━━━━━ 8s 71ms/step - accuracy: 0.9959 - loss: 0.0175 - val_accuracy: 0.9859 - val_loss: 0.0499
Epoch 3/10
117/117 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - accuracy: 0.9958 - loss: 0.0109 - val_accuracy: 0.9859 - val_loss: 0.0700
Epoch 4/10
117/117 ━━━━━━━━━━━━━━━━━━━━ 10s 82ms/step - accuracy: 0.9993 - loss: 0.0020 - val_accuracy: 0.9782 - val_loss: 0.0994
Epoch 5/10
117/117 ━━━━━━━━━━━━━━━━━━━━ 10s 83ms/step - accuracy: 0.9986 - loss: 0.0045 - val_accuracy: 0.9853 - val_loss: 0.0591
Epoch 6/10
117/117 ━━━━━━━━━━━━━━━━━━━━ 9s 70ms/step - accuracy: 1.0000 - loss: 8.3433e-04 - val_accuracy: 0.9859 - val_loss: 0.0732
Epoch 7/10
[1