In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

In [3]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['Label', 'SMS'])

In [4]:
print(data.head())
print(data.info())

  Label                                                SMS
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['SMS'])
X = tokenizer.texts_to_sequences(data['SMS'])  # токенизируем текст в виде массивов чисел, где каждое число ключ слова
X = pad_sequences(X, maxlen=50)  # представляем текст в виде двумерного массива, где каждая стока имеет длину 50
# только 70 SMS теряют информацию

In [6]:
le = LabelEncoder()
y = le.fit_transform(data['Label'])  # переводит label в 0-1 вид, где 0 - не спам, 1 - спам 
y = np.array(y)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = Sequential([
                    Embedding(len(tokenizer.word_index) + 1, 100),
                    SpatialDropout1D(0.2),
                    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
                    Dense(1, activation='sigmoid')
                    ])

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics = ['accuracy'])
model.summary()

In [9]:
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 86ms/step - accuracy: 0.9111 - loss: 0.2194 - val_accuracy: 0.9798 - val_loss: 0.0869
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 80ms/step - accuracy: 0.9962 - loss: 0.0157 - val_accuracy: 0.9686 - val_loss: 0.1321
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 84ms/step - accuracy: 0.9979 - loss: 0.0048 - val_accuracy: 0.9798 - val_loss: 0.1385
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 103ms/step - accuracy: 1.0000 - loss: 3.2715e-04 - val_accuracy: 0.9776 - val_loss: 0.1604
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 83ms/step - accuracy: 1.0000 - loss: 1.3688e-04 - val_accuracy: 0.9798 - val_loss: 0.1658


<keras.src.callbacks.history.History at 0x23e0d23fd90>

In [10]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1-score:", f1)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
F1-score: 0.9918457381746048


In [11]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from tqdm import tqdm

In [70]:
class LSTM_model(nn.Module):
    def __init__(self, parameters={}):
        super().__init__()

        # ----------------------------------------- model -------------------------------------------
        self.emb = nn.Embedding(len(tokenizer.word_index) + 1, 30)
        self.lin1 = nn.Linear(30, 1)
        self.drop1d = nn.Dropout1d(0.2)
        self.lstm = nn.LSTM(input_size=50, hidden_size=20, num_layers=2, dropout=0.1)
        self.lin2 = nn.Linear(20, 1)
        # --------------------------------------------------------------------------------------------

    def forward(self, x):
        x = self.emb(x)
        x = self.lin1(x).view(x.shape[0], x.shape[1])
        x = self.drop1d(x)
        x = self.lstm(x)[0]
        x = F.sigmoid(self.lin2(x)).view(x.shape[0])

        return x

In [66]:
X_train_t, X_test_t, y_train_t, y_test_t = torch.Tensor(np.array(X_train)), \
        torch.Tensor(np.array(X_test)), \
        torch.Tensor(np.array(y_train)), \
        torch.Tensor(np.array(y_test))
X_train_t = X_train_t.type(torch.cuda.LongTensor)
X_test_t = X_test_t.type(torch.cuda.LongTensor)

train_dataset, test_dataset = TensorDataset(X_train_t, y_train_t), TensorDataset(X_test_t, y_test_t)
batch_size = 32
train_loader, test_loader = DataLoader(train_dataset, batch_size), DataLoader(test_dataset, 1)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [71]:
model = LSTM_model()
model.to(device)

LSTM_model(
  (emb): Embedding(9010, 30)
  (lin1): Linear(in_features=30, out_features=1, bias=True)
  (drop1d): Dropout1d(p=0.2, inplace=False)
  (lstm): LSTM(50, 20, num_layers=2, dropout=0.1)
  (lin2): Linear(in_features=20, out_features=1, bias=True)
)

In [72]:
len_train_data = len(train_loader)
epochs = 5
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.005)

with tqdm(total=epochs * len_train_data, position=0, leave=True) as pbar:

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        num_batch = 1

        for batch in train_loader:
            x_data = batch[0].to(device)
            y_data = batch[1].to(device)

            y_pred = model(x_data)
            loss = criterion(y_pred, y_data)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            pbar.set_description(f"Epoch: {epoch + 1}/{epochs}, Batch: {num_batch}/{len_train_data}, Loss: {running_loss}")
            pbar.update()

            num_batch += 1

Epoch: 5/5, Batch: 140/140, Loss: 13.92319796886295: 100%|███████████████████████████| 700/700 [00:09<00:00, 76.39it/s]


In [73]:
len_test_data = len(test_loader)
acc = 0

with tqdm(total=len_test_data, position=0, leave=True) as pbar:

    model.eval()
    running_loss = 0
    num_test = 1

    for test in test_loader:
        x_data = test[0].to(device)
        y_data = test[1].to(device)

        y_pred = model(x_data)

        acc += (int(y_pred > .5) == int(y_data))

        pbar.set_description(f"Test: {num_test}/{len_test_data}, Acc: {round(acc / num_test, 3)}")
        pbar.update()

        num_test += 1

Test: 1115/1115, Acc: 0.993: 100%|████████████████████████████████████████████████| 1115/1115 [00:06<00:00, 168.48it/s]
