In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords #use for stop words
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import numpy as np
from torch import nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable


import torchtext
from torchtext import vocab

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Joey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv("../../../data/spam/spam_ham_dataset.csv") #Spam or Ham (Spam email) Classification dataset
data.head(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [3]:
labels = data['label_num']
data = data.drop(['label','Unnamed: 0'], axis=1)
vocab = set()
max_len = 0

In [4]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2, random_state = 0)

In [5]:
x_train.to_csv('../../../data/spam/train.csv', index=False, header=False)
x_test.to_csv('../../../data/spam/test.csv', index=False, header=False)

In [3]:

stop_words = set(stopwords.words('english'))
def remove_stop_words_tokenize(text):
    return [word for word in word_tokenize(text) if not word in stop_words and not word in ['?', ',', '.', '\n']]

#data.head(5)

In [4]:
Text = torchtext.data.Field(tokenize=remove_stop_words_tokenize, batch_first=True, include_lengths=True, fix_length=10000)
Label = torchtext.data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

In [5]:
fields = [('text', Text), ('label_num', Label)]

In [33]:
train_data, test_data = torchtext.data.TabularDataset.splits(
    path='../../../data/spam/',
    train='train.csv',
    test='test.csv',
    format='csv',
    fields=fields,
    skip_header=False
)

In [34]:
train_data, valid_data = train_data.split(split_ratio=0.8)

In [35]:
Text.build_vocab(train_data, valid_data, max_size=5000)
Label.build_vocab(train_data)
print(len(Text.vocab))
vocab_size = len(Text.vocab)

5002


In [36]:
device = 'cpu'
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits((train_data, valid_data, test_data),
        batch_size = 50,
        sort_key = lambda x: len(x.text), sort_within_batch = True,
        device = device)
print(device)

cpu


In [50]:
lr = 1e-4
batch_size = 50
dropout_keep_prob = 0.5
embedding_size = 300
max_len = 500
max_vocab_size=10000
seed = 1
classes= 2
hidden_dim2 = 128
num_layers = 2
bi_directional = False 
num_epochs = 7

In [38]:
class LSTM(nn.Module):#Simple LSTM RNN based on https://towardsdatascience.com/pytorch-basics-how-to-train-your-neural-net-intro-to-rnn-cb6ebc594677
    #for purposes of testing conformal predictions on an RNN
    def __init__(self, vocab_size, embedding_size, lstm_units, hidden_dim, classes_dim, nlayers, pad_index, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx = pad_index)
        self.rnn = nn.LSTM(embedding_size, lstm_units ,num_layers=nlayers, bidirectional=False)
        self.fc1 = nn.Linear(lstm_units, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, classes_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.nlayers= nlayers
        self.lstm_units=lstm_units
    def init_hidden(self, batch_size):
        h, c = (Variable(torch.zeros(self.nlayers, batch_size, self.lstm_units).to(device)),
                Variable(torch.zeros(self.nlayers, batch_size, self.lstm_units).to(device)))
        return h, c
    def forward(self, text, text_lengths):
        batch_size = text.shape[0]
        h_0, c_0 = self.init_hidden(batch_size)
        embedded = self.embedding(text)
        packed_embedded = pack_padded_sequence(embedded, text_lengths, batch_first=True)
        output, (h_n, c_n) = self.rnn(packed_embedded, (h_0, c_0))
        output_unpacked, output_lengths = pad_packed_sequence(output, batch_first=True)
        out = output_unpacked[:, -1, :]
        rel = self.relu(out)
        dense1 = self.fc1(rel)
        drop = self.dropout(dense1)
        preds = self.fc2(drop)
        return preds
        

In [39]:
criterion = nn.CrossEntropyLoss()
rnn = LSTM(vocab_size, embedding_size, 93, hidden_dim2, 2, num_layers, 0, 0.5)

In [40]:
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)
rnn.to(device)

LSTM(
  (embedding): Embedding(5002, 300, padding_idx=0)
  (rnn): LSTM(300, 93, num_layers=2)
  (fc1): Linear(in_features=93, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [41]:
def accuracy(probs, target):
    predictions = probs.argmax(dim=1)
    corrects = (predictions == target)
    accuracy = corrects.sum().float() / float(target.size(0))
    return accuracy

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        optimizer.zero_grad()
        text, text_lengths = batch.text
        predictions = model(text.to(device), text_lengths.to(device))
        loss = criterion(predictions, batch.label_num.squeeze())
        acc = accuracy(predictions, batch.label_num)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [42]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label_num)
            acc = accuracy(predictions, batch.label_num)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [51]:
def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion):
    best_valid_loss = float('inf')
    for epoch in range(epochs):
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'saved_weights_LTSM_.pth')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

In [57]:
run_train(5, rnn, train_iterator, valid_iterator, optimizer, criterion)

	Train Loss: 0.553 | Train Acc: 72.16%
	 Val. Loss: 0.585 |  Val. Acc: 72.04%
	Train Loss: 0.535 | Train Acc: 74.61%
	 Val. Loss: 0.576 |  Val. Acc: 72.51%
	Train Loss: 0.522 | Train Acc: 74.61%
	 Val. Loss: 0.571 |  Val. Acc: 72.51%
	Train Loss: 0.518 | Train Acc: 74.79%
	 Val. Loss: 0.568 |  Val. Acc: 72.39%
	Train Loss: 0.509 | Train Acc: 75.17%
	 Val. Loss: 0.564 |  Val. Acc: 72.63%


In [59]:
rnn.load_state_dict(torch.load('saved_weights_LTSM_.pth'))

(0.5051919469199797, 0.7514427853609199)


In [60]:
print(evaluate(rnn, test_iterator, criterion))

(0.5891605133102054, 0.7205442232745034)
