In [30]:
import torch
import torch.nn as nn
import random
import tqdm
import torch.optim as optim
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
from collections import defaultdict
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import random_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

In [31]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Find Hyperlinks in string

In [32]:
def findUrl(string):
  
    # findall() has been used 
    # with valid conditions for urls in string
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = re.search(regex, string)
    return found

In [33]:
train_data = pd.read_csv('train.csv')
Y = list((train_data['Type'] == 'Quality').astype(int))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Data Loader / Tokenizer

In [5]:
train_data = pd.read_csv('train.csv')
tokenizer = TweetTokenizer()
hashtag = True
wordcount = defaultdict(int)
# vocab_size = 500

lines = []
maxlen = 0

for data in train_data['Tweet']:

    line = []

    tokens = tokenizer.tokenize(data.lower())

    for token in tokens:
        url = findUrl(token)
        if url:
            line.append('<URL>')
            wordcount['<URL>'] += 1
        elif token[0] == '#':
            if hashtag:
                line.append(token[1:])
                wordcount[token[1:]] += 1
            else:
                line.append('<HASH>')
                wordcount['<HASH>'] += 1
        else:
            more_words = word_tokenize(token)
            for w in more_words:
                line.append(w)
                wordcount[w] += 1

    # line.append('<END>')
    maxlen = max(maxlen, len(line))
    lines.append(line)

# wordcount['<START>'] = len(train_data['Tweet'])
# wordcount['<END>'] = len(train_data['Tweet'])

sorted_wordcounts = sorted(wordcount.items(), key = lambda item: item[1], reverse=True)

word2ind = {}
ind2word = {}

ind = 1
# for k, v in sorted_wordcounts[:vocab_size - 1]:
#     word2ind[k] = ind
#     ind2word[ind] = k
#     ind += 1

# for k, v in sorted_wordcounts[vocab_size - 1:]:
#     word2ind[k] = vocab_size
#     ind2word[vocab_size - 1] = '<UKN>'

for k, v in sorted_wordcounts:
    word2ind[k] = ind
    ind2word[ind] = k
    ind += 1

X = []

for line in lines:
    ind_line = []
    for word in line:
        ind_line.append(word2ind[word])
    
    if len(ind_line) < maxlen:
        ind_line += [0] * (maxlen - len(ind_line))
    
    X.append(ind_line)

len(word2ind)

24249

# LSTM model

In [6]:
class biLSTM(nn.Module):

    def __init__ (self, h_dim = 10, e_dim = 10, lstm_layers = 1, dropout_rate = 0.2):
        super(biLSTM, self).__init__()
        
        self.h_dim = h_dim
        self.e_dim = e_dim
        self.embedding = nn.Embedding(num_embeddings= len(word2ind) + 1, embedding_dim = self.e_dim, padding_idx = 0)
        self.pool = torch.nn.AdaptiveAvgPool1d(output_size=1)

        self.lstm = nn.LSTM(input_size = self.e_dim, 
                            hidden_size = self.h_dim, 
                            num_layers = lstm_layers,
                            batch_first = True,
                            bidirectional = True)
        
        self.drop = nn.Dropout(p = dropout_rate)
        self.linear = nn.Linear(2 * self.h_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, X):

        X = self.embedding(X)
        X = self.drop(X)
        X, _ = self.lstm(X)
        X = X.permute(0,2,1)
        X = self.pool(X)
        X = torch.squeeze(X)
        X = self.linear(X)       
        X = self.sigmoid(X)

        return X
    



# Train Model

In [28]:
def shuffle_data(X, Y):
    indices = list(range(len(X)))
    random.shuffle(indices)
    shuffled_X = X[indices]
    shuffled_Y = Y[indices]
    return (shuffled_X, shuffled_Y)
    
def train(X, Y, lstm, print_results = False, n_epochs = 50):
    optimizer = optim.Adam(lstm.parameters(), lr = 0.001)
    # optimizer = optim.Adagrad(lstm.parameters(), lr = 0.001)
    # optimizer = optim.SGD(lstm.parameters(), lr = 0.001)
    loss_f = nn.BCELoss()

    train_size = len(X) // 10 * 8
    test_size = len(X) - train_size
    train_ind, test_ind = random_split(range(len(X)), [train_size, test_size], generator=torch.Generator().manual_seed(42))

    batchSize = 10


    train_X = torch.tensor(X, device = device)[train_ind]
    train_Y = torch.tensor(Y, device = device, dtype = float)[train_ind]
    test_X = torch.tensor(X, device = device)[test_ind]
    test_Y = torch.tensor(Y, device = device, dtype = float)[test_ind]


    max_f1 = 0
    max_acc = 0
    max_epoch = 0

    for epoch in range(n_epochs):
        lstm.train()
        totalLoss = 0.0

        train_X_shuffle, train_Y_shuffle = shuffle_data(train_X, train_Y)

        for batch in tqdm.notebook.tqdm(range(0, len(train_X_shuffle), batchSize), leave=False):
          lstm.zero_grad()
          x = train_X_shuffle[batch: batch + batchSize]
          y = train_Y_shuffle[batch: batch + batchSize]
          output = lstm(x)
          loss = loss_f(output.squeeze().to(dtype = float), y)
          totalLoss += loss.item()
          loss.backward()
          optimizer.step()

        
        lstm.eval()
        output = lstm(test_X).squeeze()
        output = (output > 0.5).int()
        y_pred = output.tolist()

        results = classification_report(test_Y.cpu(), y_pred, labels = [1, 0], digits = 4, output_dict=True)
        if print_results:
          print("============================================")
          print(f"epoch {epoch + 1}")
          print(f"loss: {totalLoss:.4f}")
          print(f"accuracy: {results['accuracy']:.4f}")
          print(f"f1-score: {results['macro avg']['f1-score']:.4f}")
        if results['macro avg']['f1-score'] > max_f1:
          max_f1 = results['macro avg']['f1-score']
          max_acc = results['accuracy']
          max_epoch = epoch + 1
    print("============================================")
    print(f"Best result at epoch {max_epoch} f1-score: {max_f1:.4f} accuracy: {max_acc:.4f}")
        
        


In [11]:
lstm = biLSTM(e_dim = 300, h_dim = 50, lstm_layers = 1).to(device)
train(X, Y, lstm, print_results=True, n_epochs=10)

  0%|          | 0/957 [00:00<?, ?it/s]

epoch 1
loss: 662.9958
accuracy: 0.4988
f1-score: 0.3342


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 2
loss: 662.9011
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 3
loss: 662.8351
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 4
loss: 662.7293
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 5
loss: 662.6608
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 6
loss: 662.6278
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 7
loss: 662.6121
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 8
loss: 662.6100
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 9
loss: 662.4634
accuracy: 0.5000
f1-score: 0.3348


  0%|          | 0/957 [00:00<?, ?it/s]

epoch 10
loss: 662.4911
accuracy: 0.5000
f1-score: 0.3348
Best result at epoch 2 f1-score: 0.3348 accuracy: 0.5000


# Parameter Tuning

In [94]:
# Embedding Dimension

e_dimensions = [10, 20, 50, 100, 200, 300, 400, 500]
h_dimensions = [10, 20, 50, 100, 200, 300, 400, 500]
lstm_layers = [1, 2, 3, 4, 5]
dropout_rates = [0.1, 0.2, 0.3, 0.4, 0.5]

for e_dim in e_dimensions:
  for h_dim in h_dimensions:
    for lstm_l in lstm_layers:
      for d_rate in dropout_rates:
        lstm = biLSTM(e_dim = e_dim, h_dim = h_dim, lstm_layers = lstm_l, dropout_rate=d_rate).to(device)
        print(f"e_dim: {e_dim}, h_dim: {h_dim}, # layers: {lstm_l}, dropout: {d_rate}")
        train(X, Y, lstm)

e_dim: 10, h_dim: 10, # layers: 1, dropout: 0.1


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 16 f1-score: 0.8220 accuracy: 0.8221
e_dim: 10, h_dim: 10, # layers: 1, dropout: 0.2


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 19 f1-score: 0.8037 accuracy: 0.8042
e_dim: 10, h_dim: 10, # layers: 1, dropout: 0.3


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 19 f1-score: 0.8057 accuracy: 0.8058
e_dim: 10, h_dim: 10, # layers: 1, dropout: 0.4


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 17 f1-score: 0.8058 accuracy: 0.8058
e_dim: 10, h_dim: 10, # layers: 1, dropout: 0.5


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 20 f1-score: 0.7937 accuracy: 0.7950
e_dim: 10, h_dim: 10, # layers: 2, dropout: 0.1


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 18 f1-score: 0.8006 accuracy: 0.8008
e_dim: 10, h_dim: 10, # layers: 2, dropout: 0.2


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 18 f1-score: 0.8079 accuracy: 0.8079
e_dim: 10, h_dim: 10, # layers: 2, dropout: 0.3


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 20 f1-score: 0.8060 accuracy: 0.8063
e_dim: 10, h_dim: 10, # layers: 2, dropout: 0.4


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 19 f1-score: 0.8037 accuracy: 0.8037
e_dim: 10, h_dim: 10, # layers: 2, dropout: 0.5


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 19 f1-score: 0.7958 accuracy: 0.7958
e_dim: 10, h_dim: 10, # layers: 3, dropout: 0.1


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 17 f1-score: 0.8121 accuracy: 0.8121
e_dim: 10, h_dim: 10, # layers: 3, dropout: 0.2


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

Best result at epoch 20 f1-score: 0.8174 accuracy: 0.8175
e_dim: 10, h_dim: 10, # layers: 3, dropout: 0.3


  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

  0%|          | 0/957 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [55]:
def evaluate(model, test_X, test_Y):
  y_pred = []
  input = torch.tensor(test_X, dtype=int, device = device)

  model.eval()

  output = model(input).squeeze()
  output = (output > 0.5).int()
  y_pred = output.tolist()
  
  print(classification_report(test_Y, y_pred, labels=[1, 0], digits = 4))

evaluate(lstm, X, Y[:1000])

  

              precision    recall  f1-score   support

           1     0.9310    0.9487    0.9397       526
           0     0.9418    0.9219    0.9318       474

    accuracy                         0.9360      1000
   macro avg     0.9364    0.9353    0.9358      1000
weighted avg     0.9361    0.9360    0.9360      1000



In [20]:
# import libraries for reading data, exploring and plotting
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
# library for train test split
from sklearn.model_selection import train_test_split
# deep learning libraries for text pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Modeling 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

In [None]:
# Biderectional LSTM Spam detection architecture
embeding_dim = 300
max_len = 81
n_lstm = 50
drop_lstm = 0.2
model2 = Sequential()
model2.add(Embedding(vocab_size + 1, embeding_dim, input_length=max_len))
model2.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True)))
model2.add(GlobalAveragePooling1D())
model2.add(Dense(1, activation='sigmoid'))

In [None]:
model2.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [None]:
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 81, 300)           150300    
                                                                 
 bidirectional_3 (Bidirectio  (None, 81, 100)          140400    
 nal)                                                            
                                                                 
 global_average_pooling1d_3   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 290,801
Trainable params: 290,801
Non-trainable params: 0
_________________________________________________________________


In [None]:
training_padded = X[:len(X)//2]
train_labels = Y[:len(X)//2]
testing_padded = X[len(X)//2:]
test_labels = Y[len(X)//2:] 

In [None]:
# Training
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model2.fit(training_padded, train_labels, epochs=num_epochs, 
                    validation_data=(testing_padded, test_labels),callbacks =[early_stop], verbose=2)

Epoch 1/30
187/187 - 10s - loss: 0.5995 - accuracy: 0.6671 - val_loss: 0.5003 - val_accuracy: 0.7619 - 10s/epoch - 53ms/step
Epoch 2/30
187/187 - 3s - loss: 0.5100 - accuracy: 0.7565 - val_loss: 0.4875 - val_accuracy: 0.7904 - 3s/epoch - 15ms/step
Epoch 3/30
187/187 - 3s - loss: 0.4580 - accuracy: 0.7886 - val_loss: 0.4701 - val_accuracy: 0.7670 - 3s/epoch - 15ms/step
Epoch 4/30
187/187 - 3s - loss: 0.4253 - accuracy: 0.8041 - val_loss: 0.4571 - val_accuracy: 0.7868 - 3s/epoch - 15ms/step
Epoch 5/30
187/187 - 3s - loss: 0.4096 - accuracy: 0.8199 - val_loss: 0.4572 - val_accuracy: 0.7955 - 3s/epoch - 15ms/step
Epoch 6/30
187/187 - 3s - loss: 0.4340 - accuracy: 0.8060 - val_loss: 0.4543 - val_accuracy: 0.7909 - 3s/epoch - 15ms/step
Epoch 7/30
187/187 - 3s - loss: 0.3895 - accuracy: 0.8250 - val_loss: 0.4212 - val_accuracy: 0.8065 - 3s/epoch - 15ms/step
Epoch 8/30
187/187 - 3s - loss: 0.3641 - accuracy: 0.8324 - val_loss: 0.4328 - val_accuracy: 0.7976 - 3s/epoch - 15ms/step
Epoch 9/30
187