# biLSTM with TF-IDF vectorization

Import libraries

In [None]:
import torch
import torch.nn as nn
import random
import tqdm
import torch.optim as optim
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
from collections import defaultdict
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import random_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

# Detecting Hyperlinks in the Tweet

In [None]:
def findUrl(string):
    """Look for the first URL-like substring in ``string``.

    Returns the ``re.Match`` object of the first hit (the matched text is
    available through ``.group(0)``), or ``None`` when nothing matches.
    """
    # Case-insensitive pattern. A URL may start with an http(s) scheme,
    # a "www." prefix, or a bare "domain.tld/" prefix; parenthesised
    # segments are allowed inside it and trailing punctuation is excluded.
    url_pattern = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    return url_pattern.search(string)

# Dataloader / Preprocessing

Tweets whose `Type` is 'Spam' are labelled 1; all other tweets ('Quality') are labelled 0.

Removes all '@' tags (words that start with '@') from the tweets before vectorization.

In [None]:
# Load the labelled tweets.
train_data = pd.read_csv('tweet_train.csv')

# Binary labels: 1 for tweets whose 'Type' is 'Spam', 0 for everything else.
Y = (train_data['Type'] == 'Spam').astype(int).tolist()

# Remove '@' tags from the dataset: each tweet is split on single spaces,
# every word that starts with '@' is dropped, and the rest is re-joined.
# Empty strings produced by consecutive spaces are kept, so the remaining
# spacing of the tweet is preserved.
no_attag = [
    " ".join(word for word in tweet.split(' ') if not word.startswith('@'))
    for tweet in train_data['Tweet']
]

# TF-IDF vectorization

In [None]:
# Vectorization parameters for the TF-IDF features.
kwargs = {
    'ngram_range': (1, 2),  # Use 1-grams + 2-grams.
    'analyzer': 'word',  # Split text into word tokens.
    'min_df': 1,  # Keep every term that appears in at least one document.
    'stop_words': "english",  # Drop the built-in English stop-word list.
    'max_features': 50000,  # Upper bound on the vocabulary size.
}
# Initialize the vectorizer.
vectorizer = TfidfVectorizer(**kwargs)

# Split the sample indices 80/20 into training/test sets. The generator is
# seeded so the split is the same on every run.
train_portion = .8
train_size = int(len(no_attag) * train_portion)
test_size = len(no_attag) - train_size
train_ind, test_ind = random_split(
    range(len(no_attag)),
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
train_data_X = [no_attag[ind] for ind in train_ind]
train_Y_list = [Y[ind] for ind in train_ind]

test_data_X = [no_attag[ind] for ind in test_ind]
test_Y_list = [Y[ind] for ind in test_ind]

# Fit the vocabulary / IDF weights on the training tweets only, then reuse
# the fitted vectorizer for the test tweets. `.toarray()` returns a plain
# ndarray (instead of the legacy np.matrix produced by `todense`), and the
# tensor is created directly as float32 to avoid an extra float64 copy.
vectorized_train_X = vectorizer.fit_transform(train_data_X)
train_X = torch.tensor(vectorized_train_X.toarray(), dtype=torch.float32)
vectorized_test_X = vectorizer.transform(test_data_X)
test_X = torch.tensor(vectorized_test_X.toarray(), dtype=torch.float32)

# Create tensors from the training/test labels.
train_Y = torch.tensor(train_Y_list)
test_Y = torch.tensor(test_Y_list)

# Vectorization Test

In [None]:
# Sanity check: print the vocabulary terms that have a non-zero TF-IDF
# weight in one training sample.
example = 1
print(
    vectorizer.get_feature_names_out()[train_X[example].ne(0)]
)

['com' 'com luka1qmrhf' 'flipagram' 'flipagram com' 'http'
 'http flipagram' 'luka1qmrhf']


# biLSTM model

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')


class biLSTM(nn.Module):
    """Bidirectional LSTM binary classifier over pre-computed feature vectors.

    The inputs are already numeric vectors (TF-IDF in this notebook), so no
    embedding layer is needed. The pipeline is
    LSTM -> dropout -> linear -> sigmoid.

    Args:
        h_dim: Size of the LSTM hidden state, per direction.
        input_dim: Length of each input feature vector (the TF-IDF size).
        lstm_layers: Number of stacked LSTM layers.
        dropout_rate: Drop probability of the dropout layer applied to the
            LSTM output.
    """

    def __init__ (self, h_dim = 10, input_dim = 100, lstm_layers = 1, dropout_rate = 0.2):
        super().__init__()

        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=h_dim,
                            num_layers=lstm_layers,
                            batch_first=True,
                            bidirectional=True)

        self.drop = nn.Dropout(p=dropout_rate)
        # Forward and backward hidden states are concatenated: 2 * h_dim.
        self.linear = nn.Linear(2 * h_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        """Return sigmoid probabilities for ``X``.

        Args:
            X: Either a ``(batch, input_dim)`` tensor holding one feature
                vector per sample, or a ``(batch, seq_len, input_dim)``
                tensor of sequences.

        Returns:
            A ``(batch, 1)`` tensor for 2-D input, or a
            ``(batch, seq_len, 1)`` tensor for 3-D input.
        """
        # nn.LSTM interprets a 2-D tensor as ONE unbatched sequence, which
        # would turn every sample of the batch into a time step and make
        # each prediction depend on the other rows. Give every sample its
        # own length-1 sequence instead, and undo that after the LSTM.
        one_vector_per_sample = X.dim() == 2
        if one_vector_per_sample:
            X = X.unsqueeze(1)

        X, _ = self.lstm(X)

        if one_vector_per_sample:
            X = X.squeeze(1)

        X = self.drop(X)
        X = self.linear(X)
        X = self.sigmoid(X)

        return X



# Model Training

In [None]:
def shuffle_data(X, Y):
    """Return ``X`` and ``Y`` reordered by one shared random permutation.

    The permutation is drawn with the ``random`` module, so seeding
    ``random`` makes the order reproducible. Both inputs must support
    indexing with a list of integer positions; the row pairing between
    ``X`` and ``Y`` is preserved.
    """
    order = [*range(len(X))]
    random.shuffle(order)
    return X[order], Y[order]

def train(train_X, train_Y, test_X, test_Y, model, print_results = False, n_epochs = 50):
    """Train ``model`` with Adam and binary cross-entropy, evaluating each epoch.

    Args:
        train_X: Training features, one row per sample.
        train_Y: Training labels (0/1) aligned with ``train_X``.
        test_X: Held-out features that are scored after every epoch.
        test_Y: Held-out labels (0/1) aligned with ``test_X``.
        model: Module that maps a batch of feature rows to probabilities.
            It is not moved by this function, so it must already live on
            ``device``.
        print_results: When True, print loss, accuracy and macro F1 after
            every epoch.
        n_epochs: Number of passes over the training data.

    Returns:
        None. The best epoch (by macro-averaged F1 on the held-out split)
        is printed once training finishes.
    """
    # Imported here because the file only does ``import tqdm``, which does
    # not guarantee that the ``tqdm.notebook`` submodule has been loaded.
    from tqdm.notebook import tqdm as progress_bar

    # Send tensors to the GPU if one is in use. Labels are kept in float64
    # because the loss below is computed in float64.
    train_X = train_X.to(device)
    train_Y = train_Y.to(device, dtype=torch.float64)
    test_X = test_X.to(device)
    # The reference labels never change, so convert them once up front.
    y_true = test_Y.cpu().int().tolist()

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_f = nn.BCELoss()

    # Initialize variables for training.
    batchSize = 10
    max_f1 = 0
    max_acc = 0
    max_epoch = 0

    # For each epoch
    for epoch in range(n_epochs):
        model.train()
        totalLoss = 0.0
        train_X_shuffle, train_Y_shuffle = shuffle_data(train_X, train_Y)

        for batch in progress_bar(range(0, len(train_X_shuffle), batchSize), leave=False):
            optimizer.zero_grad()
            x = train_X_shuffle[batch: batch + batchSize]
            y = train_Y_shuffle[batch: batch + batchSize]
            output = model(x)
            # reshape(-1) keeps a 1-D shape even when the final batch holds
            # a single sample; a bare squeeze() would collapse it to 0-d
            # and no longer match the shape of ``y``.
            loss = loss_f(output.reshape(-1).to(dtype=torch.float64), y)
            totalLoss += loss.item()
            loss.backward()
            optimizer.step()

        # Evaluate the model after each epoch. no_grad() avoids building an
        # autograd graph for the whole held-out set.
        model.eval()
        with torch.no_grad():
            probabilities = model(test_X).reshape(-1)
        y_pred = (probabilities > 0.5).int().tolist()
        # zero_division=0 reports the same 0.0 scores as the default but
        # without a warning when a class is never predicted.
        results = classification_report(
            y_true,
            y_pred,
            labels=[1, 0],
            digits=4,
            output_dict=True,
            zero_division=0,
        )
        if print_results:
            print("============================================")
            print(f"epoch {epoch + 1}")
            print(f"loss: {totalLoss:.4f}")
            print(f"accuracy: {results['accuracy']:.4f}")
            print(f"f1-score: {results['macro avg']['f1-score']:.4f}")
        if results['macro avg']['f1-score'] > max_f1:
            max_f1 = results['macro avg']['f1-score']
            max_acc = results['accuracy']
            max_epoch = epoch + 1
    print("============================================")
    print(f"Best result at epoch {max_epoch} f1-score: {max_f1:.4f} accuracy: {max_acc:.4f}")
        
        


In [None]:
# Build a 2-layer biLSTM with 200 hidden units per direction, sized to the
# TF-IDF feature count, move it to the selected device and train it for
# 5 epochs with per-epoch reporting.
lstm = biLSTM(input_dim=train_X.shape[1], h_dim=200, lstm_layers=2)
lstm = lstm.to(device)
train(
    train_X,
    train_Y,
    test_X,
    test_Y,
    lstm,
    n_epochs=5,
    print_results=True,
)

  0%|          | 0/1114 [00:00<?, ?it/s]

tensor([1, 1, 1, 1, 0, 1, 0, 0, 1, 0], device='cuda:0')
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
epoch 1
loss: 1039.8612
accuracy: 0.5149
f1-score: 0.3399


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1114 [00:00<?, ?it/s]

tensor([1, 1, 1, 1, 0, 1, 0, 0, 1, 0], device='cuda:0')
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
epoch 2
loss: 1038.6762
accuracy: 0.5149
f1-score: 0.3399


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1114 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Persist the trained model. The module object itself is serialized (not
# just its state_dict), so the biLSTM class definition has to be available
# wherever the file is loaded again.
torch.save(obj=lstm, f='biLSTM.pt')

# Hyper Parameter Tuning

In [None]:
# Grid of hyper-parameters to try: hidden size, number of stacked LSTM
# layers and dropout rate.
h_dimensions = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500]
lstm_layers = [1, 2, 3, 4, 5]
dropout_rates = [0.5]

# Train a fresh model for every combination (hidden size varies slowest,
# dropout rate fastest). train() prints the best epoch of the run; the
# configuration that produced it is printed right after.
for h_dim, lstm_l, d_rate in (
    (hidden, layers, rate)
    for hidden in h_dimensions
    for layers in lstm_layers
    for rate in dropout_rates
):
    lstm = biLSTM(
        input_dim=train_X.shape[1],
        h_dim=h_dim,
        lstm_layers=lstm_l,
        dropout_rate=d_rate,
    ).to(device)
    train(train_X, train_Y, test_X, test_Y, lstm, n_epochs=10, print_results=False)
    print(f"dimensions: {h_dim}, layers: {lstm_l}, dropout rate: {d_rate}")


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 8 f1-score: 0.9336 accuracy: 0.9336
dimensions: 10, layers: 1, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9294 accuracy: 0.9294
dimensions: 10, layers: 2, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 6 f1-score: 0.9294 accuracy: 0.9294
dimensions: 10, layers: 3, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 3 f1-score: 0.9327 accuracy: 0.9327
dimensions: 10, layers: 4, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 7 f1-score: 0.9348 accuracy: 0.9348
dimensions: 10, layers: 5, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 9 f1-score: 0.9403 accuracy: 0.9403
dimensions: 20, layers: 1, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 9 f1-score: 0.9365 accuracy: 0.9365
dimensions: 20, layers: 2, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9319 accuracy: 0.9319
dimensions: 20, layers: 3, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 3 f1-score: 0.9294 accuracy: 0.9294
dimensions: 20, layers: 4, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 7 f1-score: 0.9327 accuracy: 0.9327
dimensions: 20, layers: 5, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9415 accuracy: 0.9415
dimensions: 30, layers: 1, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 9 f1-score: 0.9344 accuracy: 0.9344
dimensions: 30, layers: 2, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9356 accuracy: 0.9357
dimensions: 30, layers: 3, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9319 accuracy: 0.9319
dimensions: 30, layers: 4, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 9 f1-score: 0.9340 accuracy: 0.9340
dimensions: 30, layers: 5, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 7 f1-score: 0.9394 accuracy: 0.9394
dimensions: 40, layers: 1, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 6 f1-score: 0.9373 accuracy: 0.9373
dimensions: 40, layers: 2, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9353 accuracy: 0.9353
dimensions: 40, layers: 3, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 6 f1-score: 0.9340 accuracy: 0.9340
dimensions: 40, layers: 4, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 4 f1-score: 0.9382 accuracy: 0.9382
dimensions: 40, layers: 5, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 7 f1-score: 0.9415 accuracy: 0.9415
dimensions: 50, layers: 1, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9286 accuracy: 0.9286
dimensions: 50, layers: 2, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9348 accuracy: 0.9348
dimensions: 50, layers: 3, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9315 accuracy: 0.9315
dimensions: 50, layers: 4, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9373 accuracy: 0.9373
dimensions: 50, layers: 5, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9407 accuracy: 0.9407
dimensions: 100, layers: 1, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 9 f1-score: 0.9315 accuracy: 0.9315
dimensions: 100, layers: 2, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9340 accuracy: 0.9340
dimensions: 100, layers: 3, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 2 f1-score: 0.9311 accuracy: 0.9311
dimensions: 100, layers: 4, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

Best result at epoch 5 f1-score: 0.9348 accuracy: 0.9348
dimensions: 100, layers: 5, dropout rate: 0.5


  0%|          | 0/958 [00:00<?, ?it/s]

  0%|          | 0/958 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Load the five YouTube comment CSV files, one DataFrame per video.
(
    comments_data1,
    comments_data2,
    comments_data3,
    comments_data4,
    comments_data5,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Youtube01-Psy.csv',
        'Youtube02-KatyPerry.csv',
        'Youtube03-LMFAO.csv',
        'Youtube04-Eminem.csv',
        'Youtube05-Shakira.csv',
    )
)

In [None]:
# Stack the five per-video tables row-wise into a single DataFrame. The
# original row indices of each file are kept as they are.
comments_data = pd.concat(
    objs=[
        comments_data1,
        comments_data2,
        comments_data3,
        comments_data4,
        comments_data5,
    ],
    axis=0,
)

In [None]:
# Encode the YouTube comments with the vectorizer that was fitted on the
# tweets, so they share the tweets' vocabulary and feature order.
vectorized_comments_X = vectorizer.transform(comments_data['CONTENT'])
# `.toarray()` yields a plain ndarray (not the legacy np.matrix returned by
# `todense`), and the tensor is created directly as float32.
comments_X = torch.tensor(vectorized_comments_X.toarray(), dtype=torch.float32)
# Reference labels taken from the 'CLASS' column.
comments_Y = comments_data['CLASS'].to_list()

biLSTM(
  (lstm): LSTM(50000, 200, batch_first=True, bidirectional=True)
  (drop): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=400, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Score the YouTube comments with the trained model.
lstm.eval()
# Inference only: no_grad() avoids building an autograd graph for the whole
# comment set. reshape(-1) keeps the result 1-D for any number of rows.
with torch.no_grad():
    output = lstm(comments_X.to(device)).reshape(-1)
# Threshold the probabilities at 0.5 to get 0/1 predictions.
output = (output > 0.5).int()
y_pred = output.tolist()
results = classification_report(comments_Y, y_pred, labels = [1, 0], digits = 4)
print(results)

              precision    recall  f1-score   support

           1     0.2955    0.0743    0.1187       175
           0     0.4706    0.8229    0.5988       175

    accuracy                         0.4486       350
   macro avg     0.3830    0.4486    0.3587       350
weighted avg     0.3830    0.4486    0.3587       350



In [None]:
# Display the raw training DataFrame as the cell output.
train_data

Unnamed: 0,Id,Tweet,following,followers,actions,is_retweet,location,Type
0,10091,It's the everything else that's complicated. #...,0.0,11500.0,,0.0,Chicago,Quality
1,10172,Eren sent a glare towards Mikasa then nodded a...,0.0,0.0,,0.0,,Quality
2,7012,I posted a new photo to Facebook http://fb.me/...,0.0,0.0,,0.0,"Scotland, U.K",Quality
3,3697,#jan Idiot Chelsea Handler Diagnoses Trump Wit...,3319.0,611.0,294.0,0.0,"Atlanta, Ga",Spam
4,10740,Pedophile Anthony Weiner is TERRIFIED of Getti...,4840.0,1724.0,1522.0,0.0,Blumberg,Spam
...,...,...,...,...,...,...,...,...
11963,7866,11:11 meet harry,0.0,0.0,0.0,0.0,,Quality
11964,7841,If BBC Food disappears the loss of knowledge w...,0.0,0.0,,0.0,"London, N4",Quality
11965,9090,Look What Liberals Did to This Historic Monume...,0.0,2.0,1192.0,0.0,honeymoon ãve,Spam
11966,6818,"I uploaded a new track, ""Everyday Lite 1"", on ...",0.0,0.0,0.0,0.0,Mega Manila,Quality
