In [2]:
import torch
import torch.nn as nn
import random
import tqdm
import torch.optim as optim
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
from collections import defaultdict
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Fetch the 'punkt' tokenizer data into the local nltk_data directory.
# Presumably required by `word_tokenize`, which the tokenizer cell calls — verify
# against the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Find Hyperlinks in string

In [4]:
def findUrl(string):
    """Look for the first URL-like substring in `string`.

    Uses `re.search`, so only the first occurrence is reported.

    Returns:
        The `re.Match` object for the first match, or None when nothing in
        `string` matches the pattern.
    """
    # Case-insensitive pattern. It starts at a word boundary with one of:
    # "http://" / "https://", "www" + up to three digits + ".", or a
    # domain-like prefix ending in "/". The remainder is non-space characters
    # or balanced parentheses, and the match must not end on punctuation.
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return re.search(url_pattern, string)

In [6]:
# Load the labelled tweets. The binary target is 1 where the 'Type' column
# equals 'Quality' and 0 for every other value.
train_data = pd.read_csv('train.csv')
Y = list((train_data['Type'] == 'Quality').astype(int))
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Data Loader / Tokenizer

In [7]:
# Tokenise every tweet, build a frequency-ranked vocabulary, and encode the
# tweets as equal-length lists of integer ids (`X`).
#
# Id layout:
#   0                  padding
#   1 .. vocab_size-1  the most frequent tokens, in descending frequency
#   vocab_size         shared id for every remaining (out-of-vocabulary) token
#
# NOTE(review): counts are taken over every row of train.csv, including the
# rows later sliced off as X[:1000] for evaluation — confirm this is intended.
train_data = pd.read_csv('train.csv')
tokenizer = TweetTokenizer()
hashtag = True   # True: keep each hashtag as its own token; False: collapse all to '<HASH>'
wordcount = defaultdict(int)
vocab_size = 500

lines = []
maxlen = 0

for data in train_data['Tweet']:

    line = ['<START>']

    tokens = tokenizer.tokenize(data.lower())

    for token in tokens:
        url = findUrl(token)
        if url:
            # Any token containing a URL is replaced by a single placeholder.
            line.append('<URL>')
            wordcount['<URL>'] += 1
        elif token[0] == '#':
            if hashtag:
                line.append(token)
                wordcount[token] += 1
            else:
                line.append('<HASH>')
                wordcount['<HASH>'] += 1
        else:
            # Run the tweet-level token through word_tokenize as well, which
            # may split it into several pieces.
            more_words = word_tokenize(token)
            for w in more_words:
                line.append(w)
                wordcount[w] += 1

    line.append('<END>')
    maxlen = max(maxlen, len(line))
    lines.append(line)

# <START>/<END> occur exactly once per tweet; they are not counted in the loop.
wordcount['<START>'] = len(train_data['Tweet'])
wordcount['<END>'] = len(train_data['Tweet'])

sorted_wordcounts = sorted(wordcount.items(), key = lambda item: item[1], reverse=True)

word2ind = {}
ind2word = {}

# Ids start at 1 because 0 is used as the padding value below.
ind = 1
for k, v in sorted_wordcounts[:vocab_size - 1]:
    word2ind[k] = ind
    ind2word[ind] = k
    ind += 1

# All rarer tokens share the out-of-vocabulary id `vocab_size`.
for k, v in sorted_wordcounts[vocab_size - 1:]:
    word2ind[k] = vocab_size

# The reverse entry has to sit at the id the tokens are actually encoded with.
# Writing it to `vocab_size - 1` would overwrite the last in-vocabulary word
# and leave id `vocab_size` without a reverse mapping.
ind2word[vocab_size] = '<UKN>'

X = []

for line in lines:
    ind_line = []
    for word in line:
        ind_line.append(word2ind[word])

    # Right-pad with 0 so every row has length `maxlen`.
    if len(ind_line) < maxlen:
        ind_line += [0] * (maxlen - len(ind_line))

    X.append(ind_line)


# LSTM model

In [28]:
class biLSTM(nn.Module):
    """Bidirectional-LSTM binary classifier over padded token-id sequences.

    Pipeline: embedding -> dropout -> bi-LSTM -> mean over time steps ->
    linear -> sigmoid.

    Args:
        h_dim: hidden size of each LSTM direction.
        e_dim: embedding dimension.
        num_embeddings: number of rows in the embedding table. When None
            (the default) it falls back to the notebook-level
            `vocab_size + 1`, which is the original behaviour.
    """

    def __init__ (self, h_dim = 10, e_dim = 10, num_embeddings = None):
        super(biLSTM, self).__init__()

        if num_embeddings is None:
            # Original behaviour: size the table from the global vocabulary.
            num_embeddings = vocab_size + 1

        self.h_dim = h_dim
        self.e_dim = e_dim
        # Id 0 is the padding id; its embedding is not updated during training.
        self.embedding = nn.Embedding(num_embeddings = num_embeddings, embedding_dim = self.e_dim, padding_idx = 0)
        self.pool = torch.nn.AdaptiveAvgPool1d(output_size=1)

        self.lstm = nn.LSTM(input_size = self.e_dim,
                            hidden_size = self.h_dim,
                            num_layers = 1,
                            batch_first = True,
                            bidirectional = True)

        self.drop = nn.Dropout(p = 0.5)
        self.linear = nn.Linear(2 * self.h_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, X):
        """Return one probability per sequence.

        Args:
            X: integer id tensor; assumed to be (batch, seq_len) — TODO confirm
                against callers.

        Returns:
            Tensor of shape (batch, 1) with values in [0, 1].
        """
        X = self.embedding(X)
        X = self.drop(X)
        X, _ = self.lstm(X)      # (batch, seq_len, 2 * h_dim)
        X = X.permute(0,2,1)     # (batch, 2 * h_dim, seq_len) so the pool runs over time
        # NOTE(review): the average runs over every time step, padded ones included.
        X = self.pool(X)         # (batch, 2 * h_dim, 1)
        # Drop only the pooled axis. A bare squeeze() would also remove the
        # batch axis when batch == 1 and change the output shape.
        X = X.squeeze(-1)
        X = self.linear(X)
        X = self.sigmoid(X)

        return X
    



In [29]:
def shuffle_data(X, Y):
    """Return copies of `X` and `Y` reordered by one shared random permutation.

    The inputs are not modified. The permutation comes from the global
    `random` module state.

    Returns:
        A `(shuffled_X, shuffled_Y)` tuple of new lists.
    """
    order = list(range(len(X)))
    random.shuffle(order)
    # Apply the same permutation to both sequences so pairs stay aligned.
    shuffled_X = [X[pos] for pos in order]
    shuffled_Y = [Y[pos] for pos in order]
    return (shuffled_X, shuffled_Y)

# Train Model

In [69]:
def train(X, Y, lstm, n_epochs = 10, batchSize = 10, lr = 0.001):
    """Train `lstm` in place with Adam and binary cross-entropy.

    Each epoch reshuffles the data, iterates over it in mini-batches, and
    prints the sum of the per-batch losses.

    Args:
        X: sequence of equal-length integer id lists (one per example).
        Y: sequence of 0/1 labels aligned with `X`.
        lstm: model whose output holds one probability per input row.
        n_epochs: number of passes over the data.
        batchSize: number of examples per optimisation step.
        lr: Adam learning rate.
    """
    optimizer = optim.Adam(lstm.parameters(), lr = lr)
    loss_f = nn.BCELoss()
    # Put the data wherever the model's weights live, instead of relying on
    # a notebook-level `device` variable being in sync with the model.
    model_device = next(lstm.parameters()).device

    for epoch in range(n_epochs):
        lstm.train()
        totalLoss = 0.0

        shuffled_X, shuffled_Y = shuffle_data(X, Y)
        X_train = torch.tensor(shuffled_X, device = model_device)
        Y_train = torch.tensor(shuffled_Y, dtype = float, device = model_device)

        # Step by batchSize so the batch count follows the parameter and no
        # empty trailing batch appears when the data size is an exact multiple.
        for start in range(0, len(X_train), batchSize):
            lstm.zero_grad()
            x = X_train[start: start + batchSize]
            y = Y_train[start: start + batchSize]
            output = lstm(x)
            # reshape(-1) yields shape (batch,) even for a 1-example batch,
            # where a bare squeeze() would produce a scalar that does not
            # match the target's shape.
            loss = loss_f(output.reshape(-1).to(dtype = float), y)
            totalLoss += loss.item()
            loss.backward()
            optimizer.step()

        print("total loss is: ", totalLoss)
        
        


In [72]:
# Build the model on the selected device and train on every example after the
# first 1000; X[:1000] / Y[:1000] are the slice passed to evaluate() later on.
lstm = biLSTM(e_dim = 300, h_dim = 50).to(device)
train(X[1000:], Y[1000:], lstm)


total loss is:  588.1111591619334
total loss is:  476.3481225206878
total loss is:  434.0615752626056
total loss is:  402.93117795570083
total loss is:  387.1244311673069
total loss is:  365.8011303522309
total loss is:  357.699950162021
total loss is:  346.04006851951624
total loss is:  327.97060857919416
total loss is:  317.2938399278921


In [73]:
def evaluate(model, test_X, test_Y):
    """Print a classification report for `model` on the given examples.

    Probabilities above 0.5 are predicted as class 1, everything else as
    class 0. The whole set is scored in a single forward pass.

    Args:
        model: model whose output holds one probability per input row.
        test_X: sequence of equal-length integer id lists.
        test_Y: sequence of 0/1 ground-truth labels aligned with `test_X`.

    Returns:
        None; the report is printed.
    """
    # Put the data wherever the model's weights live.
    model_device = next(model.parameters()).device
    inputs = torch.tensor(test_X, dtype=int, device = model_device)

    model.eval()

    # No gradients are needed for scoring, so skip building the autograd graph
    # for the whole evaluation set.
    with torch.no_grad():
        # reshape(-1) keeps a 1-D result even for a single example, so
        # tolist() always returns a list.
        probs = model(inputs).reshape(-1)
    y_pred = (probs > 0.5).int().tolist()

    print(classification_report(test_Y, y_pred, labels=[1, 0], digits = 4))

# Score the trained model on the first 1000 examples, which the training call skipped.
evaluate(lstm, X[:1000], Y[:1000])

  

              precision    recall  f1-score   support

           1     0.8784    0.8099    0.8427       526
           0     0.8058    0.8755    0.8392       474

    accuracy                         0.8410      1000
   macro avg     0.8421    0.8427    0.8410      1000
weighted avg     0.8440    0.8410    0.8411      1000



In [47]:
# import libraries for reading data, exploring and plotting
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
# library for train test split
from sklearn.model_selection import train_test_split
# deep learning libraries for text pre-processing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Modeling 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

In [64]:
# Bidirectional LSTM spam-detection architecture
# (embedding -> bi-LSTM -> mean over time steps -> sigmoid unit)
embeding_dim = 300
# The input length must equal the padded width of the rows in X. Take it from
# `maxlen`, computed in the tokenizer cell, rather than hard-coding the value,
# so the model stays in sync if the data or tokenisation changes.
max_len = maxlen
n_lstm = 50
drop_lstm = 0.2
model2 = Sequential()
# vocab_size + 1 rows: ids run from 0 (padding) through vocab_size (out-of-vocabulary).
model2.add(Embedding(vocab_size + 1, embeding_dim, input_length=max_len))
# return_sequences=True keeps one output per time step for the pooling layer.
model2.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True)))
model2.add(GlobalAveragePooling1D())
model2.add(Dense(1, activation='sigmoid'))

In [65]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked during fit.
model2.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [66]:
# Print the per-layer output shapes and parameter counts.
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 81, 300)           150300    
                                                                 
 bidirectional_3 (Bidirectio  (None, 81, 100)          140400    
 nal)                                                            
                                                                 
 global_average_pooling1d_3   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 290,801
Trainable params: 290,801
Non-trainable params: 0
_________________________________________________________________


In [67]:
# Split the padded sequences in half: the first half is fitted on, the second
# half is used as validation data.
# Convert to NumPy arrays explicitly: X and Y are nested Python lists, and
# passing those straight to `fit` relies on an implicit conversion that is not
# accepted by every Keras version.
split_idx = len(X) // 2
training_padded = np.array(X[:split_idx])
train_labels = np.array(Y[:split_idx])
testing_padded = np.array(X[split_idx:])
test_labels = np.array(Y[split_idx:])

In [68]:
# Training
# Fit for up to 30 epochs; EarlyStopping halts the run once val_loss has gone
# 2 epochs without improving. The second half of the data is the validation set.
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model2.fit(training_padded, train_labels, epochs=num_epochs, 
                    validation_data=(testing_padded, test_labels),callbacks =[early_stop], verbose=2)

Epoch 1/30
187/187 - 10s - loss: 0.5995 - accuracy: 0.6671 - val_loss: 0.5003 - val_accuracy: 0.7619 - 10s/epoch - 53ms/step
Epoch 2/30
187/187 - 3s - loss: 0.5100 - accuracy: 0.7565 - val_loss: 0.4875 - val_accuracy: 0.7904 - 3s/epoch - 15ms/step
Epoch 3/30
187/187 - 3s - loss: 0.4580 - accuracy: 0.7886 - val_loss: 0.4701 - val_accuracy: 0.7670 - 3s/epoch - 15ms/step
Epoch 4/30
187/187 - 3s - loss: 0.4253 - accuracy: 0.8041 - val_loss: 0.4571 - val_accuracy: 0.7868 - 3s/epoch - 15ms/step
Epoch 5/30
187/187 - 3s - loss: 0.4096 - accuracy: 0.8199 - val_loss: 0.4572 - val_accuracy: 0.7955 - 3s/epoch - 15ms/step
Epoch 6/30
187/187 - 3s - loss: 0.4340 - accuracy: 0.8060 - val_loss: 0.4543 - val_accuracy: 0.7909 - 3s/epoch - 15ms/step
Epoch 7/30
187/187 - 3s - loss: 0.3895 - accuracy: 0.8250 - val_loss: 0.4212 - val_accuracy: 0.8065 - 3s/epoch - 15ms/step
Epoch 8/30
187/187 - 3s - loss: 0.3641 - accuracy: 0.8324 - val_loss: 0.4328 - val_accuracy: 0.7976 - 3s/epoch - 15ms/step
Epoch 9/30
187