# Brian Lee - Twitter Spam Detection Through CNN Text Classifier

##Reference: [A deep learning model for Twitter spam detection](https://www.sciencedirect.com/science/article/pii/S2468696420300203)

In [138]:
# PYTORCH
import torch
import torch.nn as nn
from torch.optim import Adam

# Other ML Tools
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import TweetTokenizer, word_tokenize

# Basic Libraries
from collections import defaultdict
import re
import random


In [139]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Find Hyperlinks in string

In [140]:
def findUrl(string):
    # findall() has been used with valid conditions for urls in string
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = re.search(regex, string)
    return found

# Data Loader / Tokenizer

In [141]:
train_data = pd.read_csv('train.csv')
Y = list((train_data['Type'] == 'Quality').astype(int))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

tokenizer = TweetTokenizer()
hashtag = True
wordcount = defaultdict(int)
vocab_size = 500

lines = []
maxlen = 0
for data in train_data['Tweet']:

    line = ['<START>']

    tokens = tokenizer.tokenize(data.lower())

    for token in tokens:
        url = findUrl(token)
        if url:
            line.append('<URL>')
            wordcount['<URL>'] += 1
        elif token[0] == '#':
            if hashtag:
                line.append(token)
                wordcount[token] += 1
            else:
                line.append('<HASH>')
                wordcount['<HASH>'] += 1
        else:
            more_words = word_tokenize(token)
            for w in more_words:
                line.append(w)
                wordcount[w] += 1

    line.append('<END>')
    maxlen = max(maxlen, len(line))
    lines.append(line)

wordcount['<START>'] = len(train_data['Tweet'])
wordcount['<END>'] = len(train_data['Tweet'])

sorted_wordcounts = sorted(wordcount.items(), key = lambda item: item[1], reverse=True)

word2ind = {}
ind2word = {}

ind = 1
for k, v in sorted_wordcounts[:vocab_size - 1]:
    word2ind[k] = ind
    ind2word[ind] = k
    ind += 1

for k, v in sorted_wordcounts[vocab_size - 1:]:
    word2ind[k] = vocab_size
    ind2word[vocab_size - 1] = '<UKN>'

X = []

for line in lines:
    ind_line = []
    for word in line:
        ind_line.append(word2ind[word])
    
    if len(ind_line) < maxlen:
        ind_line += [0] * (maxlen - len(ind_line))
    
    X.append(ind_line)


In [142]:
def shuffle_data(X, Y):
    shuffled_X = []
    shuffled_Y = []
    indices = list(range(len(X)))
    random.shuffle(indices)
    for i in indices:
        shuffled_X.append(X[i])
        shuffled_Y.append(Y[i])
    return (shuffled_X, shuffled_Y)

In [143]:
def train(X, Y, cnn):
    optimizer = Adam(cnn.parameters(), lr = 0.01)
    loss_f = nn.NLLLoss()
    n_epochs = 10
    batchSize = 10
    for epoch in range(n_epochs):
        cnn.train()
        totalLoss = 0.0

        shuffled_X, shuffled_Y = shuffle_data(X, Y)
        X_train, Y_train = torch.tensor(shuffled_X, device = device), torch.tensor(shuffled_Y, dtype = float, device = device)

        for batch in range(len(X_train)//10 + 1):
            cnn.zero_grad()
            x = X_train[batch*batchSize: (batch + 1)*batchSize]
            y = Y_train[batch*batchSize: (batch + 1)*batchSize]
            y = y.to(dtype=int)
            output = cnn(x)
            loss = loss_f(output, y)
            totalLoss += loss.item()
            loss.backward()
            optimizer.step()

        print("total loss is: ", totalLoss)

In [144]:
class CNN(nn.Module):
    def __init__(self, NUM_CLASSES=2, VOCAB_SIZE=vocab_size, DIM_EMB=200):
        super(CNN, self).__init__()

        self.Embedding = nn.Embedding(VOCAB_SIZE + 1 , DIM_EMB)
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=DIM_EMB,
                      out_channels=2,
                      kernel_size=ks)
            for ks in range(2, 5)
        ])
        self.ReLU = nn.ReLU()
        self.MaxPool = nn.MaxPool1d
        self.Dropout = nn.Dropout()
        self.Linear = nn.Linear(6, NUM_CLASSES)
        self.LogSoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, X):
        E = self.Embedding(X).permute(0, 2, 1)
        R = [self.ReLU(conv1d(E)) for conv1d in self.conv1d_list]
        M = [self.MaxPool(kernel_size=r.shape[2])(r) for r in R]
        C = torch.cat([m.squeeze(dim=2) for m in M], dim = 1)
        L = self.Linear(C)
        X = self.LogSoftmax(L)
        return X

In [145]:
cnn = CNN().to(device)
train(X[1000:], Y[1000:], cnn)

total loss is:  525.3924923986197
total loss is:  444.56662080809474
total loss is:  432.5548881990835
total loss is:  415.68563915230334
total loss is:  413.3081519016996
total loss is:  395.66704229824245
total loss is:  388.2934821471572
total loss is:  381.0469605503604
total loss is:  388.5188425361994
total loss is:  386.07888509356417


In [146]:
def evaluate(model, test_X, test_Y):

  input = torch.tensor(test_X, dtype=int, device = device)
  model.eval()
  y_pred = model(input).squeeze().argmax(dim=1).tolist()
  y_true = test_Y
  print(classification_report(y_true, y_pred, labels=[1, 0], digits = 4))

evaluate(cnn, X[:1000], Y[:1000])

              precision    recall  f1-score   support

           1     0.8494    0.7186    0.7786       526
           0     0.7333    0.8586    0.7911       474

    accuracy                         0.7850      1000
   macro avg     0.7914    0.7886    0.7848      1000
weighted avg     0.7944    0.7850    0.7845      1000

