In [1]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from urllib.parse import urlparse
import os
import numpy as np
from IPython.display import clear_output, display
import time
import pandas as pd

from Constants import Constants, specialTokenList, specialTokens
from All_Models import SSCL, GatedCNN, SelfAttnModel
from utils import getSampler
from Trainer import Trainer
from LoadData import loadingData


%matplotlib inline

'''
TODO:

1. Try Larger Vocab size
4. Do Tkinter

'''

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


class args(object):
    """Experiment configuration, used as a plain namespace of class attributes.

    The class is never instantiated in this notebook; it is passed as-is to
    ``loadingData`` and ``Trainer``, and later cells attach further attributes
    to it (e.g. ``device``).
    """

    # ------------------------------------------------------------------ Data
    dataset = ["HSpam14", "Honeypot"][1]  # the index selects the active dataset

    usingWeightRandomSampling = True
    # Changing the vocabulary size requires running the preprocessing again.
    vocab_size = 8000
    full_data = True
    validation_portion = 0.05
    test_portion = 0.04

    pickle_name = "FullPickleData{}Vocab.txt".format(vocab_size)
    pickle_name_beforeMapToIdx = "FullPickleDatabeforeMapToIdx.txt"

    dataset_path = ""  # load a dataset and setting

    # ---------------------------------------------------------- Architecture
    usingPretrainedEmbedding = False
    # 300 dimensions with pretrained embeddings, 512 otherwise.
    embedding_dim = 300 if usingPretrainedEmbedding else 512

    # GatedCNN
    GatedCNN_embedingDim = 128
    GatedCNN_convDim = 64
    GatedCNN_kernel = 3
    GatedCNN_stride = 1
    GatedCNN_pad = 1
    GatedCNN_layers = 8
    GatedCNN_dropout = 0.1

    # SSCL
    RNN_hidden = 256
    num_CNN_filter = 256
    CNN_kernel_size = 5
    LSTM_dropout = 0.1
    num_LSTM_layers = 1
    SSCL_CNN_dropout = 0.1

    # Self-attention
    attnLenMaxSeq = 280  # default value, changed later

    # These two values have to be the same.
    attnWordVecDim = 128
    attnModelDim = 128

    attnFFInnerDim = 256
    attnNumLayers = 3
    attnNumHead = 4
    attnKDim = 64
    attnVDim = 64
    attnDropout = 0.1

    # -------------------------------------------------------------- Training
    confusion_matrics = []

    batch_size = 64
    L2 = 0.1
    threshold = 0.5
    lr = 0.002
    n_epoch = 50

    # Used when the optimizer is Adam.
    adam_beta1 = 0.9
    adam_beta2 = 0.999
    adam_weight_decay = 0.01

    earlyStopStep = 5000  # set to None to disable the step-based early stop
    earlyStopEpoch = 1

    # --------------------------------------------------------------- Logging
    val_freq = 50
    val_steps = 3
    log_freq = 10
    model_save_freq = 1
    model_name = 'GatedCNN_Vocab8000_RandomWeightedSampling_WithDropout'
    model_path = './{}_Log/{}/Model/'.format(dataset, model_name)
    log_path = './{}_Log/{}/Log/'.format(dataset, model_name)

# Expose the selected torch device on the shared configuration object.
args.device = device

# Create the directories for saved models and training logs.
# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(args.model_path, exist_ok=True)
os.makedirs(args.log_path, exist_ok=True)

In [None]:
# Load the training / validation / test datasets plus `text` through the
# project's loadingData helper, configured by the attributes on `args`.
# NOTE(review): `text` presumably holds the vocabulary built during
# preprocessing - confirm in LoadData.
training_dataset, validation_dataset, test_dataset, text = loadingData(args)

Loading Origin Data and do the Proprocessing
Loading Honeypot dataset
Data Splitation
Number of Training Data:  894927
Number of Validation Data:  45217
Number of Test Data:  1885
Preprocessing X_train
Preprocessing X_validation
Preprocessing X_test
Generating text
Original Vocab Size:  8080956
The Pickle Data beforeMapToIdx Dumped to: Honeypot/FullPickleDatabeforeMapToIdx.txt
Generating Datasets
Training set map to Idx
Validation set map to Idx


In [None]:
# Derive class-balance figures from the training set and store them on `args`
# (must run after the training set has been loaded).
# The last element of every sample is summed as the spammer count.
args.numberOfSpammer = sum(sample[-1] for sample in training_dataset)
args.numberOfNoSpammer = len(training_dataset) - args.numberOfSpammer
# NOTE(review): this takes element [2] of the first sample; the training loop
# unpacks batches as (texts, X, X_len, y), so this looks like the first
# sample's X_len - confirm it really equals the maximum sequence length.
args.len_max_seq = training_dataset[0][2]

In [None]:
# Build the sampler, data loaders, trainer and LR scheduler used by the
# training cell below.
if args.usingWeightRandomSampling:
    sampler = getSampler(training_dataset)
else:
    sampler = None

# DataLoader does not allow shuffle=True together with a sampler, so shuffle
# only when no sampler is supplied. Previously the training data was fed in a
# fixed order whenever weighted sampling was switched off.
train_loader = DataLoader(
    training_dataset, batch_size=args.batch_size, shuffle=(sampler is None),
    drop_last=False, sampler=sampler)
valid_loader = DataLoader(
    validation_dataset, batch_size=args.batch_size, shuffle=True, drop_last=False)

trainer = Trainer(GatedCNN, args).to(device)

print("Number of Parameters in this Model: ", trainer.num_all_params())
print("Using device: ", device)

# Multiply the learning rate by 0.85 every 2000 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(trainer.optim, 2000, gamma=0.85)

# Global counters that the training cell continues from.
allStep = 0
epoch = 0

In [None]:
# Show the architecture of the model wrapped by the trainer.
print("Model Structure: \n", trainer.model)

In [None]:
# Main training loop.
#
# Runs until `args.n_epoch` epochs are done or an early-stop limit
# (`args.earlyStopStep` / `args.earlyStopEpoch`) is reached. Early stopping
# leaves the loops with `break` instead of raising StopIteration, so the cell
# finishes cleanly and the following cells still run under "Run All".
stop_training = False

while epoch < args.n_epoch and not stop_training:
    for texts, X, X_len, y in train_loader:

        # Validation below switches to eval mode, so switch back every step.
        trainer.train()
        X, X_len, y = X.to(device), X_len.to(device), y.to(device)

        start_t = time.time()
        # NOTE(review): models that consume sequence lengths presumably take
        # (X, X_len) instead of X here - confirm against Trainer.
        trainer.train_step(X, y)
        end_t = time.time()

        # Step the scheduler after the optimizer update; stepping it first
        # skips the first value of the schedule. Assumes train_step performs
        # the optimizer update - TODO confirm. Decay stops below 1e-5.
        if trainer.optim.param_groups[0]['lr'] >= 0.00001:
            scheduler.step()

        allStep += 1
        print('| Epoch [%d] | Step [%d] | lr [%.6f] | Loss: [%.4f] | Acc: [%.4f] | Time: %.1fs' %
              (epoch, allStep, trainer.optim.param_groups[0]['lr'], trainer.loss.item(), trainer.accuracy.item(),
               end_t - start_t))

        if allStep % args.log_freq == 0:
            trainer.plot_train_hist(args.model_name)

        # Step-based early stop: leave without saving, as before.
        if args.earlyStopStep and allStep >= args.earlyStopStep:
            print("Early stop: reached step limit", args.earlyStopStep)
            stop_training = True
            break

        if allStep % args.val_freq == 0:

            # Clamp the upper bound so a validation set that is not larger
            # than one batch no longer makes randint() raise.
            max_start = max(len(validation_dataset) - args.batch_size, 1)

            for _ in range(args.val_steps):
                trainer.eval()
                # Evaluate on a random contiguous slice of the validation set.
                stIdx = np.random.randint(0, max_start)
                v_text, v_X, v_X_len, v_y = validation_dataset[stIdx: stIdx +
                                                       args.batch_size]
                v_X, v_X_len, v_y = v_X.to(
                    device), v_X_len.to(device), v_y.to(device)
                start_t = time.time()
                trainer.test_step(v_X, v_y)
                end_t = time.time()
                print('| Epoch [%d] | Validation | Step [%d] |  Loss: [%.4f] | Acc: [%.4f] | Time: %.1fs' %
                      (epoch, allStep, trainer.loss.item(), trainer.accuracy.item(), end_t - start_t))
            trainer.calculateAverage()
            clear_output()
            print("TrainConfusion Matrix: \n")
            display(pd.DataFrame(trainer.cms['Train'][-1]))
            print("ValConfusion Matrix: \n")
            display(pd.DataFrame(trainer.cms['Val'][-1]))
            trainer.plot_all(args.model_name)

    if stop_training:
        break

    # End of epoch: bump the counter and save the model.
    epoch += 1
    trainer.model_save(epoch)

    # Epoch-based early stop.
    if args.earlyStopEpoch and epoch >= args.earlyStopEpoch:
        print("Early stop: reached epoch limit", args.earlyStopEpoch)
        stop_training = True


In [None]:
# Pull the whole test split in one slice and unpack its four parts directly,
# the same way the validation slice is unpacked in the training loop. The
# previous zip(...) / [0] round-trip produced the same values.
test_text, test_X, test_X_len, test_y = test_dataset[0:]
# Move the tensors to the training device; the raw texts stay where they are.
test_X, test_X_len, test_y = test_X.to(device), test_X_len.to(device), test_y.to(device)

In [None]:
# Evaluate once on the complete test split.
# The training loop calls trainer.train() on every step, so the trainer may
# still be in training mode here; switch to eval mode first so the test
# metrics are not computed with training-mode layers.
# NOTE(review): assumes Trainer.eval() behaves like nn.Module.eval() - confirm.
trainer.eval()
loss, accuracy, cm = trainer.test_step(test_X, test_y)

In [None]:
# Report the metrics returned by the test step, one line per metric.
for caption, value in (
        ("The Test Loss: ", loss.item()),
        ("The Test Accuracy: ", accuracy.item()),
        ("Test Confusion Matrix: \n", cm),
):
    print(caption, value)

## Need Dropout

In [None]:
# Read the first table of the exported HTML file and drop its first row.
# NOTE(review): the file name mentions HSpam14 while the directory comes from
# args.dataset - confirm the two are meant to match.
df = pd.read_html(
    os.path.join(args.dataset, "FullDataFromSQLHSpam14.html")
)[0].iloc[1:]

In [None]:
# Name the two columns of the loaded table.
# NOTE(review): assumes the table has exactly two columns - confirm.
df.columns = ['text', 'maliciousMark']