In [None]:
import transformers
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

In [None]:
from torch import optim
from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import numpy as np
import pickle

In [None]:
# Seed handed to torch's random number generators (see the manual_seed calls below).
SEED = 123456789
# random_state handed to sklearn's train_test_split so every split uses the same shuffle.
RANDOM_STATE = 1

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Seed every RNG up front so a Restart & Run All is reproducible on CPU-only
# machines too (the original seeded CUDA only, and only when a GPU was present).
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
from torchtext.vocab import GloVe
from torchtext.data import get_tokenizer
# GloVe vectors: '6B' set, 300 dimensions per token.
# NOTE(review): torchtext presumably downloads/caches the vectors on first use — confirm before running offline.
embedding_glove = GloVe(name='6B', dim=300)
# torchtext's "basic_english" tokenizer; used below to split sentences into tokens.
tokenizer = get_tokenizer("basic_english")

In [None]:
def bag_of_embeddings(sentences, embedding_glove, tokenize=None):
    """Represent each sentence as the sum of the embeddings of its tokens.

    Args:
        sentences: iterable of strings.
        embedding_glove: object exposing ``get_vecs_by_tokens(token)`` that
            returns a 1-D tensor; its ``dim`` attribute (when present) gives
            the vector width, otherwise 300 is assumed.
        tokenize: optional callable turning a sentence into a sequence of
            tokens. Defaults to the notebook-level ``tokenizer``.

    Returns:
        A list with one 1-D tensor per sentence, in input order. A sentence
        that yields no tokens maps to an all-zero vector.
    """
    if tokenize is None:
        # Fall back to the tokenizer created in the setup cell.
        tokenize = tokenizer
    # Read the width from the embedding instead of hard-coding 300 so that
    # other vector sizes work as well.
    dim = getattr(embedding_glove, "dim", 300)

    sentences_boe = []
    for sentence in sentences:
        boe = torch.zeros(dim)
        for word in tokenize(sentence):
            boe += embedding_glove.get_vecs_by_tokens(word)
        sentences_boe.append(boe)
    return sentences_boe

def unplickle_trees(path_tree_file):
    """Read every object pickled back-to-back in a file and return them as float32 tensors.

    NOTE(review): this function is defined again in the next cell; that later
    definition shadows this one.

    Args:
        path_tree_file: path of the pickle file to read.

    Returns:
        A list with one ``torch.float32`` tensor per pickled object, in file order.
    """
    print('read DTKs:', path_tree_file)
    dt_trees = []
    # SECURITY: pickle.load can execute arbitrary code — only read files you trust.
    with open(path_tree_file, 'rb') as fr:
        while True:
            try:
                dt_trees.append(pickle.load(fr))
            except EOFError:
                # End of file reached: every pickled object has been read.
                break
    # torch.as_tensor converts the payload's values; the legacy
    # torch.FloatTensor(x) constructor treats an integer x as a size instead.
    return [torch.as_tensor(tree, dtype=torch.float32) for tree in dt_trees]

In [None]:
def setTarget(df):
    """Convert the ``target`` column of ``df`` into an array of 0/1 class ids.

    ``'Forum'`` or ``0`` maps to 0; ``'Market'`` or ``1`` maps to 1.

    Args:
        df: object with a ``target`` attribute that can be iterated
            (e.g. a pandas DataFrame with a ``target`` column).

    Returns:
        A numpy array with one class id per row, in row order.

    Raises:
        ValueError: if a row holds any other value. Skipping such rows
            silently would leave the labels shorter than, and misaligned
            with, the rows they describe.
    """
    target = []
    for position, x in enumerate(df.target):
        if x == 'Forum' or x == 0:
            target.append(0)
        elif x == 'Market' or x == 1:
            target.append(1)
        else:
            raise ValueError(
                "unexpected target value {!r} at position {}".format(x, position)
            )
    return np.array(target)

def unplickle_trees(path_tree_file):
    """Read every object pickled back-to-back in a file and return them as float32 tensors.

    NOTE(review): another definition of this function appears in an earlier
    cell; this later one is the definition callers actually get.

    Args:
        path_tree_file: path of the pickle file to read.

    Returns:
        A list with one ``torch.float32`` tensor per pickled object, in file order.
    """
    print('read DTKs:', path_tree_file)
    dt_trees = []
    # SECURITY: pickle.load can execute arbitrary code — only read files you trust.
    with open(path_tree_file, 'rb') as fr:
        while True:
            try:
                dt_trees.append(pickle.load(fr))
            except EOFError:
                # End of file reached: every pickled object has been read.
                break
    # torch.as_tensor converts the payload's values; the legacy
    # torch.FloatTensor(x) constructor treats an integer x as a size instead.
    return [torch.as_tensor(tree, dtype=torch.float32) for tree in dt_trees]

In [None]:
# Base names (without extension) of the surface-web CSV and its pickled tree file.
dataset1_surface = 'surface_web_Market'
dataset_tree_surface = 'dtk_trees_surface_web_COVID_Market'


# Base names (without extension) of the dark-web CSV and its pickled tree file.
dataset1_dark = 'DarkWeb_Covid_Market'
dataset_tree_dark = 'dtk_trees_DarkWeb_Covid_Market'


# NOTE(review): `path` is not defined in any cell visible here; it must be set
# (with a trailing separator, since it is string-concatenated) before this runs.
df_surface = pd.read_csv(path + dataset1_surface + '.csv')
df_dark = pd.read_csv(path + dataset1_dark + '.csv')

# Binary labels: 0 for every surface-web row, 1 for every dark-web row.
df_surface['target'] = [0]*len(df_surface)
df_dark['target'] = [1]*len(df_dark)

# Surface rows first, then dark rows; the original row indices are kept (no ignore_index).
list_df = [df_surface, df_dark]
df_ = pd.concat(list_df)

# Raw text of every row and the matching 0/1 labels.
sentences = df_.Text.values
labels = setTarget(df_)

In [None]:
# Raw text of every row of the combined frame (same values as `sentences` above).
sentences_train = df_.Text.values
# One summed-GloVe vector per sentence.
sentences_train_boe = bag_of_embeddings(sentences_train,embedding_glove)
# 0/1 labels in the same row order as `sentences_train`.
labels_train = setTarget(df_)

In [None]:
# Load the pickled tree vectors for each source.

trees_surface = unplickle_trees(path+dataset_tree_surface+'.pkl')
trees_dark = unplickle_trees(path+dataset_tree_dark+'.pkl')
# Surface trees first, then dark trees — the same order in which the two frames were concatenated.
trees = trees_surface + trees_dark

print('trees:', len(trees))

In [None]:

# Distinct lower-cased tokens that occur in the training sentences.
vocab_train = list({w.lower() for s in sentences_train for w in tokenizer(s)})
print('vocab_train size:',len(vocab_train))

# A token counts as missing when its embedding is entirely zero
# (NOTE(review): relies on torchtext returning a zero vector for unknown tokens — confirm).
# The original tested only the first component, which would also flag a real
# vector whose first component happens to be exactly 0.
missing_word = [
    w for w in vocab_train
    if not bool(embedding_glove.get_vecs_by_tokens(w).any())
]
print('missing_word size:',len(missing_word))

In [None]:
torch.manual_seed(SEED)

# Stage 1: hold out 30% of the data as the test set.
# Trees, sentence embeddings and labels are split in ONE call so the three stay
# aligned by construction (and sklearn rejects inputs of different lengths),
# instead of relying on separate calls sharing the same random_state.
(X_train_synt, X_test_synt,
 X_train_sem, X_test_sem,
 target_train_synt, target_test) = train_test_split(
    trees, sentences_train_boe, labels_train,
    random_state=RANDOM_STATE, test_size=0.3)
# Same labels under the second name the original cell defined.
target_train_sem = target_train_synt

# Stage 2: carve 10% of the remaining data out as the validation set.
(X_train_synt, X_validation_synt,
 X_train_sem, X_validation_sem,
 target_train, target_validation) = train_test_split(
    X_train_synt, X_train_sem, target_train_synt,
    random_state=RANDOM_STATE, test_size=0.1)

# Turn each list of per-example tensors into one batch-first tensor.
X_train_synt = torch.stack(X_train_synt)
X_train_sem = torch.stack(X_train_sem)
target_train = torch.tensor(target_train)

X_validation_synt = torch.stack(X_validation_synt)
X_validation_sem = torch.stack(X_validation_sem)
target_validation = torch.tensor(target_validation)

X_test_synt = torch.stack(X_test_synt)
X_test_sem = torch.stack(X_test_sem)
target_test = torch.tensor(target_test)

# Displayed as the cell output: shapes of every split.
X_train_synt.shape, X_train_sem.shape, X_validation_synt.shape, X_validation_sem.shape, X_test_synt.shape, X_test_sem.shape

In [None]:
batch_size = 1


def _build_loader(tensors, sampler_cls):
    """Wrap aligned tensors in a TensorDataset and return (dataset, sampler, loader)."""
    dataset = TensorDataset(*tensors)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training examples are drawn in random order; validation and test keep their order.
train_data, train_sampler, train_dataloader = _build_loader(
    (X_train_synt, X_train_sem, target_train), RandomSampler)

validation_data, validation_sampler, validation_dataloader = _build_loader(
    (X_validation_synt, X_validation_sem, target_validation), SequentialSampler)

test_data, test_sampler, test_dataloader = _build_loader(
    (X_test_synt, X_test_sem, target_test), SequentialSampler)

In [None]:
class BoW_model(nn.Module):
    """Feed-forward classifier: input_dim -> 150 -> 50 -> output_dim.

    Each hidden layer is followed by ReLU and dropout (p=0.1). The last layer
    returns raw scores; no softmax is applied here.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: size of each input vector.
            output_dim: number of output scores per example.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, 150)
        self.fc2 = torch.nn.Linear(150, 50)
        self.synth_sem_linear = nn.Linear(50, output_dim)

    def forward(self, x_synth):
        """Return the output scores for a batch of input vectors."""
        # training=self.training is required: F.dropout defaults to
        # training=True and would keep dropping units after model.eval().
        x_synth = F.dropout(F.relu(self.fc1(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.fc2(x_synth)), p=0.1, training=self.training)
        x_tot = self.synth_sem_linear(x_synth)
        return x_tot

class DTFF_Dario(nn.Module):
    """Feed-forward classifier: input_dim_dt -> 4000 -> 4000 -> 2000 -> output_dim.

    Each hidden layer is followed by ReLU and dropout (p=0.1). The last layer
    returns raw scores; no softmax is applied here.
    """

    def __init__(self, input_dim_dt, output_dim):
        """Create the layers.

        Args:
            input_dim_dt: size of each input vector.
            output_dim: number of output scores per example.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim_dt, 4000)
        self.fc2 = torch.nn.Linear(4000, 4000)
        self.fc3 = torch.nn.Linear(4000, 2000)
        self.synth_sem_linear = nn.Linear(2000, output_dim)

    def forward(self, x_synth):
        """Return the output scores for a batch of input vectors."""
        # training=self.training is required: F.dropout defaults to
        # training=True and would keep dropping units after model.eval().
        x_synth = F.dropout(F.relu(self.fc1(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.fc2(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.fc3(x_synth)), p=0.1, training=self.training)
        x_tot = self.synth_sem_linear(x_synth)
        return x_tot

class BoWandDT_(nn.Module):
    """Two-branch classifier that fuses the branches at the score level.

    Branch 1: input_dim_dt -> 4000 -> 2000 -> output_dim.
    Branch 2: input_dim_we -> 150 -> 50 -> output_dim.
    The two output_dim-sized results are concatenated and mapped to
    output_dim raw scores by a final linear layer. Every layer except the
    final one is followed by ReLU and dropout (p=0.1).
    """

    def __init__(self, input_dim_dt, input_dim_we, output_dim):
        """Create the layers.

        Args:
            input_dim_dt: size of each first-branch input vector.
            input_dim_we: size of each second-branch input vector.
            output_dim: number of output scores per example.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim_dt, 4000)
        self.fc2 = torch.nn.Linear(4000, 2000)
        self.synth_linear = nn.Linear(2000, output_dim)

        self.fc4 = torch.nn.Linear(input_dim_we, 150)
        self.fc5 = torch.nn.Linear(150, 50)
        self.sem_linear = nn.Linear(50, output_dim)

        self.synth_sem_linear = nn.Linear(output_dim*2, output_dim)

    def forward(self, x_synth, x_sem):
        """Return the output scores for a batch of (x_synth, x_sem) pairs.

        Both inputs are concatenated along dim=1, so they must be batched
        2-D tensors with the same batch size.
        """
        # training=self.training is required: F.dropout defaults to
        # training=True and would keep dropping units after model.eval().
        x_synth = F.dropout(F.relu(self.fc1(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.fc2(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.synth_linear(x_synth)), p=0.1, training=self.training)

        x_sem = F.dropout(F.relu(self.fc4(x_sem)), p=0.1, training=self.training)
        x_sem = F.dropout(F.relu(self.fc5(x_sem)), p=0.1, training=self.training)
        x_sem = F.dropout(F.relu(self.sem_linear(x_sem)), p=0.1, training=self.training)

        x_synth_sem = torch.cat((x_synth, x_sem), dim=1)
        x_tot = self.synth_sem_linear(x_synth_sem)
        return x_tot
    
class BoWandDT90_25epochs(nn.Module):
    """Two-input classifier that fuses the inputs before the classification head.

    x_synth is reduced input_dim_dt -> 4000 -> 2000 -> input_dim_we, then
    concatenated with the untouched x_sem. The concatenation goes through
    2*input_dim_we -> input_dim_we -> 150 -> 50 -> output_dim. Every layer
    except the final one is followed by ReLU and dropout (p=0.1); the final
    layer returns raw scores.
    """

    def __init__(self, input_dim_dt, input_dim_we, output_dim):
        """Create the layers.

        Args:
            input_dim_dt: size of each x_synth vector.
            input_dim_we: size of each x_sem vector.
            output_dim: number of output scores per example.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim_dt, 4000)
        self.fc2 = torch.nn.Linear(4000, 2000)
        self.synth_linear = nn.Linear(2000, input_dim_we)
        self.synth_sem_linear = nn.Linear(input_dim_we*2, input_dim_we)
        self.fc4 = torch.nn.Linear(input_dim_we, 150)
        self.fc5 = torch.nn.Linear(150, 50)
        self.out = nn.Linear(50, output_dim)

    def forward(self, x_synth, x_sem):
        """Return the output scores for a batch of (x_synth, x_sem) pairs.

        Both inputs are concatenated along dim=1, so they must be batched
        2-D tensors with the same batch size.
        """
        # training=self.training is required: F.dropout defaults to
        # training=True and would keep dropping units after model.eval().
        x_synth = F.dropout(F.relu(self.fc1(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.fc2(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.synth_linear(x_synth)), p=0.1, training=self.training)

        x_synth_sem = torch.cat((x_synth, x_sem), dim=1)

        x_sem = F.dropout(F.relu(self.synth_sem_linear(x_synth_sem)), p=0.1, training=self.training)
        x_sem = F.dropout(F.relu(self.fc4(x_sem)), p=0.1, training=self.training)
        x_sem = F.dropout(F.relu(self.fc5(x_sem)), p=0.1, training=self.training)
        x_tot = self.out(x_sem)
        return x_tot

class BoWandDT(nn.Module):
    """Two-branch classifier that fuses two 100-unit representations.

    Branch 1: input_dim_dt -> 2000 -> input_dim_we -> 100.
    Branch 2: input_dim_we -> 100.
    The two 100-unit results are concatenated and passed through
    200 -> 100 -> 50 -> output_dim. Every layer except the final one is
    followed by ReLU and dropout (p=0.2 after the fusion layer, p=0.1
    elsewhere); the final layer returns raw scores.
    """

    def __init__(self, input_dim_dt, input_dim_we, output_dim):
        """Create the layers.

        Args:
            input_dim_dt: size of each x_synth vector.
            input_dim_we: size of each x_sem vector.
            output_dim: number of output scores per example.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim_dt, 2000)
        self.fc3 = nn.Linear(2000, input_dim_we)
        self.synth_linear = nn.Linear(input_dim_we, 100)

        self.fc4 = torch.nn.Linear(input_dim_we, 100)
        self.concat = nn.Linear(200, 100)
        self.fc5 = torch.nn.Linear(100, 50)
        self.out = nn.Linear(50, output_dim)

    def forward(self, x_synth, x_sem):
        """Return the output scores for a batch of (x_synth, x_sem) pairs.

        Both inputs are concatenated along dim=1, so they must be batched
        2-D tensors with the same batch size.
        """
        # training=self.training is required: F.dropout defaults to
        # training=True and would keep dropping units after model.eval().
        x_synth = F.dropout(F.relu(self.fc1(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.fc3(x_synth)), p=0.1, training=self.training)
        x_synth = F.dropout(F.relu(self.synth_linear(x_synth)), p=0.1, training=self.training)

        x_sem = F.dropout(F.relu(self.fc4(x_sem)), p=0.1, training=self.training)

        x_synth_sem = torch.cat((x_synth, x_sem), dim=1)
        x_sem = F.dropout(F.relu(self.concat(x_synth_sem)), p=0.2, training=self.training)
        x_sem = F.dropout(F.relu(self.fc5(x_sem)), p=0.1, training=self.training)
        x_tot = self.out(x_sem)
        return x_tot

In [None]:
# Count how many examples carry each distinct label value in the train / validation / test splits.
unique_train, counts_train = np.unique(target_train, return_counts = True)
unique_validation, counts_validation = np.unique(target_validation, return_counts = True)
unique_test, counts_test = np.unique(target_test, return_counts = True)
# Printed order: train counts, validation counts, test counts.
print(counts_train, counts_validation, counts_test)

In [None]:
# Alternative architecture: BoWandDT(4000, 300, 2)
# Arguments: tree-vector size 4000, embedding size 300, 2 output classes.
model = BoWandDT90_25epochs(4000, 300, 2)
criterion = nn.CrossEntropyLoss()
# Only trainable parameters go to the optimizer. The original built this
# filter but then passed model.parameters() instead; it is materialized as a
# list so it can be iterated more than once.
parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(parameters, lr=2e-5)

In [None]:
# Move the model to the device selected earlier; model.cuda() would raise on a CPU-only machine.
model.to(device)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array-like of scores, one row per example.
        labels: numpy array of integer class ids, one per example.

    Returns:
        Number of correct predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)

In [None]:
# Per-step training losses, collected across all epochs.
train_loss_set = []
epochs = 5
epoch = 0  # stays 0 if `epochs` is 0; otherwise holds the 1-based epoch number

for epoch in trange(1, epochs + 1, desc="Epoch"):
    # ---- Training ----
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for batch in train_dataloader:
        # Use the selected device rather than a hard-coded .cuda() so the
        # loop also runs on CPU-only machines.
        batch = tuple(t.to(device) for t in batch)
        b_input_tree, b_input_we, b_labels = batch

        optimizer.zero_grad()
        target_hat = model(b_input_tree, b_input_we)
        loss = criterion(target_hat, b_labels)

        step_loss = loss.item()
        train_loss_set.append(step_loss)

        loss.backward()
        optimizer.step()

        tr_loss += step_loss
        nb_tr_steps += 1

    # ---- Validation ----
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_tree, b_input_we, b_labels = batch
        # No gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(b_input_tree, b_input_we)

        # flat_accuracy works on numpy arrays, so move everything to the CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    # Averages are taken over batches, not over individual examples.
    print("Epochs: {}".format(epoch))
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [None]:
# Raw scores for every test batch, in loader order.
predictions = []
model.eval()

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_tree, b_input_we, b_labels = batch

    with torch.no_grad():
        logits = model(b_input_tree, b_input_we)
    logits = logits.detach().cpu().numpy()

    predictions.append(logits)

# Flatten and argmax once, after the loop: the original redid both on every
# batch (quadratic work) and left flat_predictions undefined for an empty loader.
if predictions:
    flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
else:
    flat_predictions = np.array([], dtype=np.int64)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
from sklearn import metrics

In [None]:
# Compare the true test labels with the predicted classes; metrics are printed with 4 decimal places.
print(metrics.classification_report(target_test.numpy(), flat_predictions, digits=4))