In [1]:
import transformers
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

In [2]:
from torch import optim
from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
import pandas as pd
import re
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import numpy as np
import pickle

In [4]:
# Seed handed to the PyTorch RNGs (torch.manual_seed / torch.cuda.manual_seed_all).
SEED = 123456789
# random_state passed to both train_test_split calls so the data splits are repeatable.
RANDOM_STATE = 1

In [5]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Seed the RNG of every visible GPU; skipped entirely on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [7]:
# Read the labelled dataset from disk.
# (An alternative file used earlier: ./simple_dataset/urldata_ita.csv)
df = pd.read_csv("data.csv")

# Not the last expression of the cell, so this preview is evaluated but not rendered.
df.head()

sentences = df.Domain.values

# Collect the binary targets as plain Python ints.
# Rows whose Label is neither 0 nor 1 are skipped, so `labels` can end up
# shorter than the DataFrame.
# NOTE(review): confirm it still lines up one-to-one with `trees`.
target_clean_train = []
for x in df.Label:
    if x == 0 or x == 1:
        target_clean_train.append(1 if x == 1 else 0)

labels = np.array(target_clean_train)

def unplickle_trees(path_tree_file):
    """Load every object pickled back-to-back in ``path_tree_file``.

    The file is read with repeated ``pickle.load`` calls until the end of the
    file is reached. Each loaded object is then converted with
    ``torch.FloatTensor``.

    Args:
        path_tree_file: Path of the pickle file to read (opened in binary mode).

    Returns:
        A list of ``torch.FloatTensor`` objects, one per pickled record, in
        the order they appear in the file. An empty file yields an empty list.

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on files
        from a trusted source.
    """
    print('read DTKs:', path_tree_file)
    records = []
    with open(path_tree_file, 'rb') as stream:
        while True:
            try:
                record = pickle.load(stream)
            except EOFError:
                # End of file reached: every record has been read.
                break
            records.append(record)

    tensors = []
    for record in records:
        tensors.append(torch.FloatTensor(record))
    return tensors

# Load the pre-computed per-sample tree vectors; they are the model inputs below.
trees = unplickle_trees(path_tree_file='trees.pkl')

In [9]:
# Seed the global PyTorch RNG before any torch-side randomness is consumed.
torch.manual_seed(SEED)

# First hold out 30% of the samples as the test set ...
X_train, X_test, target_train, target_test = train_test_split(
    trees,
    labels,
    test_size=0.3,
    random_state=RANDOM_STATE,
)
# ... then reserve 10% of what is left for validation.
X_train, X_validation, target_train, target_validation = train_test_split(
    X_train,
    target_train,
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Stack the per-sample tensors of each split into a single tensor per split.
X_train, X_validation, X_test = (
    torch.stack(part) for part in (X_train, X_validation, X_test)
)
# Convert the label arrays of each split to tensors.
target_train, target_validation, target_test = (
    torch.tensor(part) for part in (target_train, target_validation, target_test)
)

X_train.shape, X_validation.shape, X_test.shape

(torch.Size([2351, 4000]), torch.Size([262, 4000]), torch.Size([1121, 4000]))

In [10]:
batch_size = 32

# Pair each split's features with its labels.
train_data = TensorDataset(X_train, target_train)
validation_data = TensorDataset(X_validation, target_validation)
test_data = TensorDataset(X_test, target_test)

# Training batches are drawn in random order; validation and test keep dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(
    dataset=train_data, batch_size=batch_size, sampler=train_sampler
)
validation_dataloader = DataLoader(
    dataset=validation_data, batch_size=batch_size, sampler=validation_sampler
)
test_dataloader = DataLoader(
    dataset=test_data, batch_size=batch_size, sampler=test_sampler
)

In [11]:
class KERMIT(nn.Module):
    """Feed-forward classifier: three ReLU+dropout hidden layers and a linear head.

    Args:
        input_dim_dt: Width of the input feature vector ``x_synth``.
        output_dim: Number of output logits.
        hidden_dims: Widths of the three hidden layers. Defaults to
            ``(8000, 4000, 2000)``, the sizes that were previously hard-coded.
        dropout_p: Dropout probability applied after each hidden layer.
            Defaults to ``0.1``, the previously hard-coded value.
    """

    def __init__(self, input_dim_dt, output_dim, hidden_dims=(8000, 4000, 2000), dropout_p=0.1):
        super().__init__()
        hidden_1, hidden_2, hidden_3 = hidden_dims
        self.dropout_p = dropout_p
        # Layer names and creation order are unchanged, so existing
        # state_dicts and seeded initialisations remain compatible.
        self.fc1 = torch.nn.Linear(input_dim_dt, hidden_1)
        self.fc2 = torch.nn.Linear(hidden_1, hidden_2)
        self.fc3 = torch.nn.Linear(hidden_2, hidden_3)
        self.synth_sem_linear = nn.Linear(hidden_3, output_dim)

    def forward(self, x_synth):
        """Return the raw logits for a batch of input vectors ``x_synth``."""
        for layer in (self.fc1, self.fc2, self.fc3):
            # F.dropout defaults to training=True; tie it to the module mode so
            # that model.eval() really disables dropout.
            x_synth = F.dropout(
                F.relu(layer(x_synth)), p=self.dropout_p, training=self.training
            )
        x_tot = self.synth_sem_linear(x_synth)
        return x_tot

class KERMIT_2(nn.Module):
    """Feed-forward classifier: three ReLU+dropout hidden layers and a linear head.

    Args:
        input_dim_dt: Width of the input feature vector ``x_synth``.
        output_dim: Number of output logits.
        hidden_dims: Widths of the three hidden layers. Defaults to
            ``(4000, 4000, 2000)``, the sizes that were previously hard-coded.
        dropout_p: Dropout probability applied after each hidden layer.
            Defaults to ``0.1``, the previously hard-coded value.
    """

    def __init__(self, input_dim_dt, output_dim, hidden_dims=(4000, 4000, 2000), dropout_p=0.1):
        super().__init__()
        hidden_1, hidden_2, hidden_3 = hidden_dims
        self.dropout_p = dropout_p
        # Layer names and creation order are unchanged, so existing
        # state_dicts and seeded initialisations remain compatible.
        self.fc1 = torch.nn.Linear(input_dim_dt, hidden_1)
        self.fc2 = torch.nn.Linear(hidden_1, hidden_2)
        self.fc3 = torch.nn.Linear(hidden_2, hidden_3)
        self.synth_sem_linear = nn.Linear(hidden_3, output_dim)

    def forward(self, x_synth):
        """Return the raw logits for a batch of input vectors ``x_synth``."""
        for layer in (self.fc1, self.fc2, self.fc3):
            # F.dropout defaults to training=True; tie it to the module mode so
            # that model.eval() really disables dropout.
            x_synth = F.dropout(
                F.relu(layer(x_synth)), p=self.dropout_p, training=self.training
            )
        x_tot = self.synth_sem_linear(x_synth)
        return x_tot

class BertForSequenceClassification(nn.Module):
    """Linear classification head on top of a ``bert-base-uncased`` encoder.

    The encoder is always run under ``torch.no_grad()``, so no gradient reaches
    it; only ``synth_sem_linear`` is trained through this module's outputs.

    NOTE(review): this class shares its name with a class in ``transformers``;
    take care if that class is ever imported into the same namespace.

    Args:
        input_dim_bert: Width of the encoder output fed to the linear head.
        input_dim_dt: Accepted for signature compatibility; not used.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim_bert, input_dim_dt, output_dim):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased").to("cuda" if torch.cuda.is_available() else "cpu")

        self.dropout = nn.Dropout(0.1)
        self.synth_sem_linear = nn.Linear(input_dim_bert, output_dim)

    def _first_token_embedding(self, x_sem, attention_mask=None):
        """Run the encoder without gradients and return its position-0 hidden state."""
        with torch.no_grad():
            # Pass the mask through so padded positions are not attended to.
            return self.bert(x_sem, attention_mask=attention_mask)[0][:, 0, :]

    def forward(self, x_sem, attention_mask, x_synth):
        """Return logits for the token ids ``x_sem``.

        ``x_synth`` is accepted for signature compatibility and is not used.
        """
        x_sem = self._first_token_embedding(x_sem, attention_mask)

        x_sem = self.dropout(x_sem)
        x_tot = self.synth_sem_linear(x_sem)

        return x_tot

    def get_activation(self, x_sem, x_synth, attention_mask=None):
        """Return log-softmax class scores computed without gradients.

        Mirrors ``forward`` (minus dropout). ``x_synth`` is accepted for
        signature compatibility but is not used: the head is sized for
        ``input_dim_bert`` features only, so concatenating ``x_synth`` onto the
        encoder output (as was done before) produced a shape mismatch.
        """
        x_sem = self._first_token_embedding(x_sem, attention_mask)
        with torch.no_grad():
            x_tot = self.synth_sem_linear(x_sem)
        out = F.log_softmax(x_tot, dim=1)
        return out

In [12]:
# Count how many samples of each class landed in every split (balance check).
unique_train, counts_train = np.unique(
    target_train.numpy(), return_counts=True
)
unique_validation, counts_validation = np.unique(
    target_validation.numpy(), return_counts=True
)
unique_test, counts_test = np.unique(
    target_test.numpy(), return_counts=True
)
print(counts_train, counts_validation, counts_test)

[1187 1164] [138 124] [542 579]


In [13]:
# 4000 must match the feature width of X_train; 2 is the number of classes.
model = KERMIT_2(4000,2)
criterion = nn.CrossEntropyLoss()
# Materialise the trainable parameters as a list (a bare filter() is a
# single-pass iterator) and actually hand them to the optimizer. Previously
# this selection was computed but never used.
parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(parameters, lr=2e-5)

In [14]:
# Move the model to the device selected earlier instead of hard-coding CUDA,
# so the notebook also runs on CPU-only machines. Module.to() returns the
# module, so the cell still displays its repr.
model.to(device)

KERMIT_2(
  (fc1): Linear(in_features=4000, out_features=4000, bias=True)
  (fc2): Linear(in_features=4000, out_features=4000, bias=True)
  (fc3): Linear(in_features=4000, out_features=2000, bias=True)
  (synth_sem_linear): Linear(in_features=2000, out_features=2, bias=True)
)

In [15]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose argmax equals the label.

    Args:
        preds: NumPy array of per-class scores; the predicted class is the
            argmax along axis 1.
        labels: NumPy array of true class ids; it is flattened before the
            comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:

# Per-step training losses across all epochs.
train_loss_set = []

epochs = 10
epoch = 0

for epoch in trange(1, epochs + 1, desc="Epoch"):
    model.train()

    tr_loss = 0
    nb_tr_steps = 0

    for batch in train_dataloader:
        # Use the globally selected device rather than hard-coding CUDA.
        batch = tuple(t.to(device) for t in batch)
        b_input_tree, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        target_hat = model(b_input_tree)

        loss = criterion(target_hat, b_labels)
        train_loss_set.append(loss.item())

        # Backward pass
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_steps += 1

    ## VALIDATION

    model.eval()
    eval_correct = 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_tree, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_tree)

        # Count correct predictions per sample. Averaging per-batch accuracies
        # would over-weight the last, smaller batch.
        eval_correct += (logits.argmax(dim=1) == b_labels).sum().item()
        nb_eval_examples += b_labels.size(0)
        nb_eval_steps += 1

    eval_accuracy = eval_correct / nb_eval_examples

    print("Epochs: {}".format(epoch))
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    print("Validation Accuracy: {}".format(eval_accuracy))

Epoch:  10%|█         | 1/10 [00:57<08:33, 57.05s/it]

Epochs: 1
Train loss: 0.3230704594453075
Validation Accuracy: 0.9007633587786259


Epoch:  20%|██        | 2/10 [01:53<07:34, 56.85s/it]

Epochs: 2
Train loss: 0.24372177953928142
Validation Accuracy: 0.8969465648854962


Epoch:  30%|███       | 3/10 [02:50<06:38, 56.99s/it]

Epochs: 3
Train loss: 0.21031774584397112
Validation Accuracy: 0.9045801526717557


In [None]:
# Per-batch logit arrays for the test set, in dataset order.
predictions = []
model.eval()

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_tree, b_labels = batch

    with torch.no_grad():
        logits = model(b_input_tree)
    logits = logits.detach().cpu().numpy()

    predictions.append(logits)

# Flatten once after the loop. Doing this inside the loop re-processed every
# accumulated batch on each iteration.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
from sklearn import metrics

In [None]:
# Per-class precision / recall / F1 on the held-out test split, 4 decimals.
print(
    metrics.classification_report(
        y_true=target_test.numpy(),
        y_pred=flat_predictions,
        digits=4,
    )
)