In [3]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BigBirdTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BigBirdForSequenceClassification, GPT2Tokenizer, GPTNeoForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import nltk

In [2]:
!pip install Sentencepiece
!pip install transformers

Collecting Sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 33.4 MB/s eta 0:00:01[K     |▌                               | 20 kB 27.4 MB/s eta 0:00:01[K     |▉                               | 30 kB 19.7 MB/s eta 0:00:01[K     |█                               | 40 kB 17.1 MB/s eta 0:00:01[K     |█▍                              | 51 kB 9.2 MB/s eta 0:00:01[K     |█▋                              | 61 kB 9.9 MB/s eta 0:00:01[K     |██                              | 71 kB 9.6 MB/s eta 0:00:01[K     |██▏                             | 81 kB 10.7 MB/s eta 0:00:01[K     |██▍                             | 92 kB 11.2 MB/s eta 0:00:01[K     |██▊                             | 102 kB 9.0 MB/s eta 0:00:01[K     |███                             | 112 kB 9.0 MB/s eta 0:00:01[K     |███▎                            | 122 kB 9.0 MB/s eta 0:00:01[K     |███▌    

In [4]:
from google.colab import drive
drive._mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
os.chdir('drive/MyDrive/machine_learning')

In [292]:
np.load('fc-bigbird-embedding_train_stats_epoch3.npy', allow_pickle=True)

array([{'epoch': 1, 'train_loss': tensor(0.1993, device='cuda:0', requires_grad=True), 'train_acc': 0.9258950211010212, 'val_loss': tensor(0.2768, device='cuda:0'), 'val_acc': 0.9025754183793306}],
      dtype=object)

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class textCNN(nn.Module):
    def __init__(self, inplane=1, input_dim=768, num_conv=3, conv_size=[2,3,4], dropout_prob=0.2, dim_output=2):
        super(textCNN, self).__init__()
        
        D_words = input_dim # dimension of word embedding
        self.convs = nn.ModuleList([nn.Conv2d(inplane,num_conv,(K,input_dim)) for K in conv_size]) ## list of convolutional layers
        self.dropout = nn.Dropout(dropout_prob) 
        self.fc = nn.Linear(len(conv_size)*num_conv, dim_output) 
        
    def forward(self,x):
        #x.size = (batch_size, sequence_length, word_embedding)
        
        x = x.unsqueeze(1) #(N,C,W,D) (C=1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # len(conv_size)*(N,num_conv,W)
        x = [F.max_pool1d(line,line.size(2)).squeeze(2) for line in x]  # len(conv_size)*(N,num_conv)
        
        x = torch.cat(x,1) #(N,num_conv*len(conv_size))
        x = self.dropout(x)
        logit = self.fc(x)
        return logit

In [7]:
import torch.nn as nn

class LSTM_attention(nn.Module):
    def __init__(self, input_dim=768, hidden_size=256, num_layers=1, dim_output=2, bi_directional=True):
        super(LSTM_attention, self).__init__()
        
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bi_directional, bias=True)
        self.fc = nn.Linear((int(bi_directional)+1) * hidden_size, dim_output)

    def attention_layer(self,lstm_output, final_state):
        # lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
        # final_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        batch_size = len(lstm_output)
        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(2)
        # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        # attn_weights : [batch_size, n_step]
        soft_attn_weights = F.softmax(attn_weights,1)

        # context: [batch_size, n_hidden * num_directions(=2)]
        context = torch.bmm(lstm_output.transpose(1,2),soft_attn_weights.unsqueeze(2)).squeeze(2)

        return context, soft_attn_weights

    def forward(self, inputs):
        output, (final_hidden_state, final_cell_state) = self.lstm(inputs.permute(1, 0, 2))
        atten_output, attention = self.attention_layer(output.permute(1, 0, 2), final_hidden_state)
        output = self.fc(atten_output)
        
        return output

In [8]:
class transformer_classifier:
    def __init__(self, model, classifier):
        self.model = model.to(device)
        self.classifier = classifier.to(device)
  
    def __call__(self, x_id, token_type_ids, attention_mask, labels):
        with torch.no_grad():  
            word_embedding = self.model(x_id, token_type_ids=None, attention_mask=attention_mask, labels=labels)['hidden_states'][-1]
            logits = self.classifier(word_embedding)
    
    return logits.max(1)[1]

In [27]:
# # load dataset
data_train = pd.read_csv('dataset_binary_ablation.csv')
data_test = pd.read_csv('dataset_binary_test.csv')

X_train, y_train = data_train.data.tolist(), data_train.label.tolist()
X_test, y_test = data_test.data.tolist(), data_test.label.tolist()

In [16]:
def preprocess_data_transformer(transformer_name):
    if transformer_name == 'BERT':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    elif transformer_name == 'GPT2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
        tokenizer.pad_token = tokenizer.eos_token
    elif transformer_name == 'BIGBIRD':
        tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base', do_lower_case=True)

    for mode in ['train', 'test']:

        sample_ids = []
        attention_masks = []
      
        samples = X_train if mode == 'train' else X_test
        labels = y_train if mode == 'train' else y_test
        length = len(samples)
      
        for i, sent in enumerate(samples):
            encoded_dict = tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 100, truncation = True, \
                                              padding = 'max_length', return_attention_mask = True, return_tensors = 'pt')

            # Add the encoded sample and mask 
            sample_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
            print('\r----- Processing {}/{} {} samples'.format(i+1, length, mode), flush=True, end='')

        # Convert to pytorch tensors.
        sample_ids = torch.cat(sample_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        labels = torch.tensor(labels)
      
        if mode == 'train': train_dataset = TensorDataset(sample_ids, attention_masks, labels)
        else: test_dataset = TensorDataset(sample_ids, attention_masks, labels)

    return train_dataset, test_dataset  

In [17]:
transformer_names = ['BERT', 'GPT2', 'BIGBIRD']

for name in transformer_names:
    if name == 'BERT':
        bert_train_dataset, bert_test_dataset = preprocess_data_transformer(name)
    elif name == 'GPT2':
        gpt2_train_dataset, gpt2_test_dataset = preprocess_data_transformer(name)
    elif name == 'BIGBIRD':
        bigbird_train_dataset, bigbird_test_dataset = preprocess_data_transformer(name)     

----- Processing 217923/217923 test samples

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

----- Processing 217923/217923 test samples

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

----- Processing 217923/217923 test samples

In [18]:
# save preprocessed text
torch.save(bert_train_dataset, 'random_forest_bert_train.pt')
torch.save(bert_test_dataset, 'random_forest_bert_test.pt')

torch.save(gpt2_train_dataset, 'random_forest_gpt2_train.pt')
torch.save(gpt2_test_dataset, 'random_forest_gpt2_test.pt')

torch.save(bigbird_train_dataset, 'random_forest_bigbird_train.pt')
torch.save(bigbird_test_dataset, 'random_forest_bigbird_test.pt')

In [9]:
bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model1 = torch.load('bert-unfreeze.pkl')
model2 = transformer_classifier(bert, torch.load('LSTM-bert-embedding.pkl'))
model3 = transformer_classifier(bert, torch.load('cnn-embedding.pkl'))
model4 = torch.load('fc-gpt2.pkl')
# model5 = torch.load('fc-bigbird.pkl')
model5 = torch.load('fc-bigbird-epoch3.pkl')

models = {}
models['bert_unfreeze'] = model1.to(device)
models['bert_lstm'] = model2
models['bert_cnn'] = model3
models['gpt2'] = model4.to(device)
models['bigbird'] = model5.to(device)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
def predict(dataloader, model, name):
    features = []
    for b, (x_id, x_mask, y) in enumerate(dataloader):
        x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)

        if 'lstm' in name or 'cnn' in name:
            pred = model(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
    
        else:
            with torch.no_grad():  
                output = model(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                loss, logits = output['loss'], output['logits']
                pred = logits.max(1)[1]

    features.append(pred.cpu().detach().numpy())
  
    return np.concatenate(np.array(features), axis = 0)

In [17]:
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
random_forest_features_train = []
#random_forest_features_test = []

for name, model in models.items():
    print('create features by {}'.format(name))
    #model = model.to(device)
    if 'bigbird' not in name:
        continue

    if 'bert' in name:
        train_dataloader = DataLoader(bert_train_dataset, sampler = SequentialSampler(bert_train_dataset), batch_size = batch_size)
        test_dataloader = DataLoader(bert_test_dataset, sampler = SequentialSampler(bert_test_dataset), batch_size = batch_size)
    if 'gpt2' in name:
        train_dataloader = DataLoader(gpt2_train_dataset, sampler = SequentialSampler(gpt2_train_dataset), batch_size = batch_size)
        test_dataloader = DataLoader(gpt2_test_dataset, sampler = SequentialSampler(gpt2_test_dataset), batch_size = batch_size)
    if 'bigbird' in name:
        train_dataloader = DataLoader(bigbird_train_dataset, sampler = SequentialSampler(bigbird_train_dataset), batch_size = batch_size)
        test_dataloader = DataLoader(bigbird_test_dataset, sampler = SequentialSampler(bigbird_test_dataset), batch_size = batch_size)


    train_feature = predict(train_dataloader, model, name)
    test_feature = predict(test_dataloader, model, name)
    random_forest_features_train.append(train_feature) 
    random_forest_features_test.append(test_feature) 

create features by bert_unfreeze
create features by bert_lstm
create features by bert_cnn
create features by gpt2
create features by bigbird




In [141]:
# save the features of random forests
np.save('random_forest_train.npy', np.array(random_forest_features_train).T)
np.save('random_forest_test.npy', np.array(random_forest_features_test).T)

In [20]:
# load the features of random forests
X_train_rf = np.load('random_forest_train.npy')
X_test_rf = np.load('random_forest_test.npy')

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.model_selection import GridSearchCV


In [38]:
# 10-fold cross validation for random forests
rf = RandomForestClassifier(criterion='gini', class_weight = 'balanced', oob_score = True, random_state = 0)
parameters = {'n_estimators':[10, 50, 100, 200, 500], 'max_depth':[2, 3, 4, 5]}

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state = 0) #set random_state to be 0
clf = GridSearchCV(rf,parameters,scoring='roc_auc',n_jobs=-1,cv=kf)
clf.fit(X_train_rf, y_train)

print(clf.best_params_)

{'max_depth': 3, 'n_estimators': 100}


In [40]:
# test performance of random forests with the best parameters
clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
clf.fit(X_train_rf, y_train)
predict = clf.predict(X_test_rf)
accuracy = (predict == y_test).mean()

print('The test accuracy of random forest is {:.4f}'.format(accuracy))

The test accuracy of random forest is 0.9051


**Ensemble learning through neural networks:**

In [41]:
import torch.optim as optim
import torch.nn.functional as F

model = nn.Sequential(nn.Linear(5, 2))


train_features = torch.tensor(X_train_rf).type(torch.float)
test_features = torch.tensor(X_test_rf).type(torch.float)
train_labels =  torch.tensor(np.array(y_train)).type(torch.long)
test_labels =  torch.tensor(np.array(y_test)).type(torch.long)

train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

batch_size = 64
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
val_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

n_batch = int(len(train_dataset)/batch_size)
max_epoch = 20

optimizer = torch.optim.SGD(model.parameters(), lr = 0.01) # the learning rate is suggested by the authors
#scheduler =  optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
criterion = F.cross_entropy

print('Training start!')
for e in range(max_epoch):
    # train model
    model.train()
    
    epoch_loss = 0
    train_acc = 0
    for b, (x, y) in enumerate(train_dataloader):
        x,y = x.to(device), y.to(device)
        optimizer.zero_grad()

        output = model(x)
        loss = criterion(output, y)
        epoch_loss += loss
        train_acc += (output.max(1)[1] == y).float().mean().item()

        loss.backward()
        optimizer.step()
        #scheduler.step()
    
        print("\rEpoch: {:d} batch: {:d} / {} loss: {:.4f} | {:.2%}".format(e + 1, b, n_batch, loss, b*1.0/n_batch), end='', flush=True)
    
    print("\n----- Epoch {} ------\nTraining loss: {}".format(e+1, epoch_loss / len(train_dataloader)))
    print("Training accuracy: {}".format(train_acc / len(train_dataloader)))

    # evaluate model
    model.eval()
  
    eval_acc = 0
    eval_loss = 0
    nb_eval_steps = 0
  
    for b, (x, y) in enumerate(val_dataloader):
        x,y = x.to(device), y.to(device)
      
        with torch.no_grad():
            output = model(x)
            loss = criterion(output, y)

        eval_loss += loss
        eval_acc += (output.max(1)[1] == y).float().mean().item()

    print("Validation loss: {}".format(eval_loss / len(val_dataloader)))
    print("Validation accuracy: {}".format(eval_acc / len(val_dataloader)))
    print("\n")


Training start!
Epoch: 1 batch: 3064 / 3064 loss: 0.2991 | 100.00%
----- Epoch 1 ------
Training loss: 0.3281095027923584
Training accuracy: 0.8904152586837383
Validation loss: 0.2779266834259033
Validation accuracy: 0.9045388652378156


Epoch: 2 batch: 3064 / 3064 loss: 0.1513 | 100.00%
----- Epoch 2 ------
Training loss: 0.27535608410835266
Training accuracy: 0.9051395362391946
Validation loss: 0.27519628405570984
Validation accuracy: 0.9045251027598356


Epoch: 3 batch: 3064 / 3064 loss: 0.2584 | 100.00%
----- Epoch 3 ------
Training loss: 0.2742358446121216
Training accuracy: 0.9051166686109384
Validation loss: 0.27490130066871643
Validation accuracy: 0.9045251027598356


Epoch: 4 batch: 3064 / 3064 loss: 0.3379 | 100.00%
----- Epoch 4 ------
Training loss: 0.27403220534324646
Training accuracy: 0.9051166686109384
Validation loss: 0.27480411529541016
Validation accuracy: 0.9045251027598356


Epoch: 5 batch: 3064 / 3064 loss: 0.3827 | 100.00%
----- Epoch 5 ------
Training loss: 0.27