# Experiment: Generalist vs Specialists - BERT

In [1]:
from echr import *
from nb_tfidf import *
from bert import *
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from csv import DictWriter
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.base import clone
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import gc

%load_ext autoreload
%autoreload 2

In [2]:
# --- Experiment configuration (module-level names read by the cells below) ---

# Dataset directory. NOTE(review): no visible cell reads this global;
# generate_ensemble_dataset receives its own `path` argument instead.
path = 'datasets/Medvedeva/'
# Case file handed to create_dataset / generate_ensemble_dataset.
json_path = 'datasets/echrod/cases.json'
# Third argument of every create_dataset call (which part of a case is used as text).
part = 'facts'
# NOTE(review): not referenced by any visible cell.
article = '6'
# NOTE(review): not referenced by any visible cell; the training helpers
# always call balance_dataset regardless of this flag.
balance = True
# Batch size of the train and test DataLoaders.
batch_size = 16
# Number of training epochs passed to initialize_model and to the training loop.
epochs = 20

In [3]:
# Pick the compute device once: the first CUDA GPU when present, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice so the run log records which hardware was used.
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 3090


In [4]:
def _train_bert(model, train_dataloader, optimizer, scheduler, val_dataloader=None,
                epochs=4, evaluation=False, debug=0):
    """Fine-tune ``model`` in place with a cross-entropy loss.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` that
            returns logits.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Stepped once per batch.
        scheduler: Learning-rate scheduler, stepped once per batch.
        val_dataloader: Passed to ``evaluate`` after each epoch when
            ``evaluation`` is true; otherwise unused.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: Whether to run ``evaluate`` after every epoch.
        debug: Truthy to print the per-batch / per-epoch progress table.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    last_step = len(train_dataloader) - 1

    if debug: print("Start training...\n")
    for epoch_i in range(epochs):
        if debug:
            print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
            print("-"*70)

        # Timers and running losses are reset at the start of every epoch.
        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()
            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Report (and reset the windowed loss) every 20 batches and on the last batch.
            if (step % 20 == 0 and step != 0) or (step == last_step):
                time_elapsed = time.time() - t0_batch
                if debug: print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        if debug: print("-"*70)
        if evaluation == True:
            val_loss, val_accuracy = evaluate(model, val_dataloader, device)
            time_elapsed = time.time() - t0_epoch
            if debug: print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            if debug: print("-"*70)
        if debug: print("\n")
    if debug: print("Training complete!")


def _build_test_dataloader(X_test, tokenizer):
    """Tokenise the test texts once and wrap them in an ordered DataLoader."""
    test_inputs, test_masks = preprocessing_for_bert(X_test, tokenizer)
    test_dataset = TensorDataset(test_inputs, test_masks)
    return DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)


def _fit_bert(df_train, tokenizer, show_timer=False):
    """Balance ``df_train``, then initialise and fine-tune a fresh classifier on it.

    Reads the notebook-level ``device``, ``batch_size`` and ``epochs``.
    ``show_timer`` wraps the training call in a one-item tqdm bar, which only
    serves to display the elapsed time.
    """
    df_train = balance_dataset(df_train)
    X_train = df_train['text'].to_numpy()
    y_train = df_train['violation'].to_numpy()

    train_inputs, train_masks = preprocessing_for_bert(X_train, tokenizer)

    # Seed after preprocessing and before the model is built, as the original did.
    set_seed(42)

    train_data = TensorDataset(train_inputs, train_masks, torch.tensor(y_train))
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
    bert_classifier, optimizer, scheduler = initialize_model(device, train_dataloader, epochs=epochs)

    for _ in (tqdm([None]) if show_timer else [None]):
        _train_bert(bert_classifier, train_dataloader, optimizer, scheduler, epochs=epochs)
    return bert_classifier


def _predict_labels(classifier, test_dataloader, threshold=0.5):
    """Return 0/1 labels: 1 where the probability in column 1 reaches ``threshold``."""
    probs = bert_predict(classifier, test_dataloader)
    return np.where(probs[:, 1] >= threshold, 1, 0)


def _free_model_memory():
    """Collect garbage and, when CUDA is in use, release cached GPU blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def train_test(model, df_train, df_test):
    """Train one general classifier on ``df_train`` and score it on ``df_test``.

    Args:
        model: Unused; kept so the signature matches ``train_test_ensemble``.
        df_train: Frame with ``text`` and ``violation`` columns.
        df_test: Frame with ``text`` and ``violation`` columns.

    Returns:
        ``(metrics, predictions)`` where ``metrics`` is whatever
        ``return_metrics`` returns and ``predictions`` is a DataFrame with the
        columns ``final_pred`` and ``true_pred``. This is the shape that
        ``run_experiment`` unpacks; previously only ``metrics`` was returned,
        which made that call fail.
    """
    X_test = df_test['text'].to_numpy()
    y_test = df_test['violation'].to_numpy()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    bert_classifier = _fit_bert(df_train, tokenizer, show_timer=True)

    preds = _predict_labels(bert_classifier, _build_test_dataloader(X_test, tokenizer))

    del bert_classifier
    _free_model_memory()

    predictions = pd.DataFrame({'final_pred': preds, 'true_pred': y_test})
    return return_metrics(preds, y_test, show=False), predictions  # (acc, mcc, f1), frame


def train_test_ensemble(model, df_train_sets, df_test):
    """Train one specialist per training set and OR their predictions together.

    Args:
        model: Unused; kept for signature compatibility.
        df_train_sets: Mapping ``article -> DataFrame`` or ``article -> CSV path``
            (paths are read with ``pd.read_csv(...).dropna()``). Each frame needs
            ``text`` and ``violation`` columns.
        df_test: Frame with ``text`` and ``violation`` columns.

    Returns:
        ``(metrics, predictions)``. ``predictions`` has one column per article
        holding that specialist's own predictions, plus ``final_pred`` (the
        element-wise OR) and ``true_pred``.

    Raises:
        ValueError: If ``df_train_sets`` is empty.
    """
    if len(df_train_sets) == 0:
        raise ValueError('df_train_sets must contain at least one training set')

    # The test side does not depend on the specialist, so it is prepared once.
    X_test = df_test['text'].to_numpy()
    y_test = df_test['violation'].to_numpy()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    test_dataloader = _build_test_dataloader(X_test, tokenizer)

    all_preds = None
    full_preds = {}

    for art, df_train in tqdm(df_train_sets.items()):
        if isinstance(df_train, str):
            df_train = pd.read_csv(df_train).dropna()

        bert_classifier = _fit_bert(df_train, tokenizer)
        preds = _predict_labels(bert_classifier, test_dataloader)
        full_preds[art] = preds

        # OR layer: if any specialist predicts a violation, so does the ensemble.
        # np.maximum / copy allocate a new array, so the per-article column
        # stored above is never overwritten by the running ensemble result.
        all_preds = preds.copy() if all_preds is None else np.maximum(all_preds, preds)

        # Drop this specialist before the next one is initialised so that two
        # models are never resident at the same time.
        del bert_classifier
        _free_model_memory()

    full_preds['final_pred'] = all_preds
    full_preds['true_pred'] = y_test

    return return_metrics(all_preds, y_test), pd.DataFrame.from_dict(full_preds)  # (acc, mcc, f1), frame


In [5]:
# Default seed, and the ten seeds (1..10) available for repeated runs.
random_state = 1995
random_states = list(range(1, 11))

def generate_ensemble_dataset(path, random_state, write=False, debug=None):
    """Build the general, per-article and "improved" per-article datasets.

    For every article a stratified 10% split is held out; the union of those
    hold-outs defines the general test set. The general training set is every
    case of the 'All' dataset that is not held out.

    The "improved" per-article training sets add cases from outside the
    article, relabelled ``violation = 0`` and ``article = 999``, until the
    non-violation count matches the violation count. These fillers are drawn
    from the general *training* set only, so held-out test cases can never
    leak into a specialist's training data.

    Args:
        path: Location passed to ``create_dataset`` for the per-article sets.
            The 'All' dataset is always read from the notebook-level ``json_path``.
        random_state: Seed for the train/test split and for every sampling step.
        write: If true, write balanced copies to ``results/ensemble/temp/`` and
            return four ``None`` values to keep memory low.
        debug: Print per-article sizes. ``None`` (default) falls back to a
            notebook-level ``debug`` flag if one exists, else ``False``.

    Returns:
        ``(train_dbs, improved_train_dbs, train_general, test_general)``, or
        ``(None, None, None, None)`` when ``write`` is true.
    """
    if debug is None:
        # Keep honouring the notebook-wide flag without failing when it is not defined yet.
        debug = globals().get('debug', False)

    article_numbers = ['2', '3', '5', '6', '8', '10', '11', '13', '14']
    test_size = 0.1
    max_violation_cases = 2500  # cap per article to limit memory
    out_dir = 'results/ensemble/temp/'

    datasets = {art: create_dataset(path, art, part) for art in tqdm(article_numbers)}
    df_all = create_dataset(json_path, 'All', part)

    # Hold out a stratified share of every article dataset.
    train_dbs = {}
    held_out = []
    for art, db in datasets.items():
        db['article'] = art
        train, test = train_test_split(db, test_size=test_size, random_state=random_state, stratify=db['violation'])
        train_dbs[art] = train
        held_out.append(test)
    test_db = pd.concat(held_out)

    # General (binary) test and training sets, disjoint by case id.
    is_held_out = df_all['id'].isin(test_db['id'])
    test_general = df_all[is_held_out]
    train_general = df_all[~is_held_out]

    improved_train_dbs = {}
    for art, db in train_dbs.items():
        # Filler candidates: general *training* cases that are not in this
        # article's training set. assign() returns a new frame, so no slice of
        # df_all is mutated.
        additional_cases = train_general[~train_general['id'].isin(db['id'])].assign(
            violation=0,
            article=999,  # 999 marks "belongs to another article"
        )

        violation_cases = db[db['violation'] == 1]
        nonviolation_cases = db[db['violation'] == 0]
        violation_cases = violation_cases.sample(
            n=min(max_violation_cases, len(violation_cases)), random_state=random_state)

        # Top up the non-violation side to the violation count; never ask for
        # more fillers than exist, and add nothing if it is already larger.
        n_extra = max(0, len(violation_cases) - len(nonviolation_cases))
        n_extra = min(n_extra, len(additional_cases))
        additional_cases = additional_cases.sample(n=n_extra, random_state=random_state)

        improved_train_dbs[art] = pd.concat([violation_cases, nonviolation_cases, additional_cases])

        if debug: print(art, len(db), len(violation_cases),  len(nonviolation_cases), len(additional_cases), len(improved_train_dbs[art]))

    # Write all datasets to csv to reduce memory
    if write:
        os.makedirs(out_dir, exist_ok=True)
        balance_dataset(train_general).to_csv(out_dir + 'train_general.csv')
        test_general.to_csv(out_dir + 'test_general.csv')

        for art, db in train_dbs.items():
            balance_dataset(db).to_csv(out_dir + 'train_dbs' + str(art) + '.csv')

        for art, db in improved_train_dbs.items():
            balance_dataset(db).to_csv(out_dir + 'improved_train_dbs' + str(art) + '.csv')

        del train_dbs, improved_train_dbs, train_general, test_general
        gc.collect()

        return None, None, None, None

    return train_dbs, improved_train_dbs, train_general, test_general

def read_ensemble_dataset(db_name):
    """Load one of the datasets stored under ``results/ensemble/temp/``.

    Args:
        db_name: ``'train_general'`` or ``'test_general'`` to get the CSV as a
            DataFrame with rows containing missing values dropped;
            ``'train_dbs'`` or ``'improved_train_dbs'`` to get a dict mapping
            each article number to the *path* of its CSV (the files themselves
            are not opened here).

    Returns:
        A DataFrame or a ``{article: path}`` dict, depending on ``db_name``.

    Raises:
        ValueError: If ``db_name`` is not one of the four known names. The old
            code silently returned ``None`` in that case.
    """
    base_dir = 'results/ensemble/temp/'
    article_numbers = ['2', '3', '5', '6', '8', '10', '11', '13', '14']

    if db_name in ('train_general', 'test_general'):
        return pd.read_csv(base_dir + db_name + '.csv').dropna()

    if db_name in ('train_dbs', 'improved_train_dbs'):
        return {art: base_dir + db_name + str(art) + '.csv' for art in article_numbers}

    raise ValueError(
        "Unknown db_name %r; expected 'train_general', 'test_general', "
        "'train_dbs' or 'improved_train_dbs'" % (db_name,))

In [6]:
def run_experiment(model, model_name, random_state, provided_dbs=None):
    """Train the general model and both ensembles, then log their scores.

    Args:
        model: Forwarded unchanged to ``train_test`` / ``train_test_ensemble``.
        model_name: Used in the prediction-file paths and in the ``model`` column.
        random_state: Seed identifying this run; used in file names and in the
            ``random_state`` column.
        provided_dbs: Optional dict with the keys ``train_dbs``,
            ``improved_train_dbs``, ``train_general`` and ``test_general``.
            When omitted, the datasets are read through ``read_ensemble_dataset``,
            so ``generate_ensemble_dataset(..., write=True)`` must already have
            been run for this ``random_state``.

    Both training helpers are expected to return ``(metrics, predictions_frame)``.

    Side effects:
        Writes one predictions CSV per variant below
        ``results/ensemble/full_preds/<model_name>/`` and appends one row to
        ``results/ensemble/ensemble_general_bert.csv``.

    Returns:
        ``0`` on completion.
    """
    if not provided_dbs:
        print('\tNo provided data, reading from files')
        # Previously this branch regenerated all datasets in memory and then
        # called read_ensemble_dataset() without its required argument.
        train_dbs = read_ensemble_dataset('train_dbs')
        improved_train_dbs = read_ensemble_dataset('improved_train_dbs')
        train_general = read_ensemble_dataset('train_general')
        test_general = read_ensemble_dataset('test_general')
    else:
        train_dbs = provided_dbs['train_dbs']
        improved_train_dbs = provided_dbs['improved_train_dbs']
        train_general = provided_dbs['train_general']
        test_general = provided_dbs['test_general']

    def _save_predictions(frame, variant):
        """Write one variant's per-case predictions, creating its folder if needed."""
        out_dir = 'results/ensemble/full_preds/' + model_name + '/' + variant + '/'
        os.makedirs(out_dir, exist_ok=True)
        frame.to_csv(out_dir + 'full_preds_' + variant + '_' + str(random_state) + '.csv')

    # General
    print('\tTraining general classifier')
    (acc_g, mcc_g, f1_g), full_preds = train_test(model, train_general, test_general)
    _save_predictions(full_preds, 'general')

    # Ensemble
    print('\tTraining ensemble')
    (acc_e, mcc_e, f1_e), full_preds = train_test_ensemble(model, train_dbs, test_general)
    _save_predictions(full_preds, 'ensemble')

    # Improved ensemble
    print('\tTraining improved ensemble')
    (acc_e2, mcc_e2, f1_e2), full_preds = train_test_ensemble(model, improved_train_dbs, test_general)
    _save_predictions(full_preds, 'improved_ensemble')

    print('\tWriting data')
    field_names = ['model', 'random_state', 'accuracy_general', 'MCC_general', 'F1_general',
                   'accuracy_ensemble', 'MCC_ensemble', 'F1_ensemble',
                   'accuracy_improved_ensemble', 'MCC_improved_ensemble', 'F1_improved_ensemble',
                   'alpha',  'training_size', 'test_size', 'train_distribution', 'test_distribution']

    dct = {
        'model': model_name,
        'random_state': random_state,
        'accuracy_general': acc_g,
        'MCC_general': mcc_g,
        'F1_general': f1_g,
        'accuracy_ensemble': acc_e,
        'MCC_ensemble': mcc_e,
        'F1_ensemble': f1_e,
        'accuracy_improved_ensemble': acc_e2,
        'MCC_improved_ensemble': mcc_e2,
        'F1_improved_ensemble': f1_e2,
        # `best_alpha` is not defined anywhere in this notebook. The column is
        # kept for schema compatibility and left blank unless some import
        # happens to provide the name.
        'alpha': globals().get('best_alpha'),
        'training_size': len(train_general),
        'test_size': len(test_general),
        'train_distribution': round(train_general['violation'].mean()*100,2),
        'test_distribution': round(test_general['violation'].mean()*100,2)
         }

    filename = 'results/ensemble/ensemble_general_bert.csv'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    file_exists = os.path.isfile(filename)
    # newline='' is what the csv module asks for; the with-block closes the file.
    with open(filename, 'a', newline='') as f_object:
        dictwriter_object = DictWriter(f_object, fieldnames=field_names)
        if not file_exists:
            dictwriter_object.writeheader()  # file doesn't exist yet, write a header
        dictwriter_object.writerow(dct)
    return 0

In [None]:
# Driver: for each seed 1..10, regenerate the datasets on disk, then run the
# BERT experiment, which picks them up from those files.
provided_dbs = None
debug = False
for random_state in range(1, 11):
    print('Random_state:', random_state)
    # write=True persists the datasets as CSV; the four returned values are not needed.
    generate_ensemble_dataset(json_path, random_state, write=True)

    print('BERT')
    run_experiment(None, 'BERT', random_state, provided_dbs=provided_dbs)