# ERROR ANALYSIS

Understanding where our model fails to predict correctly is often the key to building more accurate models.

## IMPORTS

In [21]:
import os, joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# TORCH
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import transformers
from transformers import BertTokenizer

## UTILS

In [22]:
class AverageMeter:
    """
    Running weighted mean of a stream of values.

    Attributes
    ----------
    val   : the value most recently passed to ``update``
    sum   : accumulated ``val * n`` over all updates
    count : accumulated ``n`` over all updates
    avg   : ``sum / count`` after the latest update
    """
    def __init__(self):
        # Start from a clean slate; all statistics live in reset().
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

ERROR! Session/line number was not unique in database. History logging moved to new session 1143


In [23]:
import re

def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` token deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_numbers(text):
    """Return ``text`` with every character for which ``str.isdigit`` is true dropped."""
    return ''.join(ch for ch in text if not ch.isdigit())

def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted."""
    return re.sub(r'<.*?>', r'', text)

def remove_username(text):
    """Return ``text`` with every ``@handle`` (letters, digits, underscore) deleted."""
    return re.sub(r'@[A-Za-z0-9_]+', r'', text)

def feature_engineering(text):
    """
    Clean one raw text: strip URLs, digits, HTML tags and @usernames (in that
    order), then collapse every run of whitespace into a single space.
    """
    cleaning_steps = (remove_URL, remove_numbers, remove_html, remove_username)
    for step in cleaning_steps:
        text = step(text)
    # split() with no argument also trims leading/trailing whitespace
    tokens = text.split()
    return " ".join(tokens)

In [24]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  (pass None to leave the default numeric ticks)

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row with no samples is shown as zeros)

    Returns
    -------
    None. The figure is displayed with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Accuracy must be computed on the raw counts, i.e. before any normalisation.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise BEFORE drawing so the heat-map colours, the printed numbers and the
    # text-colour threshold all refer to the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # `where` skips rows with no samples; they keep the zeros from `out`
        # instead of turning into NaN through a division by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the number stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()

## CONFIG

In [25]:
# Folder holding train.csv / test.csv. Set the NLP_PIPELINE_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell; the
# default reproduces the original hard-coded location.
DATA_DIR = os.environ.get(
    "NLP_PIPELINE_DATA_DIR",
    "D:/Documents/GitHub/nlp-pipeline/data/tweet_disaster",
).rstrip("/\\")

# Single place for everything a reader might want to tune.
config = {
    "TRAIN_PATH" : f"{DATA_DIR}/train.csv",
    "TEST_PATH" : f"{DATA_DIR}/test.csv",
    "TEXT_VAR" : "text",          # column holding the raw text
    "TARGET_VAR" : "target",      # column holding the label
    "DEVICE" : torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
    "MAX_LEN" : 160,              # every text is padded / truncated to this many tokens
    "N_CLASS" : 2,
    "MODEL_NAME" : "DISTILBERT",
    "TASK" : "CLASSIFICATION",    # "CLASSIFICATION" or "REGRESSION"
    "EPOCHS" : 5,
    # NOTE(review): 1e-3 looks high for fine-tuning a pretrained transformer
    # (values around 2e-5..5e-5 are more usual) -- confirm before trusting results.
    "LR" : 0.001
}

## LOADING DATA

### DATASET CLASS

In [26]:
class NLP_DATASET(Dataset):
    """
    Torch dataset that tokenizes one text per item for transformer models.

    Arguments
    ---------
    model_name:  name of the model the data is meant for (stored, not used here)
    task:        "CLASSIFICATION" (labels -> torch.long) or
                 "REGRESSION" (labels -> torch.float32)
    text:        sequence of texts (list, numpy array or pandas Series)
    max_len:     every encoded text is padded / truncated to this length
    labels:      optional sequence of labels aligned by POSITION with `text`
    tokenizer:   object exposing a Hugging Face style `encode_plus`
    feature_eng: optional callable applied to each text before tokenization

    Each item is a dict with 'ids', 'masks', 'token_type_ids' and, when labels
    were given, 'labels'.
    """
    def __init__(self, model_name, task, text, max_len, labels=None, tokenizer=None, feature_eng=None):
        self.model_name = model_name
        self.task = task
        self.text = text
        self.labels = labels
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.feature_eng = feature_eng

    # RETURN THE LENGTH OF THE DATASET
    def __len__(self):
        return len(self.text)

    @staticmethod
    def _row(values, index):
        """
        Return the element at POSITION `index`.

        A pandas Series coming out of train_test_split keeps its original,
        non-contiguous index labels, so `values[index]` would look up a label
        (KeyError, or the wrong row). `.iloc` forces positional access; plain
        lists and arrays are indexed directly.
        """
        if hasattr(values, "iloc"):
            return values.iloc[index]
        return values[index]

    # FUNCTION THAT RETURNS ONE DATAPOINT (INPUT + LABEL)
    def __getitem__(self, index):
        # FAIL EARLY WITH A CLEAR MESSAGE INSTEAD OF A NameError FURTHER DOWN
        if self.tokenizer is None:
            raise ValueError("NLP_DATASET needs a tokenizer to encode the text")

        # ONE ROW OF TEXT DATA (POSITIONAL LOOKUP)
        text = str(self._row(self.text, index))
        # USING FEATURE_ENG FUNCTION TO PRE PROCESS TEXT
        if self.feature_eng is not None:
            text = self.feature_eng(text)
        # USING TOKENIZERS ENCODING TO GET TEXT DATA IN CORRECT FORMAT
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )

        # GETTING ALL DATA NEEDED FOR TRANSFORMERS TRAINING
        # DISTILBERT & ROBERTA DON'T NEED TOKEN_TYPE_IDS
        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'masks': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }

        if self.labels is not None:
            # LABELS DATA TYPE DEPENDING ON TASK
            if self.task == "CLASSIFICATION":
                label_dtype = torch.long
            elif self.task == "REGRESSION":
                label_dtype = torch.float32
            else:
                raise ValueError(
                    f"Unknown task {self.task!r}: expected 'CLASSIFICATION' or 'REGRESSION'"
                )
            item['labels'] = torch.tensor(self._row(self.labels, index), dtype=label_dtype)

        return item

### MODEL CLASS

In [27]:
# Pretrained 'bert-base-uncased' tokenizer with lower-casing enabled; it is handed to
# NLP_DATASET below to encode every text.
# NOTE(review): the model trained in this notebook is DistilBERT -- presumably both
# checkpoints share the same vocabulary; confirm against the checkpoint in use.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [28]:
class DISTILBERT(torch.nn.Module):
    """
    DistilBERT encoder followed by dropout and one linear head.

    Arguments
    ---------
    task:              "REGRESSION" (head with 1 output) or
                       "CLASSIFICATION" (head with `n_class` outputs)
    model_config_path: name or path given to DistilBertModel.from_pretrained
    n_class:           number of classes, only used for classification

    Raises
    ------
    ValueError if `task` is not one of the two supported values.
    """
    def __init__(self, task, model_config_path, n_class=2):
        super(DISTILBERT, self).__init__()
        # Validate the task before loading any weights so a typo fails fast with a
        # clear message instead of an AttributeError on `self.l0` later on.
        if task == "REGRESSION":
            out_features = 1
        elif task == "CLASSIFICATION":
            out_features = n_class
        else:
            raise ValueError(
                f"Unknown task {task!r}: expected 'REGRESSION' or 'CLASSIFICATION'"
            )
        self.distilbert = transformers.DistilBertModel.from_pretrained(model_config_path)
        self.drop = nn.Dropout(0.3)
        # Read the hidden width from the loaded config (768 for the base model)
        # rather than hard-coding it, so other DistilBERT sizes work as well.
        self.l0 = nn.Linear(self.distilbert.config.dim, out_features)
        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self, ids, mask):
        """Return the head output computed from the first token's hidden state."""
        output = self.distilbert(ids, mask)
        hidden_state = output[0]
        # position 0 along the second axis is used as the pooled representation
        pooled = hidden_state[:, 0]
        out = self.drop(pooled)
        out = self.l0(out)
        return out

In [29]:
# Read the training CSV and pull out the text column and the label column.
df = pd.read_csv(config["TRAIN_PATH"])
# NOTE(review): `text` stays a pandas Series while `target` becomes a numpy array
# (.values). After the split the text Series presumably keep their original,
# non-contiguous index labels -- anything indexing them by position should be
# checked against that.
text = df[config["TEXT_VAR"]]
target = df[config["TARGET_VAR"]].values

# SPLIT A TRAINING & A VALIDATION SET
# 20% of the rows go to validation; random_state fixes the split.
train_text, valid_text, train_labels, valid_labels = train_test_split(text, target, test_size=0.2, random_state=95)

## TRAINING A MODEL

### TRAINER

In [30]:
#################
# TRAINER CLASS #
#################
class TRAINER:
    '''
    training_step trains the model for one epoch
    eval_step evaluates the current model on validation data and returns the
    average loss and the average of the given evaluation metric

    Arguments
    ---------
    model:     torch module; its class name ("BERT", "DISTILBERT" or "ROBERTA")
               decides which inputs are passed to it
    task:      "CLASSIFICATION" or "REGRESSION"
    device:    device the batches are moved to
    optimizer: optimizer stepped in training_step
    criterion: loss called as criterion(output, labels)
    '''
    def __init__(self, model, task, device, optimizer=None, criterion=None):
        self.model = model
        self.task = task
        self.device = device
        self.optimizer = optimizer
        self.criterion = criterion

    def _forward(self, data):
        """
        Move the inputs of one batch to the device and run the model.

        Returns (model output, batch size). Raises ValueError when the model class
        is not one this trainer knows how to call.
        """
        model_name = self.model.__class__.__name__
        # LOADING TEXT TOKENS
        ids = data["ids"].to(self.device)
        masks = data["masks"].to(self.device)
        # BERT REQUIRES TOKEN_TYPE_IDS TOO
        if model_name in ["BERT"]:
            token_type_ids = data["token_type_ids"].to(self.device)
            output = self.model(ids=ids, mask=masks, token_type_ids=token_type_ids)
        elif model_name in ["DISTILBERT", "ROBERTA"]:
            output = self.model(ids=ids, mask=masks)
        else:
            raise ValueError(
                f"Unsupported model class {model_name!r}: "
                "expected 'BERT', 'DISTILBERT' or 'ROBERTA'"
            )
        return output, ids.size(0)

    #################
    # TRAINING STEP #
    #################
    def training_step(self, data_loader):
        """Run one optimisation pass over `data_loader`; returns None."""
        # LOSS AVERAGE
        losses = AverageMeter()
        # MODEL TO TRAIN MODE
        self.model.train()
        # TRAINING LOOP
        tk0 = tqdm(data_loader, total=len(data_loader))
        for data in tk0:
            labels = data["labels"].to(self.device)
            # GETTING PREDICTION FROM MODEL
            self.model.zero_grad()
            output, batch_size = self._forward(data)
            # CALCULATE LOSS
            loss = self.criterion(output, labels)
            # CALCULATE GRADIENTS
            loss.backward()
            self.optimizer.step()
            # UPDATE LOSS
            losses.update(loss.item(), batch_size)
            tk0.set_postfix(loss=losses.avg)

    ###################
    # VALIDATION STEP #
    ###################
    def eval_step(self, data_loader, metric):
        """
        Evaluate the model on `data_loader` without computing gradients.

        `metric` is called as metric(labels, predictions) on numpy arrays; for
        classification the predictions are the argmax over axis 1.

        Returns (average loss as a 0-dim tensor, average metric as a float), both
        weighted by batch size.
        """
        # LOSS & METRIC AVERAGE
        losses = AverageMeter()
        metrics_avg = AverageMeter()
        # MODEL TO EVAL MODE
        self.model.eval()
        # VALIDATION LOOP
        with torch.no_grad():
            tk0 = tqdm(data_loader, total=len(data_loader))
            for data in tk0:
                labels = data["labels"].to(self.device)
                # GETTING PREDICTION FROM MODEL
                output, batch_size = self._forward(data)

                # CALCULATE LOSS & METRICS
                loss = self.criterion(output, labels)

                # CHECK FOR REGRESSION VS CLASSIFICATION
                if self.task == "CLASSIFICATION":
                    output = output.argmax(axis=1)
                output = output.cpu().detach().numpy()
                labels = labels.cpu().detach().numpy()
                metric_value = metric(labels, output)

                losses.update(loss.item(), batch_size)
                # float() accepts both numpy scalars and plain Python floats,
                # whereas `.item()` only exists on the former.
                metrics_avg.update(float(metric_value), batch_size)

                tk0.set_postfix(loss=losses.avg)
        print(f"Validation Loss = {losses.avg}")
        # Return the epoch AVERAGE (the value printed above), not the loss of the
        # last batch. It is kept as a 0-dim tensor so callers that treated the
        # previous return value as a tensor keep working.
        return torch.tensor(float(losses.avg), device=self.device), metrics_avg.avg

## TRAINING LOOP

In [31]:
# TRAINING DATASET
train_ds = NLP_DATASET(
    model_name = config["MODEL_NAME"],
    task = config["TASK"],
    text=train_text,
    labels=train_labels,
    max_len = config["MAX_LEN"],
    tokenizer = tokenizer,
    feature_eng = feature_engineering
)
# TRAINING DATALOADER (reshuffled every epoch)
train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=0
)
# VALIDATION DATASET
valid_ds = NLP_DATASET(
    model_name = config["MODEL_NAME"],
    task = config["TASK"],
    text = valid_text,
    labels = valid_labels,
    max_len = config["MAX_LEN"],
    tokenizer = tokenizer,
    feature_eng = feature_engineering
)
# VALIDATION DATALOADER
# No shuffling: evaluation does not benefit from it, and a fixed order keeps the
# batches aligned with valid_text / valid_labels for the error analysis.
valid_loader = torch.utils.data.DataLoader(
    valid_ds,
    batch_size=16,
    shuffle=False,
    num_workers=0
)

In [32]:
# Instantiate the DISTILBERT wrapper from the local checkpoint directory, with the
# task and number of classes taken from the config.
model = DISTILBERT(task=config["TASK"],
                model_config_path="D:/Documents/GitHub/nlp-pipeline/models/DISTILBERT/config/",
                n_class=config["N_CLASS"])
# Move the model to the configured device; as the last expression of the cell the
# returned module is also displayed.
model.to(config["DEVICE"])

DISTILBERT(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_fe

In [33]:
# Loss and evaluation metric for the classification task
criterion = nn.CrossEntropyLoss()
metric = metrics.accuracy_score

# Adam over every model parameter; the scheduler lowers the learning rate when the
# monitored value (the validation loss passed to scheduler.step) stops decreasing.
optimizer = torch.optim.Adam(model.parameters(), lr=config["LR"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

trainer = TRAINER(
    model=model,
    task=config["TASK"],
    device=config["DEVICE"],
    optimizer=optimizer,
    criterion=criterion,
)

In [None]:
# One pass per epoch: train on the training loader, evaluate on the validation
# loader, then let the scheduler react to the validation loss.
for epoch in range(config["EPOCHS"]):
    print(f"Starting epoch number : {epoch}")
    # TRAINING PHASE
    print("Training the model...")
    trainer.training_step(train_loader)
    # VALIDATION PHASE
    print("Evaluating the model...")
    val_loss, metric_value = trainer.eval_step(valid_loader, metric)
    # ReduceLROnPlateau needs the monitored value on every step() call
    scheduler.step(val_loss)
    # METRICS
    print(f"Validation {metric.__name__} = {metric_value}")

## ERROR ANALYSIS

In [None]:
detection_threshold = 0.5

# Score the validation set once, in its original row order, so that row i of
# `valid_oof` belongs to row i of `valid_text` / `valid_labels`. A dedicated,
# unshuffled loader is used for that reason.
oof_loader = torch.utils.data.DataLoader(valid_ds, batch_size=16, shuffle=False, num_workers=0)
model.eval()
oof_batches = []
with torch.no_grad():
    for batch in tqdm(oof_loader, total=len(oof_loader)):
        logits = model(ids=batch["ids"].to(config["DEVICE"]),
                       mask=batch["masks"].to(config["DEVICE"]))
        # softmax turns the two logits into class probabilities
        oof_batches.append(torch.softmax(logits, dim=1).cpu().numpy())

# (n_valid, N_CLASS) probabilities, plus the matching inputs and true labels
valid_oof = np.concatenate(oof_batches)
valid_x = pd.DataFrame(
    {config["TEXT_VAR"]: np.asarray(valid_text)},
    index=getattr(valid_text, "index", None),   # keep the original row ids when available
)
valid_y = np.asarray(valid_labels)

# Hard 0/1 prediction: class 1 when its probability reaches the threshold
valid_pred = (valid_oof[:, 1] >= detection_threshold).astype(int)

### CONFUSION MATRIX

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
# Confusion matrix of true labels vs thresholded predictions, plotted as raw counts
# (normalize=False).
cm = confusion_matrix(valid_y, valid_pred)
plot_confusion_matrix(cm = cm, normalize = False, target_names = ['0', '1'], title = "Confusion Matrix")

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Gather inputs, true label and predictions side by side for inspection.
valid = pd.DataFrame(valid_x)
valid[config["TARGET_VAR"]] = valid_y
# column 1 of valid_oof -- presumably the predicted probability of class 1
valid["preds"] = valid_oof[:, 1]
# thresholded 0/1 prediction
valid["preds_int"] = valid_pred
valid.head()

In [None]:
# Absolute gap between the true label and the predicted score: the larger the
# value, the further the prediction is from the label.
valid["error"] = abs(valid[config["TARGET_VAR"]] - valid["preds"])
valid.head()

Let's take a look at some of the biggest errors the model made

In [None]:
# Largest errors first
sorted_desc = valid.sort_values(by=['error'], ascending=False)

LOOKING AT FALSE POSITIVES

In [None]:
# 20 rows whose true label is 0 with the largest error
sorted_desc[sorted_desc[config["TARGET_VAR"]]==0] [0:20]

NOW FALSE NEGATIVES

In [None]:
# 20 rows whose true label is 1 with the largest error
sorted_desc[sorted_desc[config["TARGET_VAR"]]==1] [0:20]

Conversely, let's look at cases where the model is correct and very confident about it

In [None]:
# Smallest errors first
sorted_asc = valid.sort_values(by=['error'], ascending=True)

TRUE NEGATIVES

In [None]:
# 20 rows whose true label is 0 with the smallest error
sorted_asc[sorted_asc[config["TARGET_VAR"]]==0] [0:20]

TRUE POSITIVES

In [None]:
# 20 rows whose true label is 1 with the smallest error
sorted_asc[sorted_asc[config["TARGET_VAR"]]==1] [0:20]

Analyzing model errors by the value of a categorical variable

In [None]:
def plot_confusion_by_group(frame, group_col, group_values, target_col, pred_col="preds_int"):
    """
    Plot one confusion matrix (raw counts) per value of a grouping column.

    Arguments
    ---------
    frame:        DataFrame holding the true labels, the 0/1 predictions and
                  the grouping column
    group_col:    name of the column to slice on
    group_values: values of `group_col` to plot, one figure each
    target_col:   name of the true-label column
    pred_col:     name of the 0/1 prediction column

    Raises
    ------
    KeyError if `group_col` is not a column of `frame`.
    """
    if group_col not in frame.columns:
        raise KeyError(
            f"column {group_col!r} not found; available columns: {list(frame.columns)}"
        )
    for value in group_values:
        subset = frame[frame[group_col] == value]
        if subset.empty:
            # nothing to plot, and accuracy would be 0/0
            print(f"No validation rows with {group_col} == {value}, skipping")
            continue
        # labels=[0, 1] keeps the matrix 2x2 even when the subset contains a single
        # class, so it always matches the two tick labels below.
        group_cm = confusion_matrix(subset[target_col], subset[pred_col], labels=[0, 1])
        plot_confusion_matrix(cm = group_cm, normalize = False, target_names = ['0', '1'],
                              title = f"Confusion Matrix ({group_col} == {value})")

# NOTE(review): no cell of this notebook creates a "cat16" column in `valid` --
# confirm the column to slice on before running this cell.
plot_confusion_by_group(valid, "cat16", [0, 1, 2, 3], target_col=config["TARGET_VAR"])