[![Fixel Algorithms](https://fixelalgorithms.co/images/CCExt.png)](https://fixelalgorithms.gitlab.io)

# AI for System Engineers and Project Managers

## Deep Learning - NLP - Text Classification

Applying _Text Classification_ (News category classification) using a Hugging Face model.

<!-- https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb -->

> Notebook by:
> - Royi Avital RoyiAvital@fixelalgorithms.com

## Revision History

| Version | Date       | User        |Content / Changes                                                   |
|---------|------------|-------------|--------------------------------------------------------------------|
| 1.0.000 | 08/03/2025 | Royi Avital | First version                                                      |

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FixelAlgorithmsTeam/FixelCourses/blob/master/AIProgram/2024_02/0037FeaturesTransform.ipynb)

In [None]:
# Import Packages

# General Tools
import numpy as np
import scipy as sp
import pandas as pd

# Machine Learning
from sklearn.model_selection import train_test_split

# Deep Learning
import clip
import torch
from torch.optim.lr_scheduler import LRScheduler
import torch.nn as nn
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

from torchmetrics.classification import MulticlassF1Score, MulticlassAccuracy

import transformers
from transformers import DistilBertModel, DistilBertTokenizer

# Image Processing

# Miscellaneous
from enum import auto, Enum, unique
import os
import onedrivedownloader #<! https://github.com/loribonna/onedrivedownloader
from platform import python_version
import random
import time

# Typing
from typing import Callable, Dict, List, Optional, Self, Set, Tuple, Union

# Visualization
import matplotlib.pyplot as plt

# Jupyter
from IPython import get_ipython

## Notations

* <font color='red'>(**?**)</font> Question to answer interactively.
* <font color='blue'>(**!**)</font> Simple task to add code for the notebook.
* <font color='green'>(**@**)</font> Optional / Extra self practice.
* <font color='brown'>(**#**)</font> Note / Useful resource / Food for thought.

Code Notations:

```python
someVar    = 2; #<! Notation for a variable
vVector    = np.random.rand(4) #<! Notation for 1D array
mMatrix    = np.random.rand(4, 3) #<! Notation for 2D array
tTensor    = np.random.rand(4, 3, 2, 3) #<! Notation for nD array (Tensor)
tuTuple    = (1, 2, 3) #<! Notation for a tuple
lList      = [1, 2, 3] #<! Notation for a list
dDict      = {1: 3, 2: 2, 3: 1} #<! Notation for a dictionary
oObj       = MyClass() #<! Notation for an object
dfData     = pd.DataFrame() #<! Notation for a data frame
dsData     = pd.Series() #<! Notation for a series
hObj       = plt.Axes() #<! Notation for an object / handler / function handler
```

### Code Exercise

 - Single line fill

 ```python
 valToFill = ???
 ```

 - Multi Line to Fill (At least one)

```python
# You need to start writing
?????
```

 - Section to Fill

```python
#===========================Fill This===========================#
# 1. Explanation about what to do.
# !! Remarks to follow / take under consideration.
mX = ???

?????
#===============================================================#
```

In [None]:
# Configuration
# %matplotlib inline

# Seed every random number generator used by the notebook for reproducible runs.
# PyTorch keeps its own generator (Weights initialization, Dropout, `DataLoader` shuffling),
# hence seeding NumPy and `random` alone is not enough.
seedNum = 512
np.random.seed(seedNum)
random.seed(seedNum)
torch.manual_seed(seedNum) #<! PyTorch generator

# Matplotlib default color palette
lMatPltLibclr = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# sns.set_theme() #>! Apply SeaBorn theme

# `True` when the string form of the active IPython shell mentions `google.colab`
runInGoogleColab = 'google.colab' in str(get_ipython())

In [None]:
# Constants

# Plotting defaults
FIG_SIZE_DEF    = (8, 8)
ELM_SIZE_DEF    = 50
CLASS_COLOR     = ('b', 'r')
EDGE_COLOR      = 'k'
MARKER_SIZE_DEF = 10
LINE_WIDTH_DEF  = 2

# Project layout
PROJECT_NAME      = 'FixelCourses'
DATA_FOLDER_PATH  = 'DataSets'
MODEL_FOLDER_PATH = 'Models'

# Base folder: The working directory cut right after the last (Case insensitive) appearance of the project name.
# The search runs on the reversed strings, `find()` returns -1 when there is no match which keeps the whole path.
_currDir    = os.getcwd()
BASE_FOLDER = _currDir[:len(_currDir) - _currDir[::-1].lower().find(PROJECT_NAME.lower()[::-1])]
del _currDir #<! Keep the namespace of the notebook clean

L_IMG_EXT = ['.png', '.jpeg', '.jpg']

# Class names and their integer labels (The label is the index of the name)
L_DATA_CLS = ['Business', 'Entertainment', 'Health', 'Science']
D_DATA_CLS = {clsName: clsIdx for clsIdx, clsName in enumerate(L_DATA_CLS)}


In [None]:
# Courses Packages



In [None]:
# General Auxiliary Functions

class TextClsDataset(Dataset):
    """
    Map style data set of tokenized text and its target label.
    The whole text column is tokenized once at construction, items are converted into tensors on access.
    """
    def __init__( self, dfData: pd.DataFrame, oTokenizer: Callable, dTokenizeParams: Dict, *, textCol: str = 'TITLE', tgtCol: str = 'CATEGORY' ) -> None:
        """
        Tokenizes the text column of the data frame.
        Input:
            dfData          - Data frame which holds the text and target columns.
            oTokenizer      - Callable, called as `oTokenizer(text, None, **dTokenizeParams)`.
                              Its output must support the keys `input_ids` and `attention_mask`.
            dTokenizeParams - Dictionary of keyword arguments forwarded to the tokenizer.
            textCol         - Name of the text column.
            tgtCol          - Name of the target (Label) column.
        """

        lTokenId  = []
        lAtnMask  = []

        for rawText in dfData[textCol].tolist():
            cleanText = ' '.join(rawText.split()) #<! Collapse any run of white space into a single space
            dEncoded  = oTokenizer(cleanText, None, **dTokenizeParams)
            lTokenId.append(dEncoded['input_ids'])
            lAtnMask.append(dEncoded['attention_mask'])

        self._lId        = lTokenId
        self._lMask      = lAtnMask
        self._lTgt       = dfData[tgtCol].tolist()
        self._numSamples = len(dfData)

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Returns the tuple (Token IDs, Attention mask, Target) of the sample `idx` as `torch.long` tensors.
        """

        tTokenId = torch.tensor(self._lId[idx], dtype = torch.long)
        tAtnMask = torch.tensor(self._lMask[idx], dtype = torch.long)
        tTarget  = torch.tensor(self._lTgt[idx], dtype = torch.long)

        return tTokenId, tAtnMask, tTarget

    def __len__(self):
        """
        Number of samples (Rows of the data frame given at construction).
        """

        return self._numSamples

class DistillBERT( torch.nn.Module ):
    """
    Text classifier: A DistilBERT encoder followed by a small classification head.
    The head is `Linear -> ReLU -> Dropout -> Linear` applied to the representation of the first token.
    """
    def __init__(self, oModel: DistilBertModel, numCls: int, *, dropP: float = 0.3) -> None:
        """
        Input:
            oModel  - The (Pre trained) DistilBERT encoder.
            numCls  - Number of classes (Size of the output logits).
            dropP   - Dropout probability of the classification head.
        Remarks:
          - The width of the head is read from `oModel.config.dim` instead of being fixed to 768,
            so encoders with a different hidden size are supported as well.
        """
        super().__init__()

        hiddenDim = oModel.config.dim #<! 768 for the `distilbert-base-*` checkpoints

        self.oModel         = oModel
        self.pre_classifier = torch.nn.Linear(hiddenDim, hiddenDim)
        self.dropout        = torch.nn.Dropout(dropP)
        self.classifier     = torch.nn.Linear(hiddenDim, numCls)

    def forward(self, tInId: torch.Tensor, tAtnMask: torch.Tensor) -> torch.Tensor:
        """
        Input:
            tInId    - Token IDs, forwarded to the encoder as `input_ids`.
            tAtnMask - Attention mask, forwarded to the encoder as `attention_mask`.
        Output:
            The output of the classification layer (Logits, no SoftMax is applied).
        """
        tHidden = self.oModel(input_ids = tInId, attention_mask = tAtnMask)[0] #<! First output of the encoder (Hidden states)
        tPooled = tHidden[:, 0] #<! Keep the first token of each sequence
        tPooled = self.pre_classifier(tPooled)
        tPooled = torch.relu(tPooled) #<! Functional form, no module is built per call
        tPooled = self.dropout(tPooled)

        return self.classifier(tPooled)

def PlotLabelsHistogram( vY: np.ndarray, hA: Optional[plt.Axes] = None, lClass: Optional[List] = None, xLabelRot: Optional[int] = None ) -> plt.Axes:
    """
    Plots a bar chart of the number of samples per unique label.
    Input:
        vY          - Array of labels.
        hA          - Axes to draw on. A new figure is created when `None`.
        lClass      - Tick labels which replace the label values (In the order of the sorted unique labels).
        xLabelRot   - Rotation of the tick labels of the x axis.
    Output:
        hA          - The axes which holds the plot.
    """

    # A new figure is needed only when the caller gave no axes
    if hA is None:
        _, hA = plt.subplots(figsize = (8, 6))

    vClsVal, vClsCount = np.unique(vY, return_counts = True)
    lTickLabel         = [f'{clsVal}' for clsVal in vClsVal]

    hA.bar(vClsVal, vClsCount, width = 0.9, align = 'center')
    hA.set_title('Histogram of Classes / Labels')
    hA.set_xlabel('Class')
    hA.set_xticks(vClsVal, lTickLabel)
    hA.set_ylabel('Count')

    # Optional: Replace the label values by the names of the classes
    if lClass is not None:
        hA.set_xticklabels(lClass)

    # Optional: Rotate the tick labels
    if xLabelRot is not None:
        plt.setp(hA.get_xticklabels(), rotation = xLabelRot)

    return hA


In [None]:
@unique
class NNMode(Enum):
    """
    Mode of operation of a model within an epoch.
    """
    TRAIN     = 1 #<! The parameters of the model are updated
    INFERENCE = 2 #<! Evaluation only

def RunEpoch( oModel: nn.Module, dlData: DataLoader, hL: Callable, hS: Callable, oOpt: Optional[Optimizer] = None, opMode: NNMode = NNMode.TRAIN ) -> Tuple[float, float]:
    """
    Runs a single Epoch (Train / Inference) of a model.
    Input:
        oModel      - PyTorch `nn.Module` object, called as `oModel(mX, mA)`.
        dlData      - PyTorch `DataLoader` object.
        hL          - Callable for the Loss function.
        hS          - Callable for the Score function.
        oOpt        - PyTorch `Optimizer` object (Mandatory for `NNMode.TRAIN`).
        opMode      - An `NNMode` to set the mode of operation.
    Output:
        valLoss     - Scalar of the loss (Averaged over the samples of the epoch).
        valScore    - Scalar of the score (Averaged over the samples of the epoch).
    Raises:
        ValueError  - The `opMode` is not supported, the optimizer is missing in
                      train mode or the data loader yielded no samples.
    Remarks:
      - The `dlData` object yields a Tuple of (mX, mA, vY) per batch. The first 2 are
        the inputs of the model, the 3rd is the reference target.
      - The `hL` function is called as `hL(mZ, vY)` where `mZ` is the output of the model.
        It should return a scalar tensor which supports `backward()` and `item()`.
      - The `hS` function is called as `hS(mZ, vY)`.
        It should return a scalar which supports `item()`.
      - The batch values are weighted by the batch size so each sample has the same weight.
    """

    # Fail early with a clear message instead of an `AttributeError` on `None` inside the loop
    if (opMode == NNMode.TRAIN) and (oOpt is None):
        raise ValueError('The optimizer `oOpt` must be provided when `opMode` is `NNMode.TRAIN`!')

    epochLoss   = 0.0
    epochScore  = 0.0
    numSamples  = 0
    numBatches  = len(dlData)

    runDevice = next(oModel.parameters()).device #<! CPU \ GPU

    if opMode == NNMode.TRAIN:
        oModel.train(True) #<! Equivalent of `oModel.train()`
    elif opMode == NNMode.INFERENCE:
        oModel.eval() #<! Equivalent of `oModel.train(False)`
    else:
        raise ValueError(f'The `opMode` value {opMode} is not supported!')

    modeStr = 'Train' if opMode == NNMode.TRAIN else 'Val' #<! Loop invariant

    for ii, (mX, mA, vY) in enumerate(dlData):
        # Move Data to Model's device
        mX = mX.to(runDevice) #<! Lazy
        mA = mA.to(runDevice) #<! Lazy
        vY = vY.to(runDevice) #<! Lazy

        batchSize = mX.shape[0]

        if opMode == NNMode.TRAIN:
            # Forward
            mZ      = oModel(mX, mA) #<! Model output
            valLoss = hL(mZ, vY)     #<! Loss

            # Backward
            oOpt.zero_grad()    #<! Set gradients to zeros
            valLoss.backward()  #<! Backward
            oOpt.step()         #<! Update parameters
        else: #<! Value of `opMode` was already validated
            with torch.no_grad():
                # No computational graph
                mZ      = oModel(mX, mA) #<! Model output
                valLoss = hL(mZ, vY)     #<! Loss

        with torch.no_grad():
            # Score
            valScore  = hS(mZ, vY)
            batchLoss = valLoss.item() #<! Extracted once, used for the sum and the display
            # Normalize so each sample has the same weight
            epochLoss  += batchSize * batchLoss
            epochScore += batchSize * valScore.item()
            numSamples += batchSize

        print(f'\r{modeStr} - Iteration: {(ii + 1):3d} / {numBatches}, loss: {batchLoss:.6f}', end = '')

    print('', end = '\r')

    # An empty loader (For instance `drop_last = True` with less samples than the batch size) has no average
    if numSamples == 0:
        raise ValueError('The data loader `dlData` yielded no samples, the loss and the score of the epoch are undefined!')

    return epochLoss / numSamples, epochScore / numSamples

def TrainModel( oModel: nn.Module, dlTrain: DataLoader, dlVal: DataLoader, oOpt: Optimizer, numEpoch: int, hL: Callable, hS: Callable, *, oSch: Optional[LRScheduler] = None, oTBWriter: Optional[SummaryWriter] = None) -> Tuple[nn.Module, List, List, List, List, List]:
    """
    Trains a model given train and validation data loaders.
    Input:
        oModel      - PyTorch `nn.Module` object.
        dlTrain     - PyTorch `DataLoader` object (Training).
        dlVal       - PyTorch `DataLoader` object (Validation).
        oOpt        - PyTorch `Optimizer` object.
        numEpoch    - Number of epochs to run.
        hL          - Callable for the Loss function.
        hS          - Callable for the Score function.
        oSch        - PyTorch `Scheduler` (`LRScheduler`) object, stepped once per epoch.
        oTBWriter   - PyTorch `SummaryWriter` object (TensorBoard).
    Output:
        oModel      - The model at the end of the last epoch.
        lTrainLoss  - List of the train loss per epoch.
        lTrainScore - List of the train score per epoch.
        lValLoss    - List of the validation loss per epoch.
        lValScore   - List of the validation score per epoch.
        lLearnRate  - List of the learning rate per epoch.
    Remarks:
      - The loaders, `hL` and `hS` are used as is by `RunEpoch()`, see its requirements.
      - The score is assumed to be "higher is better".
      - On each improvement of the validation score a checkpoint (Model, optimizer and
        scheduler states) is saved to `BestModel.pt` in the working directory.
        The checkpoint is not loaded back, the returned model is the one of the last epoch.
    """

    lTrainLoss  = []
    lTrainScore = []
    lValLoss    = []
    lValScore   = []
    lLearnRate  = []

    # Scores may be negative (For instance R2), hence start from minus infinity
    bestScore = float('-inf') #<! Assuming higher is better

    learnRate = oOpt.param_groups[0]['lr']

    for ii in range(numEpoch):
        startTime           = time.time()
        trainLoss, trainScr = RunEpoch(oModel, dlTrain, hL, hS, oOpt, opMode = NNMode.TRAIN) #<! Train
        valLoss,   valScr   = RunEpoch(oModel, dlVal, hL, hS, opMode = NNMode.INFERENCE) #<! Score Validation (No optimizer needed)
        if oSch is not None:
            # Adjusting the scheduler on Epoch level
            learnRate = oSch.get_last_lr()[0] #<! Read before the step: The rate used in this epoch
            oSch.step()
        epochTime           = time.time() - startTime

        # Aggregate Results
        lTrainLoss.append(trainLoss)
        lTrainScore.append(trainScr)
        lValLoss.append(valLoss)
        lValScore.append(valScr)
        lLearnRate.append(learnRate)

        if oTBWriter is not None:
            oTBWriter.add_scalars('Loss (Epoch)', {'Train': trainLoss, 'Validation': valLoss}, ii)
            oTBWriter.add_scalars('Score (Epoch)', {'Train': trainScr, 'Validation': valScr}, ii)
            oTBWriter.add_scalar('Learning Rate', learnRate, ii)

        # Display (Babysitting)
        print('Epoch '              f'{(ii + 1):4d} / ' f'{numEpoch}', end = '')
        print(' | Train Loss: '     f'{trainLoss          :6.3f}', end = '')
        print(' | Val Loss: '       f'{valLoss            :6.3f}', end = '')
        print(' | Train Score: '    f'{trainScr           :6.3f}', end = '')
        print(' | Val Score: '      f'{valScr             :6.3f}', end = '')
        print(' | Epoch Time: '     f'{epochTime          :5.2f}', end = '')

        # Save best model ("Early Stopping")
        if valScr > bestScore:
            bestScore = valScr
            try:
                dCheckPoint = {'Model': oModel.state_dict(), 'Optimizer': oOpt.state_dict()}
                if oSch is not None:
                    dCheckPoint['Scheduler'] = oSch.state_dict()
                torch.save(dCheckPoint, 'BestModel.pt')
                print(' | <-- Checkpoint!', end = '')
            except Exception as savingError:
                # Best effort: A failed save must not stop the training.
                # `Exception` (Not a bare `except`) keeps `KeyboardInterrupt` / `SystemExit` working.
                print(f' | <-- Failed! ({type(savingError).__name__})', end = '')
        print(' |')

    # Load best model ("Early Stopping")
    # dCheckPoint = torch.load('BestModel.pt')
    # oModel.load_state_dict(dCheckPoint['Model'])

    return oModel, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate

## Text Classification

Text classification predicts the class of a given text input.  
The text is pre processed by a tokenizer.

### Bert Model

The _Bert_ (Bidirectional Encoder Representations from Transformers) model was one of the first models to incorporate the Transformers building block.  
Its introduction, in 2018, is notable for its dramatic improvement over previous state of the art models, and as an early example of a large language model.

![](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/BERT_embeddings_01.png/799px-BERT_embeddings_01.png)

The model is based on _Encoder Only_ transformer architecture.

At a high level, BERT consists of 4 modules:

 - Tokenizer: This module converts a piece of English text into a sequence of integers (Tokens).
 - Embedding: This module converts the sequence of tokens into an array of real valued vectors representing the tokens.
 - Encoder: A stack of Transformer blocks with self attention.
 - Task Head: This module converts the final representation vectors into one hot encoded tokens again by producing a predicted probability distribution over the token types.  
   It can be viewed as a simple decoder, decoding the latent representation into token types.

Applications:

 - Text Classification.
 - Text Embedding.

* <font color='brown'>(**#**)</font> [A Primer in BERTology: What We Know About How BERT Works](https://arxiv.org/abs/2002.12327).

In [None]:
# Parameters

# Data
datasetName = 'NewsAggregator'
datasetUrl  = 'https://technionmail-my.sharepoint.com/:u:/g/personal/royia_technion_ac_il/ERVmszLTXR5IiSLYQt3qBYYBvFEE6nIcfZkjhms03KUREA?e=E3JKKR'
datasetFile = 'NewsAggregator.csv'

# Pre Processing

# Training
# Sizes for a run on the larger split (Enable instead of the reduced sizes below):
# numSamplesTrain, numSamplesVal = 370_000, 52_419
numSamplesTrain, numSamplesVal = 1024, 64 #<! Reduced sizes for a quick run
batchSize = 32
numEpochs = 30

# Model
modelName = 'DistilBERTModel'
numCls    = 4
maxLen    = 512
# Keyword arguments forwarded to the tokenizer on each call
dTokenParams = dict(
    add_special_tokens    = True,
    max_length            = maxLen,
    padding               = 'max_length',
    return_token_type_ids = True,
    truncation            = True,
)

# Data Visualization

# Data Visualization


## Generate / Load Data

Data is based on the [UC Irvine Machine Learning Repository - News Aggregator](https://archive.ics.uci.edu/dataset/359/news+aggregator).



In [None]:
# Verify Data is Available

# The folder expected to hold the extracted data set
dataSetPath = os.path.join(BASE_FOLDER, DATA_FOLDER_PATH, datasetName)

if not os.path.isdir(dataSetPath):
    # Download, unzip and remove ZIP file
    # The ZIP file is saved next to the data set folder under the same name
    onedrivedownloader.download(
        datasetUrl,
        dataSetPath + '.zip',
        unzip = True,
        clean = True,
    )

In [None]:
# Data Set

# The file is tab separated. The column names are given explicitly by `names`.
# Local alternative (Requires the downloaded data set):
# dfRawData = pd.read_csv(os.path.join(dataSetPath, datasetFile), sep = '\t', names = ['ID','TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
dfRawData = pd.read_csv(
    'https://github.com/darcien/ML/raw/refs/heads/master/text-classification-attempt/data/newsCorpora.csv',
    sep   = '\t',
    names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
)
dfRawData.head()

* <font color='red'>(**?**)</font> Should the `URL` data be kept for training?

In [None]:
# Data for Training

# Keep the title and the category only, the single letter codes are replaced by the class names.
# `assign()` returns a new frame, hence `dfRawData` is left untouched.
dfData = dfRawData[['TITLE', 'CATEGORY']].assign(
    CATEGORY = lambda dfX: dfX['CATEGORY'].map({'b': 'Business', 'e': 'Entertainment', 'm': 'Health', 't': 'Science'})
)
numSamples = dfData.shape[0]

print(f'The number of training samples: {numSamples}')


In [None]:
# Data for Training
# Bare expression: Displayed by the notebook as the output of the cell
dfData

### Plot Data

In [None]:
# Plot the Data

# Encode the class names as integer labels (`D_DATA_CLS`).
# The guard makes the cell safe to run more than once: Mapping the already
# encoded (Integer) labels again would turn the whole column into NaN.
if not pd.api.types.is_integer_dtype(dfData['CATEGORY']):
    dfData['CATEGORY'] = dfData['CATEGORY'].map(D_DATA_CLS)

hA = PlotLabelsHistogram(dfData['CATEGORY'].values, lClass = L_DATA_CLS, xLabelRot = 45)

* <font color='red'>(**?**)</font> Is the data balanced?

In [None]:
# Mapped Data
# First rows of the frame after the `CATEGORY` column was mapped by `D_DATA_CLS`
dfData.head()

## Load Model

Loading the pre trained DistilBERT tokenizer and model from Hugging Face.

In [None]:
# Check GPU Availability

# Use the 1st CUDA device when available, otherwise fall back to the CPU
if torch.cuda.is_available():
    runDevice = torch.device('cuda:0')
else:
    runDevice = torch.device('cpu')

In [None]:
# Models

# The tokenizer and the model must be loaded from the same checkpoint.
# The cased and uncased checkpoints have different vocabularies, so mixing them
# feeds the model token IDs which point to the wrong rows of its embedding.
hfModelId = 'distilbert-base-uncased'
hfCacheDir = os.path.join(BASE_FOLDER, MODEL_FOLDER_PATH, modelName) #<! Local cache of the downloaded files

oTokenizer = DistilBertTokenizer.from_pretrained(hfModelId, cache_dir = hfCacheDir)
oModel     = DistilBertModel.from_pretrained(hfModelId, cache_dir = hfCacheDir)

In [None]:
# DistillBERT Model

# Wrap the pre trained encoder with the classification head (One output per class)
oModelBERT = DistillBERT(oModel = oModel, numCls = numCls)

In [None]:
# Train and Validation Split
# Stratified by the label so both subsets keep the class proportions of the data.
# Absolute sizes are used, the rest of the samples are left out.
dfTrain, dfVal = train_test_split(
    dfData,
    train_size   = numSamplesTrain,
    test_size    = numSamplesVal,
    stratify     = dfData['CATEGORY'],
    random_state = seedNum,
)

In [None]:
# Bare expression: The training subset is displayed by the notebook as the output of the cell
dfTrain

In [None]:
# Data for Training

# The text is tokenized once when the data set is built.
# The same tokenizer and parameters are used for both subsets.
dsTrain = TextClsDataset(dfData = dfTrain, oTokenizer = oTokenizer, dTokenizeParams = dTokenParams)
dsVal   = TextClsDataset(dfData = dfVal, oTokenizer = oTokenizer, dTokenizeParams = dTokenParams)


In [None]:
# Data Loaders
# Train: Shuffled on each epoch, the last partial batch is dropped to keep the batch size fixed.
dlTrain = DataLoader(dsTrain, shuffle = True, batch_size = batchSize, drop_last = True)
# Validation: No shuffling and no dropping. Each sample is scored on each epoch,
# the batches are the same between epochs and the loader is not left empty
# when the number of samples is smaller than the batch size.
# The batch is larger as no gradients are kept in inference.
dlVal   = DataLoader(dsVal, shuffle = False, batch_size = 2 * batchSize, drop_last = False)

In [None]:
# Iterate on the Loader
# Pull a single batch (The first) to verify the dimensions of its items.
itrTrain   = iter(dlTrain)
tX, tA, tY = next(itrTrain) #<! PyTorch Tensors: Token IDs, attention masks, labels

print(f'The batch features dimensions: {tX.shape}')
print(f'The batch masks dimensions: {tA.shape}')
print(f'The batch labels dimensions: {tY.shape}')

## Model Training

In [None]:
# Set Device


In [None]:
# Loss Functions
# Loss: Cross entropy. Score: Macro averaged accuracy over `numCls` classes.
# Both objects are moved to the device used for the run.
hL = nn.CrossEntropyLoss().to(runDevice)
# hS = MulticlassF1Score(num_classes = numCls, average = 'macro').to(runDevice)
hS = MulticlassAccuracy(num_classes = numCls, average = 'macro').to(runDevice)


In [None]:
# Training Loop

oModelBERT = oModelBERT.to(runDevice)

oOpt = torch.optim.AdamW(
    oModelBERT.parameters(),
    lr           = 1e-5,
    betas        = (0.9, 0.99),
    weight_decay = 1e-5,
) #<! Define optimizer
# `TrainModel()` steps the scheduler once per epoch, hence the number of steps is the number of epochs
oSch = torch.optim.lr_scheduler.OneCycleLR(oOpt, max_lr = 1e-4, total_steps = numEpochs)

tuTrainOut = TrainModel(oModelBERT, dlTrain, dlVal, oOpt, numEpochs, hL, hS, oSch = oSch)
oModelBERT, lTrainLoss, lTrainScore, lValLoss, lValScore, lLearnRate = tuTrainOut

In [None]:
# Training Loop
# Does not work, Find the bug :-)

# numBatchesTrain = len(dlTrain)
# numBatchesVal   = len(dlVal)

# for epochIdx in range(numEpochs):
#     startTime = time.time()

#     epochLoss       = 0
#     epochScore      = 0
#     numSamplesEpoch = 0

#     oModelBERT.train()

#     # Training
#     for ii, (tX, tA, tY) in enumerate(dlTrain):
#         itrBatchSize = tX.shape[0]


#         tX, tA, tY = tX.to(runDevice), tA.to(runDevice), tY.to(runDevice)
#         tYHat = oModelBERT(tX, tA)

#         valLoss = hL(tYHat, tY)
#         oOpt.zero_grad()
#         valLoss.backward() #<! Backward Propagation
#         oOpt.step()

#         epochLoss += itrBatchSize * valLoss.item()

#         with torch.inference_mode():
#             valScore    = hS(tYHat, tY)
#             epochScore += itrBatchSize * valScore.item()

#             numSamplesEpoch += itrBatchSize

#         print(f'\rTraining - Iteration: {(ii + 1):3d} / {numBatchesTrain}, loss: {valLoss:.6f}', end = '')

#     print('', end = '\r')

#     trainLoss  = epochLoss / numSamplesEpoch
#     trainScore = epochScore / numSamplesEpoch

#     epochLoss       = 0
#     epochScore      = 0
#     numSamplesEpoch = 0

#     oModelBERT.eval()

#     # Validation
#     for ii, (tX, tA, tY) in enumerate(dlVal):
#         itrBatchSize = tX.shape[0]

#         tX, tA, tY = tX.to(runDevice), tA.to(runDevice), tY.to(runDevice)

#         with torch.inference_mode():
#             tYHat    = oModelBERT(tX, tA)
#             valLoss  = hL(tYHat, tY)
#             valScore = hS(tYHat, tY)

#         epochLoss       += itrBatchSize * valLoss.item()
#         epochScore      += itrBatchSize * valScore.item()
#         numSamplesEpoch += itrBatchSize

#         print(f'\rValidation - Iteration: {(ii + 1):3d} / {numBatchesVal}, loss: {valLoss:.6f}', end = '')

#     print('', end = '\r')

#     valLoss  = epochLoss / numSamplesEpoch
#     valScore = epochScore / numSamplesEpoch

#     epochTime = time.time() - startTime
#     print('Epoch '              f'{(epochIdx + 1):4d} / ' f'{numEpochs}', end = '')
#     print(' | Train Loss: '     f'{trainLoss          :6.3f}', end = '')
#     print(' | Val Loss: '       f'{valLoss            :6.3f}', end = '')
#     print(' | Train Score: '    f'{trainScore         :6.3f}', end = '')
#     print(' | Val Score: '      f'{valScore           :6.3f}', end = '')
#     print(' | Epoch Time: '     f'{epochTime          :5.2f}', end = '')
#     print(' |')




In [None]:
# Plot Training Phase

hF, vHa = plt.subplots(nrows = 1, ncols = 3, figsize = (12, 5))
vHa = np.ravel(vHa)

# The loss and the score share the layout: Train and validation curves per epoch
lCurveSpec = [
    ('Classification Loss',  'Loss',  lTrainLoss,  lValLoss),
    ('Classification Score', 'Score', lTrainScore, lValScore),
]

for hA, (plotTitle, yLabel, lTrainVal, lValVal) in zip(vHa, lCurveSpec):
    hA.plot(lTrainVal, lw = 2, label = 'Train')
    hA.plot(lValVal, lw = 2, label = 'Validation')
    hA.set_title(plotTitle)
    hA.set_xlabel('Epoch')
    hA.set_ylabel(yLabel)
    hA.legend()

# The learning rate recorded for each epoch
hA = vHa[2]
hA.plot(lLearnRate, lw = 2)
hA.set_title('Learn Rate Scheduler')
hA.set_xlabel('Epoch')
hA.set_ylabel('Learn Rate');

In [None]:
# Inference Mode

# `train(False)` is the explicit form of `eval()`: Layers such as Dropout switch to evaluation behavior.
# The assignment keeps the cell from displaying the returned module.
oModelBERT = oModelBERT.train(False)

In [None]:
# Evaluate on Training and Validation Data