# Multiclass Classification with Deep Learning using BERT

## Exploratory Data Analysis and Preprocessing

In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv("chapter_wise_E_21.csv",
                header=0)

In [3]:
df.head()

Unnamed: 0,Text,Chapter_Name
0,Introduction,Introduction
1,"This chapter provides the outline, purpose, an...",Introduction
2,About the Contents of This Manual,Introduction
3,The manuals provided with this system include ...,Introduction
4,Some of the manuals are separated into volumes...,Introduction


In [4]:
df.Text.iloc[0]

'Introduction'

In [6]:
df.Chapter_Name.value_counts()

Parts Maintenance Management                878
Executing Auto Check                        754
Log Check and Backup                        691
Online Operation                            647
Checking the Equipment Status               359
Changing the Operation Mode                 345
Introduction                                319
Special Port Usage                          296
Manual Transfer                             232
Managing and Setting the User/User Group    213
Auto Setup Execution                        161
Starting and Stopping the System            124
Transmitting Data                            60
Settings to Prevent Mixing of the Gases      49
Name: Chapter_Name, dtype: int64

In [8]:
possible_labels = df.Chapter_Name.unique()
possible_labels

array(['Introduction', 'Starting and Stopping the System',
       'Special Port Usage', 'Checking the Equipment Status',
       'Settings to Prevent Mixing of the Gases',
       'Parts Maintenance Management', 'Changing the Operation Mode',
       'Executing Auto Check', 'Auto Setup Execution', 'Manual Transfer',
       'Log Check and Backup', 'Online Operation', 'Transmitting Data',
       'Managing and Setting the User/User Group'], dtype=object)

In [9]:
labels_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
labels_dict

{'Introduction': 0,
 'Starting and Stopping the System': 1,
 'Special Port Usage': 2,
 'Checking the Equipment Status': 3,
 'Settings to Prevent Mixing of the Gases': 4,
 'Parts Maintenance Management': 5,
 'Changing the Operation Mode': 6,
 'Executing Auto Check': 7,
 'Auto Setup Execution': 8,
 'Manual Transfer': 9,
 'Log Check and Backup': 10,
 'Online Operation': 11,
 'Transmitting Data': 12,
 'Managing and Setting the User/User Group': 13}

In [10]:
predictions_dict = {v: k for k, v in labels_dict.items()}
predictions_dict

{0: 'Introduction',
 1: 'Starting and Stopping the System',
 2: 'Special Port Usage',
 3: 'Checking the Equipment Status',
 4: 'Settings to Prevent Mixing of the Gases',
 5: 'Parts Maintenance Management',
 6: 'Changing the Operation Mode',
 7: 'Executing Auto Check',
 8: 'Auto Setup Execution',
 9: 'Manual Transfer',
 10: 'Log Check and Backup',
 11: 'Online Operation',
 12: 'Transmitting Data',
 13: 'Managing and Setting the User/User Group'}

In [12]:
df["Label"] = df.Chapter_Name.replace(labels_dict)
df.head()

Unnamed: 0,Text,Chapter_Name,Label
0,Introduction,Introduction,0
1,"This chapter provides the outline, purpose, an...",Introduction,0
2,About the Contents of This Manual,Introduction,0
3,The manuals provided with this system include ...,Introduction,0
4,Some of the manuals are separated into volumes...,Introduction,0


## Loading Tokenizer and Encoding our Data

In [13]:
#!pip install transformers

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [14]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [15]:
encoded_data = tokenizer.batch_encode_plus(
    df.Text.values,
    add_special_tokens=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)


input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']
labels = torch.tensor(df.Label.values)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
dataset = TensorDataset(input_ids, attention_masks, labels)

In [17]:
len(dataset)

5128

## Setting up BERT Pretrained Model

In [18]:
from transformers import BertForSequenceClassification

In [19]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels_dict),
    output_attentions=False,
    output_hidden_states=False
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Creating Data Loaders

In [20]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [21]:
batch_size = 32

dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),
    batch_size = batch_size
)

## Setting Up Optimizer and Scheduler

In [22]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [23]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps=1e-8
)

In [24]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader)*epochs
)

## Creating our Training Loop

In [25]:
import numpy as np
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [26]:
for epoch in tqdm(range(1, epochs + 1)):
    
    model.train()
    
    loss_total = 0
    progress_bar = tqdm(dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for (idx, batch) in enumerate(progress_bar):
        print(f"Epoch: {epoch} Training batch: {idx}")
        model.zero_grad()
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_total+= loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        
    torch.save(model.state_dict(), f'./temp/BERT_ft_epoch{epoch}.model')
    tqdm.write('\nEpoch {epoch}')
    
    loss_avg = loss_total/len(dataloader)
    tqdm.write(f'Training loss: {loss_avg}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=161.0, style=ProgressStyle(description_widt…

161
Epoch: 1 Training batch: 0



NameError: name 'device' is not defined

In [30]:
torch.save(model, f"/tmp/{epoch}.model")

## Loading our Model

In [16]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(labels_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [17]:
model.load_state_dict(torch.load('BERT_ft_epoch5.model',
                                map_location=torch.device('cpu')))

<All keys matched successfully>

## Making Predicitons using our Model

In [18]:
test = pd.read_csv("test.csv", header=0)
test

Unnamed: 0,Data
0,What are the chapters does Process Module Theo...
1,List the Chapters of Process Module Theory of ...
2,Show the Sectional View of the Process Module?
3,What is Processing Wafers?
4,What is Editing Recipes?


In [19]:
encoded_data_val = tokenizer.batch_encode_plus(
    test.Data.values,
    add_special_tokens=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

In [21]:
dataset_val = TensorDataset(input_ids_val, attention_masks_val)

In [22]:
dataloader_val = DataLoader(
    dataset_val, 
    sampler=SequentialSampler(dataset_val),
    batch_size=32
)

In [23]:
def evaluate(dataloader_val):
    model.eval()
    predictions = []
    for batch in dataloader_val:
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
               }
        with torch.no_grad():
            outputs = model(**inputs)
    indices = torch.topk(outputs[0], 1)[1].numpy()
    for idx in indices:
        predictions.append(predictions_dict[int(idx)])
    return predictions

In [24]:
predictions = evaluate(dataloader_val)
predictions

['E_376_Tactras_RLSA_GRX_Theory_of_Operation_Rev_3_2_1.pdf',
 'E_376_Tactras_RLSA_GRX_Theory_of_Operation_Rev_3_2_1.pdf',
 'E_376_Tactras_RLSA_GRX_Theory_of_Operation_Rev_3_2_1.pdf',
 'E_29_Tactras_Advanced_Operations_Parameter_Rev_3_2_7.pdf',
 'E_29_Tactras_Advanced_Operations_Parameter_Rev_3_2_7.pdf']