# BERT

## Requirements

In [2]:
#Imports
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

import _helpers as hp

## Load Data

In [3]:
# Ground-truth data
GT_data = pd.read_csv("snips/merged_GT_data.csv")

# ASR data: improved speech recognition with a 15555 framerate and with
# autocorrection applied
ASR_data = pd.read_csv("snips/new_ASR_Autocorrection_with_labels.csv")

## Import pre-trained DistilBERT model and tokenizer

DistilBERT is a small, fast, cheap and light Transformer model. It has 40% fewer parameters than bert-base-uncased and runs 60% faster, while preserving over 95% of BERT's performance.

In [196]:
# Architecture, matching tokenizer and checkpoint name used by the cells below
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Import pre-trained BERT model and tokenizer

To use BERT instead of DistilBERT, comment out the previous model selection and uncomment this one instead:

In [197]:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

In [198]:
# Instantiate the tokenizer and the model from the checkpoint named by
# `pretrained_weights`, using the classes selected above.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Tokenization

In [199]:
def _to_token_ids(text):
    """Encode one transcript into token ids, special tokens included."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per transcript, for each corpus
tokenized_GT_data = GT_data["transcript"].apply(_to_token_ids)
tokenized_ASR_data = ASR_data["transcript"].apply(_to_token_ids)

The problem now is that the tokenized vectors are not all the same size.

In [200]:
# Number of tokens in every utterance of BOTH corpora.
# The extremes must be taken over the union: the previous version read the
# minimum only from whichever corpus held the longest sequence, so the
# reported minimum could miss a shorter utterance in the other corpus.
token_lengths = (
    [len(tokens) for tokens in tokenized_GT_data]
    + [len(tokens) for tokens in tokenized_ASR_data]
)

# Plain Python ints; `_max` is the common length used for padding later on.
_max = max(token_lengths)
_min = min(token_lengths)

print("MX:",_max)
print("MIN:",_min)

MX: 22
MIN: 3


Let's fix that with a simple padding function.

In [201]:
# Bring every token list to the common length `_max` and stack the results
# into one 2-D array per corpus.
# NOTE(review): hp.padding_func presumably right-pads with 0 — confirm in _helpers.
padded_GT_data = np.array([hp.padding_func(tokens, _max) for tokens in tokenized_GT_data])
padded_ASR_data = np.array([hp.padding_func(tokens, _max) for tokens in tokenized_ASR_data])

# Sanity check of the resulting array shapes
print(padded_GT_data.shape)
print(padded_ASR_data.shape)

(2224, 22)
(1660, 22)


# Masking

If we sent the padded input directly to BERT, the padding would slightly confuse it. We need to create another variable that tells the model to ignore (mask) the padding we've added when it processes its input. That's what attention_mask is:

In [202]:
# 0 wherever the padded array holds the value 0, 1 everywhere else
attention_mask_GT = np.where(padded_GT_data == 0, 0, 1)
attention_mask_ASR = np.where(padded_ASR_data == 0, 0, 1)

# Processing with DistilBERT

In [203]:
# torch.as_tensor (instead of torch.tensor) keeps this cell re-runnable:
# `attention_mask_GT` is rebound from a NumPy array to a tensor below, and
# torch.tensor() on an existing tensor emits a copy-construct UserWarning,
# whereas torch.as_tensor() hands an existing tensor back as-is.
input_ids_GT = torch.as_tensor(padded_GT_data)
attention_mask_GT = torch.as_tensor(attention_mask_GT)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    last_hidden_states_GT = model(input_ids_GT, attention_mask=attention_mask_GT)

In [204]:
# torch.as_tensor (instead of torch.tensor) keeps this cell re-runnable:
# `attention_mask_ASR` is rebound from a NumPy array to a tensor below, and
# torch.tensor() on an existing tensor emits a copy-construct UserWarning,
# whereas torch.as_tensor() hands an existing tensor back as-is.
input_ids_ASR = torch.as_tensor(padded_ASR_data)
attention_mask_ASR = torch.as_tensor(attention_mask_ASR)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    last_hidden_states_ASR = model(input_ids_ASR, attention_mask=attention_mask_ASR)

Let's slice out only the part of the output that we need: the output corresponding to the first token of each sentence. The way BERT does sentence classification is that it adds a token called [CLS] (for classification) at the beginning of every sentence.

Basically, the output for [CLS] aggregates the information of the whole sentence and serves as its sentence-level representation for classification.

In [205]:
# From the first element of the model output, keep only sequence position 0
# (the [CLS] token) of every sentence and convert it to a NumPy array.
features_GT = last_hidden_states_GT[0][:, 0].numpy()
features_GT.shape

(2224, 768)

In [206]:
# From the first element of the model output, keep only sequence position 0
# (the [CLS] token) of every sentence and convert it to a NumPy array.
features_ASR = last_hidden_states_ASR[0][:, 0].numpy()
features_ASR.shape

(1660, 768)

In [207]:
# Label column of each corpus, in the same row order as the transcripts that
# produced the feature matrices
labels_GT, labels_ASR = GT_data["user_action_num"], ASR_data["user_action_num"]

# Classifiers

## Train/Test Split

If you want to train and test only with ground-truth data, uncomment the next cell and comment out the remaining cells in this section.

In [208]:
# # Split into training and test data

# train_features, test_features, train_labels, test_labels = train_test_split(features_GT, labels_GT, train_size= 0.9)

If you want to train and test only with ASR data, uncomment the next cell and comment out the remaining cells in this section.

In [209]:
# # Split into training and test data

# train_features, test_features, train_labels, test_labels = train_test_split(features_ASR, labels_ASR, train_size= 0.9)

If you want to train with ground-truth data and test with ASR data, uncomment the next cell and comment out the remaining cells in this section.

In [212]:
# No random split here: the whole ground-truth corpus is the training set and
# the whole ASR corpus is the test set.
train_features, train_labels = features_GT, labels_GT
test_features, test_labels = features_ASR, labels_ASR

## Logistic Regression

In [213]:
# L2-penalised logistic regression fitted with the liblinear solver
lgr = LogisticRegression(penalty="l2", C=0.6, solver="liblinear", max_iter=1000)

# Standardise the feature space, then classify
lgr_pipe = make_pipeline(preprocessing.StandardScaler(), lgr)
lgr_pipe.fit(train_features, train_labels)

# Predicted labels and accuracy on the test set
lgr_pred_labels = lgr_pipe.predict(test_features)
lgr_score = lgr_pipe.score(test_features, test_labels)

### Model Evaluation

In [214]:
# Per-class precision / recall / F1. The display names must be listed in the
# same order as the sorted label values.
# NOTE(review): the name order is assumed to match `user_action_num` — confirm.
print(
    classification_report(
        test_labels,
        lgr_pred_labels,
        target_names=[
            'SwitchLightOff',
            'SwitchLightOn',
            'IncreaseBrightness',
            'DecreaseBrightness',
            'SetLightBrightness',
            "SetLightColor",
        ],
    )
)

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(test_labels, lgr_pred_labels))

print("\nACCURACY:", lgr_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.81      0.73      0.77       276
     SwitchLightOn       0.72      0.87      0.79       257
IncreaseBrightness       0.81      0.83      0.82       269
DecreaseBrightness       0.74      0.79      0.77       268
SetLightBrightness       0.91      0.88      0.89       296
     SetLightColor       0.94      0.82      0.88       294

          accuracy                           0.82      1660
         macro avg       0.82      0.82      0.82      1660
      weighted avg       0.83      0.82      0.82      1660

[[201  27   8  27   8   5]
 [  9 223  12   5   7   1]
 [  9  13 224  14   4   5]
 [ 11  25  15 212   2   3]
 [  8   9   6  13 259   1]
 [  9  14  10  14   6 241]]

ACCURACY: 0.8192771084337349


# Naive Bayes

### Gaussian Naive Bayes

In [215]:
# Gaussian naive Bayes with default settings
gnb = GaussianNB()

# Standardise the feature space, then classify
gnb_pipe = make_pipeline(preprocessing.StandardScaler(), gnb)
gnb_pipe.fit(train_features, train_labels)

# Predicted labels and accuracy on the test set
gnb_pred_labels = gnb_pipe.predict(test_features)
gnb_score = gnb_pipe.score(test_features, test_labels)

### Model Evaluation

In [216]:
# Per-class precision / recall / F1. The display names must be listed in the
# same order as the sorted label values.
# NOTE(review): the name order is assumed to match `user_action_num` — confirm.
print(
    classification_report(
        test_labels,
        gnb_pred_labels,
        target_names=[
            'SwitchLightOff',
            'SwitchLightOn',
            'IncreaseBrightness',
            'DecreaseBrightness',
            'SetLightBrightness',
            "SetLightColor",
        ],
    )
)

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(test_labels, gnb_pred_labels))

print("\nACCURACY:", gnb_score)

                    precision    recall  f1-score   support

    SwitchLightOff       0.57      0.28      0.38       276
     SwitchLightOn       0.35      0.66      0.46       257
IncreaseBrightness       0.61      0.42      0.50       269
DecreaseBrightness       0.40      0.50      0.44       268
SetLightBrightness       0.81      0.74      0.77       296
     SetLightColor       0.83      0.70      0.76       294

          accuracy                           0.55      1660
         macro avg       0.59      0.55      0.55      1660
      weighted avg       0.60      0.55      0.56      1660

[[ 78 115  10  51  13   9]
 [ 24 170  23  24   9   7]
 [ 12  55 113  62  13  14]
 [ 17  82  16 134  10   9]
 [  3  28  10  33 218   4]
 [  4  34  12  32   7 205]]

ACCURACY: 0.553012048192771


## SVM

In [217]:
# Support vector classifier with default settings
svm = SVC()

# Standardise the feature space, then classify
svm_pipe = make_pipeline(preprocessing.StandardScaler(), svm)
svm_pipe.fit(train_features, train_labels)

# Predicted labels and accuracy on the test set
svm_pred_labels = svm_pipe.predict(test_features)
svm_score = svm_pipe.score(test_features, test_labels)

### Model Evaluation

In [218]:
# Per-class precision / recall / F1. The display names must be listed in the
# same order as the sorted label values.
# NOTE(review): the name order is assumed to match `user_action_num` — confirm.
print(
    classification_report(
        test_labels,
        svm_pred_labels,
        target_names=[
            'SwitchLightOff',
            'SwitchLightOn',
            'IncreaseBrightness',
            'DecreaseBrightness',
            'SetLightBrightness',
            "SetLightColor",
        ],
    )
)

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(test_labels, svm_pred_labels))

print("\nACCURACY:", svm_score)


                    precision    recall  f1-score   support

    SwitchLightOff       0.79      0.69      0.73       276
     SwitchLightOn       0.69      0.86      0.77       257
IncreaseBrightness       0.83      0.77      0.80       269
DecreaseBrightness       0.62      0.77      0.69       268
SetLightBrightness       0.91      0.86      0.88       296
     SetLightColor       0.97      0.77      0.86       294

          accuracy                           0.79      1660
         macro avg       0.80      0.79      0.79      1660
      weighted avg       0.80      0.79      0.79      1660

[[190  35   9  34   5   3]
 [  9 222   9  12   5   0]
 [  7  14 206  34   5   3]
 [ 15  30  13 206   3   1]
 [ 10   8   4  18 255   1]
 [ 11  14   7  28   8 226]]

ACCURACY: 0.786144578313253


## NEURAL NETWORK

In [219]:
# Three hidden layers of 100 ReLU units trained with Adam.
# random_state pins the weight initialisation and batch shuffling so that a
# Restart & Run All reproduces the same scores; without it every run of this
# cell yields a slightly different model.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,100,100),
    activation='relu',
    solver='adam',
    max_iter=5000,
    random_state=42,
)

# Standardise the feature space, then classify
mlp_pipe = make_pipeline(preprocessing.StandardScaler(), mlp)
mlp_pipe.fit(train_features, train_labels)

# Predicted labels and accuracy on the test set
mlp_pred_labels = mlp_pipe.predict(test_features)
mlp_score = mlp_pipe.score(test_features,test_labels)

### Model Evaluation

In [220]:
# Per-class precision / recall / F1. The display names must be listed in the
# same order as the sorted label values.
# NOTE(review): the name order is assumed to match `user_action_num` — confirm.
print(
    classification_report(
        test_labels,
        mlp_pred_labels,
        target_names=[
            'SwitchLightOff',
            'SwitchLightOn',
            'IncreaseBrightness',
            'DecreaseBrightness',
            'SetLightBrightness',
            "SetLightColor",
        ],
    )
)

# Rows are the true classes, columns the predicted classes
print(confusion_matrix(test_labels, mlp_pred_labels))

print("\nACCURACY:", mlp_score)


                    precision    recall  f1-score   support

    SwitchLightOff       0.75      0.71      0.73       276
     SwitchLightOn       0.69      0.84      0.76       257
IncreaseBrightness       0.81      0.77      0.79       269
DecreaseBrightness       0.67      0.76      0.71       268
SetLightBrightness       0.91      0.85      0.88       296
     SetLightColor       0.94      0.80      0.87       294

          accuracy                           0.79      1660
         macro avg       0.80      0.79      0.79      1660
      weighted avg       0.80      0.79      0.79      1660

[[197  36   7  28   4   4]
 [ 15 215  11  10   4   2]
 [ 10  13 208  28   6   4]
 [ 19  25  14 204   5   1]
 [ 12   9  10  11 251   3]
 [  9  12   7  25   6 235]]

ACCURACY: 0.7891566265060241


## Try it yourself

In [221]:
def user_friendly(sentence, mdl):
    """Predict the user action expressed by a single sentence.

    The sentence goes through the same steps as the corpora above: it is
    tokenized, right-padded with 0 up to the notebook-level ``_max`` length,
    masked, and passed through the notebook-level ``model``; the hidden state
    of the first token is then given to ``mdl`` for classification.

    Relies on the notebook globals ``tokenizer``, ``model``, ``_max`` and ``hp``.

    Parameters
    ----------
    sentence : str
        Text to classify.
    mdl : object
        Fitted classifier exposing ``predict`` (for example ``lgr_pipe``).

    Returns
    -------
    Whatever ``hp.indx2action`` returns for the prediction (presumably the
    action name(s) — see ``_helpers``).
    """
    # truncation=True caps the token sequence at the tokenizer's maximum model
    # length, so a very long sentence is shortened instead of overflowing the
    # model's position embeddings. Inputs that already fit are unaffected.
    sent_token = tokenizer.encode(sentence, add_special_tokens=True, truncation=True)

    # Right-pad with 0 up to `_max`; a sentence longer than `_max` gets no
    # padding (a negative repeat count yields an empty list).
    sent_pad = np.array([sent_token + [0]*(_max-len(sent_token))])
    sent_att_mask = np.where(sent_pad != 0, 1, 0)
    sent_input_ids = torch.tensor(sent_pad)
    sent_attention_mask = torch.tensor(sent_att_mask)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        sent_last_hidden_states = model(sent_input_ids, attention_mask=sent_attention_mask)

    # Hidden state at sequence position 0 (the [CLS] token), shape (1, hidden)
    feature = sent_last_hidden_states[0][:,0,:].numpy()

    prediction = mdl.predict(feature)
    user_action = hp.indx2action(prediction)
    return user_action

In [222]:
# Classify one example sentence with the fitted logistic-regression pipeline
user_friendly("red light to the room", lgr_pipe)

['SetLightColor']