In [1]:
# imports

import numpy
from sklearn.ensemble import RandomForestClassifier
import torch
import transformers as ppb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Create a model class that is extensible
# (e.g. further classification using different models).
# It currently takes one argument: the pre-trained weights.
# A possible optimization would be to use the weights from a fine-tuned DistilBERT model.

# Some theory
# We can use any probabilistic classifier (multinomial logistic regression, k-nearest neighbours, etc.).
# Research shows that active learning (AL) is useful in real-world scenarios.
# AL is especially useful when labeling data is costly.
# It can also combat imbalanced datasets (e.g. a highly dense class A).
# It is also useful in multiclass classification tasks.

class Model:
    """Multi-label intent classifier: DistilBERT [CLS] features + random forest.

    Sentences are tokenized, passed through a pre-trained DistilBERT encoder,
    and the hidden state at the first token position of each sentence is used
    as the feature vector for a multi-output random forest. Labels are
    converted to and from a binary indicator matrix with a
    ``MultiLabelBinarizer``.

    Parameters
    ----------
    pre_trained_weights : str
        Checkpoint name or path handed to ``from_pretrained`` for both the
        tokenizer and the DistilBERT encoder.
    X : sequence of str, optional
        Training sentences. Defaults to the built-in demo sentences.
    y : sequence of sequences of str, optional
        Labels for each training sentence (one collection per sentence).
        Defaults to the built-in demo labels. ``X`` and ``y`` must be given
        together.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier`` so training can be made
        reproducible. ``None`` (default) keeps the unseeded behaviour.

    Raises
    ------
    ValueError
        If only one of ``X``/``y`` is supplied, or their lengths differ.
    """

    # Built-in demo training set, used when the caller supplies no data.
    DEFAULT_X = (
        'What is the weather like today?',
        'Tell me the weather?',
        'What is the weather like in Paris today?',
        'Tell me an interesting fact.',
        'Tell me a fact.',
        'What is the weather like in Berlin today?',
        'What is the weather like in Istanbul today?',
    )

    DEFAULT_Y = (
        ('Weather',),
        ('Weather',),
        ('Weather', 'City'),
        ('Fact',),
        ('Fact',),
        ('Weather', 'City'),
        ('Weather', 'City'),
    )

    def __init__(self, pre_trained_weights, X=None, y=None, random_state=None):
        if (X is None) != (y is None):
            raise ValueError("X and y must be provided together (or both omitted).")

        # Tokenizer, DistilBERT encoder, label binarizer and multi-output forest.
        self.tokenizer = ppb.AutoTokenizer.from_pretrained(pre_trained_weights)
        self.model = ppb.DistilBertModel.from_pretrained(pre_trained_weights)
        self.binarizer = MultiLabelBinarizer()
        self.rf = MultiOutputClassifier(
            RandomForestClassifier(random_state=random_state)
        )

        # Fresh per-instance copies so the class-level defaults (or the
        # caller's containers) are never mutated through the instance.
        self.X = list(self.DEFAULT_X if X is None else X)
        self.y = [list(labels) for labels in (self.DEFAULT_Y if y is None else y)]

        if len(self.X) != len(self.y):
            raise ValueError(
                "X and y must have the same length, got %d and %d."
                % (len(self.X), len(self.y))
            )

    def _embed(self, texts):
        """Return one feature vector per input text as a 2-D numpy array.

        ``texts`` is a single string or a list of strings. The tokenizer pads
        (and truncates) the batch so it forms a rectangular tensor; the
        attention mask is passed along so the encoder does not attend to the
        padded positions.
        """
        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            outputs = self.model(
                encoded_input['input_ids'],
                attention_mask=encoded_input['attention_mask'],
            )

        # outputs[0] is the last hidden state; position 0 along the sequence
        # axis is the classification ([CLS]) token of each sentence.
        # detach()/cpu() are no-ops on CPU tensors but keep .numpy() valid
        # if the encoder has been moved to another device.
        return outputs[0][:, 0, :].detach().cpu().numpy()

    def train_rf(self):
        """Fit the label binarizer and the random forest on ``self.X``/``self.y``.

        Returns
        -------
        None
        """
        features = self._embed(list(self.X))

        # Turn the per-sentence label collections into a 0/1 indicator matrix.
        b_y = self.binarizer.fit_transform(self.y)

        self.rf.fit(features, b_y)

    def find_intent(self, input_text):
        """Predict the intent labels of one sentence or a batch of sentences.

        ``train_rf`` must have been called first, since prediction relies on
        the fitted forest and binarizer.

        Parameters
        ----------
        input_text : str or list of str
            A single sentence, or several sentences to classify at once.

        Returns
        -------
        list of tuple
            One tuple of label names per input sentence (a one-element list
            for a single string). An empty list/tuple input yields ``[]``.
        """
        if isinstance(input_text, (list, tuple)):
            if not input_text:
                # Nothing to classify; avoid sending an empty batch downstream.
                return []
            texts = list(input_text)
        else:
            texts = input_text

        features = self._embed(texts)

        # Binary indicator rows, one per input sentence.
        predictions = self.rf.predict(features)

        # Map every indicator row back to its label names.
        return list(self.binarizer.inverse_transform(predictions))

In [4]:
# Build the intent model on top of the pre-trained DistilBERT checkpoint.
model = Model(pre_trained_weights='distilbert-base-cased')
# Fit the random-forest head on the model's training examples.
model.train_rf()
# Classify a sample question; the cell displays the returned label tuples.
model.find_intent(input_text="What is the weather like in Paris today?")

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[('City', 'Weather')]