In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device('cuda')

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')

No GPU available, using the CPU instead.


In [3]:
df_path = r'C:\Users\PC\Desktop\Проги\ML\Intent Classifier\Dataset\dataset.csv'

In [4]:
df = pd.read_csv(df_path, encoding='utf-8', on_bad_lines='skip')

In [5]:
def check_classes_len(df):
    len_data = [0] * len(df[['intent']].drop_duplicates())
    for key in df['label']:
        len_data[key] += 1
    return len_data

In [6]:
check_classes_len(df)

[62, 64, 63, 63, 62, 61]

In [7]:
sentences = df.sequence.values
labels = df.label.values
classes = np.array(df[['intent']].drop_duplicates())

In [8]:
len_max_sentence = max(len(sentence) for sentence in sentences)

In [9]:
classes

array([['shutdown'],
       ['manage_media'],
       ['change_lamp_color'],
       ['open_programm'],
       ['listen_definite_anecdote'],
       ['philosophical_talk']], dtype=object)

In [10]:
def get_intent(id):
    for i in range(len(classes)):
        if i==id:
            return classes[i][0]

Загрузка предобученной модели BERT и токенизатора

In [11]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
tokenized = tokenizer([sntc for sntc in sentences], padding=True)

#for sentence in sentences:
#  tokens = tokenizer.encode(sentence, add_special_tokens=True)
#  tokens += [0]*(len_max_sentence - len(tokens))
#
#  tokenized.append(tokens)

In [15]:
type(tokenized) #he has input_ids and attention_mask (ex. tokenized['input_ids'])

transformers.tokenization_utils_base.BatchEncoding

In [16]:
input_ids = torch.tensor(np.array(tokenized['input_ids']))
attention_mask = torch.tensor(np.array(tokenized['attention_mask']))

with torch.no_grad():
    last_hidden_states = model(input_ids)

In [17]:
input_ids

tensor([[  101,  1182, 29113,  ...,     0,     0,     0],
        [  101,  1182, 29113,  ...,     0,     0,     0],
        [  101,  1182, 29113,  ...,     0,     0,     0],
        ...,
        [  101,  1182, 14150,  ...,     0,     0,     0],
        [  101,  1182,  1202,  ...,     0,     0,     0],
        [  101,  1189, 10260,  ...,     0,     0,     0]], dtype=torch.int32)

In [18]:
last_hidden_states[0].shape

torch.Size([375, 112, 768])

In [19]:
features = last_hidden_states[0][:,0,:].numpy()

In [20]:
features.shape

(375, 768)

Разбиваем сет на Training & Validating


In [21]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [22]:
train_labels.shape

(281,)

In [23]:
train_features.shape

(281, 768)

Обучаем модель

In [24]:
from sklearn.naive_bayes import GaussianNB

nb_clr = GaussianNB()
history = nb_clr.fit(train_features, train_labels)

In [26]:
nb_clr.score(test_features, test_labels)

0.6702127659574468

In [30]:
def make_prediction(message):
    token = tokenizer(message, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**token)
    last_hidden_states = outputs.last_hidden_state
    print(last_hidden_states.shape)
    features = last_hidden_states[:,0,:].numpy()
    print(features.shape)
    predict = nb_clr.predict(features)
    return predict

In [31]:
make_prediction("какого хуя?")


torch.Size([1, 12, 768])
(1, 768)


array([5], dtype=int64)