--- 

# <center> Project: NLP ENSAE 
## <center> Intents Classification for Neural Text Generation

<center>Work done by : 

##### <center> Ali HAIDAR email : ali.haidar@polytechnique.edu
##### <center> François Bertholom   email : 

---

In [36]:
import numpy as np 
import pandas as pd
from datasets import load_dataset
from keras_preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert import BertModel
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

import process
import BertMultiClassifier

## Load data 

In [2]:
# Load the 'dyda_da' configuration of the 'silicone' dataset; it is indexed by split name in the next cell.
dataset = load_dataset('silicone','dyda_da')

Found cached dataset silicone (C:/Users/aliha/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Materialise each split of `dataset` as its own pandas DataFrame.
train, val, test = (
    pd.DataFrame(data=dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

### Building a BERT-based classifier that takes an utterance as input and predicts its label

In [4]:
# Utterance-level setup: each sample is one utterance with one label.
X_train = train['Utterance']
y_train = np.array(train['Label'])

X_test = test['Utterance']
y_test = np.array(test['Label'])

# Number of distinct labels observed in the training split.
n_classes = len(np.unique(y_train))

# Token ids and the masks derived from them, both produced by the local `process` module.
train_tokens_ids = process.tokenize(X_train)
test_tokens_ids = process.tokenize(X_test)

train_masks = process.mask(train_tokens_ids)
# Plural name, matching `train_masks`: the BertMultiClassifier.test call in the
# following cell refers to `test_masks`.
test_masks = process.mask(test_tokens_ids)

In [5]:
# The single-utterance classifier run is disabled: the calls below sit inside a
# string literal, so nothing is trained or evaluated and the cell merely echoes the text.
'''
model_clf = BertMultiClassifier.BertMultiClassifier(n_classes)

#train 
BertMultiClassifier.train(model_clf , train_tokens_ids, train_masks, y_train,  BATCH_SIZE = 8, EPOCHS = 10)

#test
BertMultiClassifier.test(model_clf, test_tokens_ids, test_masks, y_test, BATCH_SIZE = 8)


'''

'\nmodel_clf = BertMultiClassifier.BertMultiClassifier(n_classes)\n\n#train \nBertMultiClassifier.train(model_clf , train_tokens_ids, train_masks, y_train,  BATCH_SIZE = 8, EPOCHS = 10)\n\n#test\nBertMultiClassifier.test(model_clf, test_tokens_ids, test_masks, y_test, BATCH_SIZE = 8)\n'

### Building a BERT-based classifier that takes a context as input and predicts the label of each utterance

In [37]:
# Context size handed to process.context in the next cell — presumably the number of
# utterances grouped into one context; confirm in process.py.
sizeOfTheContext = 5

In [38]:
# Build the context-level train/test frames; their 'Utterance' and 'Label' columns are read in the next cell.
train_context, test_context = process.context(train, test, sizeOfTheContext)

In [39]:
# Context-level setup: inputs and labels now come from the frames built by process.context.
X_train = train_context['Utterance']
X_test = test_context['Utterance']

# Turn every per-sample label entry into its own array, then stack them.
y_train = np.array([np.array(labels) for labels in train_context['Label']])
y_test = np.array([np.array(labels) for labels in test_context['Label']])

train_tokens_ids = process.tokenize(X_train)
test_tokens_ids = process.tokenize(X_test)

# Masks over the token ids, converted to numpy arrays.
train_masks = np.array(process.mask(train_tokens_ids))
test_mask = np.array(process.mask(test_tokens_ids))

# The same masking helper applied to the label arrays.
y_train_masks = np.array(process.mask(y_train))
y_test_masks = np.array(process.mask(y_test))

### Building a classifier with BERT as encoder and an LSTM as decoder that takes a text as input and predicts the label of each utterance

In [1]:
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

In [2]:
def get_bert_embed_matrix():
    """Return an embedding weight table of pretrained 'bert-base-uncased' as a NumPy array.

    The table is located positionally: the first child module of the model,
    then the first child module of that one (presumably the word-embedding
    layer), whose ``weight`` tensor is exported.
    """
    model = BertModel.from_pretrained('bert-base-uncased')
    embeddings_module = next(model.children())
    word_embeddings = next(embeddings_module.children())
    return word_embeddings.weight.data.numpy()

In [5]:
# Load the pretrained embedding weights once; the matrix is later passed to build_model.
embedding_matrix = get_bert_embed_matrix()

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from gensim.models import KeyedVectors

NUM_MODELS = 1  # NOTE(review): not referenced in the visible cells — confirm whether it is still needed
BATCH_SIZE = 64  # passed to model.fit
LSTM_UNITS = 64  # units per direction of the bidirectional LSTM in build_model
# Must equal the width of the pooled features in build_model, because they are summed
# with the dense outputs: 2 directions * LSTM_UNITS, for both max- and average-pooling.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 3  # passed to model.fit


def build_model(embedding_matrix, num_aux_targets):
    """Build and compile a bidirectional-LSTM classifier over a frozen embedding matrix.

    Args:
        embedding_matrix: 2-D array whose shape is unpacked into the Embedding
            layer's (input_dim, output_dim) and whose values initialise its
            non-trainable weights.
        num_aux_targets: number of output classes, i.e. the size of the final
            softmax layer.

    Returns:
        A compiled keras ``Model`` mapping a batch of token-id sequences to one
        probability vector of length ``num_aux_targets`` per sequence.
    """
    words = Input(shape=(None,))  # variable-length sequences of token ids
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    # Collapse the time axis with both pooling flavours and join the results.
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    # Two residual dense blocks; `add` requires DENSE_HIDDEN_UNITS to match the width of `hidden`.
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    # Single softmax head over the classes; this is the only output wired into the model.
    aux_result = Dense(num_aux_targets, activation='softmax')(hidden)

    model = Model(inputs=words, outputs=aux_result)
    # Sparse loss: targets are expected as integer class ids rather than one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    return model

In [9]:
# NOTE(review): 4 is presumably the number of label classes (cf. n_classes computed earlier) — confirm.
model  = build_model(embedding_matrix, 4)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 768)    23440896    ['input_3[0][0]']                
                                                                                                  
 spatial_dropout1d_2 (SpatialDr  (None, None, 768)   0           ['embedding_2[0][0]']            
 opout1D)                                                                                         
                                                                                                  
 bidirectional_4 (Bidirectional  (None, None, 256)   919552      ['spatial_dropout1d_2[0][0]

In [None]:
# Train on the token ids; batch size and epoch count come from the module-level constants.
model.fit(
    x=train_tokens_ids,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
)

In [11]:
# Predicted class = index of the highest softmax probability for each input row.
# BATCH_SIZE is reused instead of repeating the literal 64 already used for training.
bert_predicted = np.argmax(model.predict(test_tokens_ids, batch_size=BATCH_SIZE), axis=1)
# NOTE(review): this compares one prediction per row with y_test; after the context-level
# preprocessing cell, y_test holds an array of labels per row — confirm the shapes agree.
acc = (np.sum(bert_predicted == y_test) / len(y_test)) * 100  # accuracy in percent
print(acc)

NameError: name 'test_tokens_ids' is not defined